@novastera-oss/llamarn 0.4.0 → 0.4.3-beta4
This diff shows the changes between publicly available package versions as published to their respective registries. It is provided for informational purposes only.
- package/RNLlamaCpp.podspec +4 -1
- package/android/CMakeLists.txt +13 -3
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/cpp/rn-completion.cpp +3 -27
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -45,10 +45,12 @@
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
+#include "ggml-cuda/topk-moe.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv.cuh"
 #include "ggml-cuda/gla.cuh"
+#include "ggml-cuda/set.cuh"
 #include "ggml-cuda/set-rows.cuh"
 #include "ggml-cuda/pad_reflect_1d.cuh"
 #include "ggml.h"
@@ -230,7 +232,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-        info.devices[id].integrated = prop.integrated;
+        info.devices[id].integrated = false; // Temporarily disabled due to issues with corrupted output (e.g. #15034)
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
@@ -272,6 +274,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         } else if (device_name.substr(0, 21) == "NVIDIA GeForce GTX 16") {
             turing_devices_without_mma.push_back({ id, device_name });
         }
+
+        // Temporary performance fix:
+        // Setting device scheduling strategy for iGPUs with cc121 to "spinning" to avoid delays in cuda synchronize calls.
+        // TODO: Check for future drivers the default scheduling strategy and
+        // remove this call again when cudaDeviceScheduleSpin is default.
+        if (prop.major == 12 && prop.minor == 1) {
+            CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+        }
+
 #endif // defined(GGML_USE_HIP)
     }
 
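The hunk above pins compute-capability 12.1 iGPUs to spin-wait scheduling. As a minimal standalone sketch (the `main` scaffold and error handling here are illustrative, not part of the diff), the flag is a per-process setting that must be applied before the CUDA context is created:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    // Spin scheduling burns CPU cycles while waiting but minimizes latency in
    // cudaStreamSynchronize()/cudaDeviceSynchronize(); with cudaDeviceScheduleAuto
    // the driver picks a strategy, which may yield or block instead.
    cudaError_t err = cudaSetDeviceFlags(cudaDeviceScheduleSpin);
    if (err != cudaSuccess) {
        std::fprintf(stderr, "cudaSetDeviceFlags: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // ... create streams and launch work as usual ...
    return 0;
}
```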
@@ -1947,8 +1958,15 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
 
         size_t src1_stride_size = sizeof(cuda_t);
 
-        dim3 block_dims(ne13, ne12);
-        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+        const int threads_x = 16;
+        const int threads_y = 16;
+        dim3 block_dims(threads_x, threads_y);
+
+        dim3 grid_dims(
+            (ne13 + threads_x - 1) / threads_x,
+            (ne12 + threads_y - 1) / threads_y
+        );
+        k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
             src0_ptr, src1_ptr, dst_t,
             ptrs_src.get(), ptrs_dst.get(),
             ne12, ne13,
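The new launch configuration replaces a single block sized by the batch dimensions with a fixed 16x16 block and a ceil-divided grid, so batches larger than the 1024-threads-per-block limit no longer silently fail. A minimal sketch of the same idiom (the kernel and `launch` wrapper are hypothetical, only the grid arithmetic mirrors the diff):

```cpp
#include <cuda_runtime.h>

__global__ void touch_batch(int ne12, int ne13) {
    const int i13 = blockIdx.x * blockDim.x + threadIdx.x;
    const int i12 = blockIdx.y * blockDim.y + threadIdx.y;
    if (i13 >= ne13 || i12 >= ne12) {
        return; // threads from the rounded-up grid that fall outside the batch do nothing
    }
    // ... per-(i12, i13) work would go here ...
}

void launch(int ne12, int ne13, cudaStream_t stream) {
    const int threads_x = 16;
    const int threads_y = 16;
    dim3 block_dims(threads_x, threads_y);
    // (n + t - 1) / t is integer ceil-division: enough blocks to cover every index.
    dim3 grid_dims((ne13 + threads_x - 1) / threads_x,
                   (ne12 + threads_y - 1) / threads_y);
    touch_batch<<<grid_dims, block_dims, 0, stream>>>(ne12, ne13);
}
```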
@@ -1997,6 +2015,164 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     }
 }
 
+static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
+                                          const ggml_tensor * ffn_gate,
+                                          const ggml_tensor * glu,
+                                          const ggml_tensor * ffn_up_bias = nullptr,
+                                          const ggml_tensor * ffn_gate_bias = nullptr) {
+    const bool has_bias = ffn_up_bias != nullptr || ffn_gate_bias != nullptr;
+
+    if (has_bias && (!ffn_up_bias || !ffn_gate_bias)) {
+        return false;
+    }
+
+    const bool is_mul_mat    = ffn_up->op == GGML_OP_MUL_MAT    && ffn_gate->op == GGML_OP_MUL_MAT    && glu->op == GGML_OP_GLU;
+    const bool is_mul_mat_id = ffn_up->op == GGML_OP_MUL_MAT_ID && ffn_gate->op == GGML_OP_MUL_MAT_ID && glu->op == GGML_OP_GLU;
+
+    GGML_ASSERT(ffn_up && ffn_gate && glu);
+
+    if (!is_mul_mat && !is_mul_mat_id) {
+        return false;
+    }
+
+    const ggml_op expected_bias_op = is_mul_mat ? GGML_OP_ADD : GGML_OP_ADD_ID;
+
+    if (has_bias) {
+        if (ffn_up_bias->op != expected_bias_op || ffn_gate_bias->op != expected_bias_op) {
+            return false;
+        }
+
+        if (glu->src[0] != ffn_gate_bias || glu->src[1] != ffn_up_bias) {
+            return false;
+        }
+
+        if (expected_bias_op == GGML_OP_ADD) {
+            const bool up_has_mul   = ffn_up_bias->src[0]   == ffn_up   || ffn_up_bias->src[1]   == ffn_up;
+            const bool gate_has_mul = ffn_gate_bias->src[0] == ffn_gate || ffn_gate_bias->src[1] == ffn_gate;
+            if (!up_has_mul || !gate_has_mul) {
+                return false;
+            }
+        } else { // GGML_OP_ADD_ID
+            if (ffn_up_bias->src[0] != ffn_up || ffn_gate_bias->src[0] != ffn_gate) {
+                return false;
+            }
+            if (ffn_up_bias->src[2] != ffn_up->src[2] || ffn_gate_bias->src[2] != ffn_gate->src[2]) {
+                return false;
+            }
+        }
+    } else {
+        if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
+            return false;
+        }
+    }
+
+    if (ffn_up->src[0]->type != ffn_gate->src[0]->type || !ggml_are_same_shape(ffn_up->src[0], ffn_gate->src[0]) ||
+        !ggml_are_same_stride(ffn_up->src[0], ffn_gate->src[0])) {
+        return false;
+    }
+
+    if (ffn_up->src[1] != ffn_gate->src[1]) {
+        return false;
+    }
+
+    if (ffn_up->src[2] && (ffn_up->src[2] != ffn_gate->src[2])) {
+        return false;
+    }
+
+    static constexpr std::array<ggml_glu_op, 3> valid_glu_ops = { GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU, GGML_GLU_OP_SWIGLU_OAI };
+
+    if (std::find(valid_glu_ops.begin(), valid_glu_ops.end(), ggml_get_glu_op(glu)) == valid_glu_ops.end()) {
+        return false;
+    }
+
+    if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
+        return false;
+    }
+
+    const bool split = ggml_backend_buft_is_cuda_split(ffn_up->src[0]->buffer->buft) ||
+                       ggml_backend_buft_is_cuda_split(ffn_gate->src[0]->buffer->buft);
+
+    //TODO: add support for fusion for split buffers
+    if (split) {
+        return false;
+    }
+
+    return true;
+}
+
+static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    const ggml_tensor * dst = tensor;
+
+    const bool is_mul_mat_id = tensor->op == GGML_OP_MUL_MAT_ID;
+
+    bool use_mul_mat_vec_f =
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) &&
+        src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
+
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
+                       ggml_backend_buft_is_cuda_split(src1->buffer->buft);
+
+    //TODO: add support for fusion for split buffers
+    if (split) {
+        return false;
+    }
+
+    //we only support fusion for ncols_dst = 1
+    if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
+        return false;
+    }
+
+    return use_mul_mat_vec_f;
+}
+
+static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    const ggml_tensor * dst = tensor;
+
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE &&
+                                   ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
+                                   src0->view_src;
+
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
+                             dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+    // fusion is not universally faster on Pascal
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    if (cc <= GGML_CUDA_CC_PASCAL) {
+        return false;
+    }
+    //we only support fusion for ncols_dst = 1
+    if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
+        return false;
+    }
+
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
+                       ggml_backend_buft_is_cuda_split(src1->buffer->buft);
+
+    //TODO: add support for fusion for split buffers
+    if (split) {
+        return false;
+    }
+
+    return use_mul_mat_vec_q;
+}
+
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
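The predicates added above recognize the common FFN pattern of two mat-muls over the same input whose results are combined by a gated linear unit. A minimal host-side sketch of what the fused kernel ultimately computes for the SwiGLU case (assumed helper names; this is illustrative math, not the ggml API):

```cpp
#include <cmath>
#include <vector>

// silu(x) = x * sigmoid(x), the gating nonlinearity used by SwiGLU.
static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// gate, up: the two mat-mul outputs (bias already added) for one row.
// SwiGLU combines them elementwise: out[j] = silu(gate[j]) * up[j].
std::vector<float> swiglu(const std::vector<float> & gate, const std::vector<float> & up) {
    std::vector<float> out(gate.size());
    for (size_t j = 0; j < gate.size(); ++j) {
        out[j] = silu(gate[j]) * up[j];
    }
    return out;
}
```

Fusing the gate mat-mul, up mat-mul, bias adds, and the GLU into one kernel saves three round trips of the intermediate activations through global memory, which is why the predicate is so strict about the two branches sharing `src[1]` and matching shapes and strides.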
@@ -2030,16 +2206,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc = ggml_cuda_info().devices[id].cc;
             const int warp_size = ggml_cuda_info().devices[id].warp_size;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
 
@@ -2109,6 +2285,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
             ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
             return;
         }
+
+        if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
+            ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
+            return;
+        }
     }
 
     cudaStream_t stream = ctx.stream();
@@ -2253,6 +2434,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SET_ROWS:
             ggml_cuda_op_set_rows(ctx, dst);
             break;
+        case GGML_OP_SET:
+            ggml_cuda_op_set(ctx, dst);
+            break;
         case GGML_OP_DUP:
             ggml_cuda_dup(ctx, dst);
             break;
@@ -2328,6 +2512,27 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 case GGML_UNARY_OP_ELU:
                     ggml_cuda_op_elu(ctx, dst);
                     break;
+                case GGML_UNARY_OP_XIELU:
+                    ggml_cuda_op_xielu(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_FLOOR:
+                    ggml_cuda_op_floor(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_CEIL:
+                    ggml_cuda_op_ceil(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_ROUND:
+                    ggml_cuda_op_round(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_TRUNC:
+                    ggml_cuda_op_trunc(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_EXPM1:
+                    ggml_cuda_op_expm1(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SOFTPLUS:
+                    ggml_cuda_op_softplus(ctx, dst);
+                    break;
                 default:
                     return false;
             }
@@ -2452,6 +2657,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_IM2COL:
            ggml_cuda_op_im2col(ctx, dst);
            break;
+        case GGML_OP_IM2COL_3D:
+            ggml_cuda_op_im2col_3d(ctx, dst);
+            break;
         case GGML_OP_CONV_2D:
             ggml_cuda_op_conv2d(ctx, dst);
             break;
@@ -2621,17 +2829,18 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 }
 
 #ifdef USE_CUDA_GRAPH
-static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
     bool use_cuda_graph) {
 
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
     const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
     const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
     const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
     const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
     const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
+    const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
+    const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -2660,7 +2869,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
             (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
             strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
             strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
-            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0) {
+            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
+            strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
+            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
             // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
             // by means of matching node names. See
             // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
@@ -2672,33 +2883,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_CPY) {
-
-            // Store the pointers which are updated for each token, such that these can be sent
-            // to the device and accessed using indirection from CUDA graph
-            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
-
-            // store a pointer to each copy op CUDA kernel to identify it later
-            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
-            if (!ptr) {
-                use_cuda_graph = false;
-#ifndef NDEBUG
-                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
-#endif
-            }
-        }
-
         if (!use_cuda_graph) {
             break;
         }
     }
 
-    if (use_cuda_graph) {
-        cuda_ctx->cuda_graph->use_cpy_indirection = true;
-        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
-        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
-    }
-
     return use_cuda_graph;
 }
 
@@ -2717,7 +2906,6 @@ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_p
 
 static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
     if (node->data != graph_node_properties->node_address &&
-        node->op != GGML_OP_CPY &&
         node->op != GGML_OP_VIEW) {
         return false;
     }
@@ -2738,14 +2926,13 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (node->src[i] &&
             node->src[i]->data != graph_node_properties->src_address[i] &&
-            node->op != GGML_OP_CPY &&
             node->op != GGML_OP_VIEW
         ) {
             return false;
         }
     }
 
-    if (node->op == GGML_OP_SCALE &&
+    if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
         memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
         return false;
     }
@@ -2811,12 +2998,125 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 }
 #endif
 
+static bool ggml_cuda_should_fuse_rope_set_rows(const ggml_tensor * rope,
+                                                const ggml_tensor * view,
+                                                const ggml_tensor * set_rows) {
+
+    if (rope->op != GGML_OP_ROPE || view->op != GGML_OP_VIEW || set_rows->op != GGML_OP_SET_ROWS) {
+        return false;
+    }
+    // ne3 not tested
+    if (rope->src[0]->ne[3] != 1) {
+        return false;
+    }
+
+    if (set_rows->type != GGML_TYPE_F32 && set_rows->type != GGML_TYPE_F16) {
+        return false;
+    }
+
+    if (set_rows->src[1]->type != GGML_TYPE_I64) {
+        return false;
+    }
+
+    // The view should flatten two dims of rope into one dim
+    if (!ggml_is_contiguous(view) || view->ne[0] != rope->ne[0] * rope->ne[1]) {
+        return false;
+    }
+
+    // Only norm/neox shaders have the fusion code
+    const int mode = ((const int32_t *) rope->op_params)[2];
+    if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+        return false;
+    }
+
+    return true;
+}
+
 static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
 #ifndef NDEBUG
     const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
     GGML_ASSERT(unary_ops.size() == num_unary);
 #endif
 
+    //TODO: remove special case once ggml_can_fuse can handle empty nodes
+    std::initializer_list<enum ggml_op> topk_moe_ops =
+        ggml_cuda_topk_moe_ops(/*with_norm*/ false, /*delayed_softmax=*/false);
+    std::initializer_list<enum ggml_op> topk_moe_ops_with_norm =
+        ggml_cuda_topk_moe_ops(/*with_norm=*/true, /*delayed_softmax=*/false);
+    std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
+        ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
+
+    if (ops.size() == topk_moe_ops_with_norm.size() &&
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
+        ggml_tensor * softmax = cgraph->nodes[node_idx];
+        ggml_tensor * weights = cgraph->nodes[node_idx + 9];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+            return true;
+        }
+    }
+
+    if (ops.size() == topk_moe_ops.size() &&
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
+        ggml_tensor * softmax = cgraph->nodes[node_idx];
+        ggml_tensor * weights = cgraph->nodes[node_idx + 4];
+        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+            return true;
+        }
+    }
+
+    if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
+        ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
+        ggml_tensor * weights = cgraph->nodes[node_idx + 5];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+            return true;
+        }
+    }
+
+    std::initializer_list<enum ggml_op> mul_mat_bias_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_GLU };
+    std::initializer_list<enum ggml_op> mul_mat_id_bias_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_GLU };
+
+    std::initializer_list<enum ggml_op> mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU };
+    std::initializer_list<enum ggml_op> mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU };
+
+    if (ops.size() == 5 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}) ||
+                            ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}))) {
+
+        const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
+        const ggml_tensor * ffn_gate_bias = cgraph->nodes[node_idx + 1];
+        const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 2];
+        const ggml_tensor * ffn_up_bias = cgraph->nodes[node_idx + 3];
+        const ggml_tensor * glu = cgraph->nodes[node_idx + 4];
+
+        if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu, ffn_up_bias, ffn_gate_bias)) {
+            return true;
+        }
+    }
+
+    if (ops.size() == 3 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}) ||
+                            ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}))) {
+
+        const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
+        const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1];
+        const ggml_tensor * glu = cgraph->nodes[node_idx + 2];
+
+        if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
+            return true;
+        }
+    }
+
+    if (ops.size() == 3 && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
+        const ggml_tensor * rope = cgraph->nodes[node_idx];
+        const ggml_tensor * view = cgraph->nodes[node_idx + 1];
+        const ggml_tensor * set_rows = cgraph->nodes[node_idx + 2];
+
+        if (ggml_cuda_should_fuse_rope_set_rows(rope, view, set_rows)) {
+            return true;
+        }
+    }
+
     if (!ggml_can_fuse(cgraph, node_idx, ops)) {
         return false;
     }
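The RoPE check above reads the rotation mode straight out of the op's raw parameter blob (`((const int32_t *) rope->op_params)[2]`). A small sketch of that access pattern, with a hypothetical stand-in struct rather than the real `ggml_tensor` (ggml keeps small per-op scalars in a fixed byte array, so parameter `idx` is just the idx-th 32-bit slot):

```cpp
#include <cstdint>
#include <cstring>

struct fake_tensor {
    char op_params[64]; // stand-in for the GGML_MAX_OP_PARAMS scratch space
};

// Read the idx-th int32 parameter; memcpy avoids any alignment assumptions
// that a raw pointer cast would make.
int32_t get_op_param_i32(const fake_tensor * t, int idx) {
    int32_t v;
    std::memcpy(&v, t->op_params + idx * sizeof(int32_t), sizeof(v));
    return v; // e.g. idx == 2 would hold the RoPE mode in the diff above
}
```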
@@ -2847,7 +3147,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }
 
     //if rms norm is the B operand, then we don't handle broadcast
-    if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm->src[1])) {
+    if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
         return false;
     }
 
@@ -2897,8 +3197,17 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     // With the use of CUDA graphs, the execution will be performed by the graph launch.
     if (!use_cuda_graph || cuda_graph_update_required) {
 
+        [[maybe_unused]] int prev_i = 0;
+
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
+#ifdef GGML_CUDA_DEBUG
+            const int nodes_fused = i - prev_i - 1;
+            prev_i = i;
+            if (nodes_fused > 0) {
+                GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
+            }
+#endif
 
             if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
                 continue;
@@ -2907,6 +3216,45 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
             if (!disable_fusion) {
 
+                if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
+                    ggml_tensor * weights = cgraph->nodes[i + 9];
+                    ggml_tensor * selected_experts = cgraph->nodes[i + 3];
+                    ggml_tensor * clamp = cgraph->nodes[i + 7];
+                    ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
+                                          /*delayed softmax*/ false, clamp);
+                    i += 9;
+                    continue;
+                }
+
+                if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
+                    ggml_tensor * weights = cgraph->nodes[i + 4];
+                    ggml_tensor * selected_experts = cgraph->nodes[i + 3];
+                    ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
+                                          /*delayed softmax*/ false);
+                    i += 4;
+                    continue;
+                }
+
+                if (ggml_cuda_can_fuse(cgraph, i,
+                                       ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) {
+                    ggml_tensor * weights = cgraph->nodes[i + 5];
+                    ggml_tensor * ids = cgraph->nodes[i + 1];
+
+                    ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false,
+                                          /*delayed_softmax*/ true);
+                    i += 5;
+                    continue;
+                }
+
+                if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
+                    ggml_tensor * rope = cgraph->nodes[i];
+                    ggml_tensor * set_rows = cgraph->nodes[i + 2];
+
+                    ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows);
+                    i += 2;
+                    continue;
+                }
+
                 if (node->op == GGML_OP_ADD) {
                     int n_fuse = 0;
                     ggml_op ops[8];
@@ -2938,6 +3286,195 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     }
                 }
 
+                bool fused_mul_mat_vec = false;
+                int fused_node_count = 0;
+
+                for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
+                    const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
+
+                    if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
+                        ggml_tensor * glu = cgraph->nodes[i + 4];
+                        ggml_tensor * gate_bias_n = glu->src[0];
+                        ggml_tensor * up_bias_n = glu->src[1];
+
+                        //we don't assume the order for {gate, up}. Instead infer it from the bias tensor
+                        ggml_tensor * gate_n = nullptr;
+                        ggml_tensor * up_n = nullptr;
+
+                        if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
+                            gate_n = cgraph->nodes[i];
+                            up_n = cgraph->nodes[i + 2];
+                        } else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
+                            gate_n = cgraph->nodes[i + 2];
+                            up_n = cgraph->nodes[i];
+                        } else {
+                            continue;
+                        }
+
+                        auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
+                            if (op_bias == GGML_OP_ADD) {
+                                if (bias_node->src[0] == mul_node) {
+                                    return bias_node->src[1];
+                                }
+                                if (bias_node->src[1] == mul_node) {
+                                    return bias_node->src[0];
+                                }
+                                return (ggml_tensor *) nullptr;
+                            }
+                            GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
+                            GGML_ASSERT(bias_node->src[0] == mul_node);
+                            return bias_node->src[1];
+                        };
+
+                        ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
+                        ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
+
+                        if (!up_bias_tensor || !gate_bias_tensor) {
+                            continue;
+                        }
+
+                        // we don't support repeating adds
+                        if (bias_op == GGML_OP_ADD &&
+                            (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+                             !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+                            continue;
+                        }
+
+                        const ggml_tensor * src0 = up_n->src[0];
+                        const ggml_tensor * src1 = up_n->src[1];
+                        const ggml_tensor * ids = up_n->src[2];
+
+                        if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
+                            ggml_cuda_mm_fusion_args_host fusion_data{};
+                            fusion_data.gate = gate_n->src[0];
+                            fusion_data.x_bias = up_bias_tensor;
+                            fusion_data.gate_bias = gate_bias_tensor;
+                            fusion_data.glu_op = ggml_get_glu_op(glu);
+
+                            ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                            fused_mul_mat_vec = true;
+                            fused_node_count = 5;
+                            break;
+                        }
+
+                        if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
+                            ggml_cuda_mm_fusion_args_host fusion_data{};
+                            fusion_data.gate = gate_n->src[0];
+                            fusion_data.x_bias = up_bias_tensor;
+                            fusion_data.gate_bias = gate_bias_tensor;
+                            fusion_data.glu_op = ggml_get_glu_op(glu);
+
+                            ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                            fused_mul_mat_vec = true;
+                            fused_node_count = 5;
+                            break;
+                        }
+                    } else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
+                        ggml_tensor * glu = cgraph->nodes[i + 2];
+                        ggml_tensor * gate = glu->src[0];
+                        ggml_tensor * up = glu->src[1];
+
+                        bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
+                               || (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
+
+                        if (!ok) continue;
+
+                        const ggml_tensor * src0 = up->src[0];
+                        const ggml_tensor * src1 = up->src[1];
+                        const ggml_tensor * ids = up->src[2];
+
+                        if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
+                            ggml_cuda_mm_fusion_args_host fusion_data{};
+                            fusion_data.gate = gate->src[0];
+                            fusion_data.glu_op = ggml_get_glu_op(glu);
+
+                            ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                            fused_mul_mat_vec = true;
+                            fused_node_count = 3;
+                            break;
+                        }
+
+                        if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
+                            ggml_cuda_mm_fusion_args_host fusion_data{};
+                            fusion_data.gate = gate->src[0];
+                            fusion_data.glu_op = ggml_get_glu_op(glu);
+
+                            ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
+                            fused_mul_mat_vec = true;
+                            fused_node_count = 3;
+                            break;
+                        }
+                    }
+                }
+
+                if (fused_mul_mat_vec) {
+                    i += fused_node_count - 1;
+                    continue;
+                }
+
+                fused_mul_mat_vec = false;
+                fused_node_count = 0;
+
+                for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
+                    const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
+
+                    if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
+                        continue;
+                    }
+
+                    ggml_tensor * mm_node = cgraph->nodes[i];
+                    ggml_tensor * bias_node = cgraph->nodes[i + 1];
+
+                    ggml_tensor * bias_tensor = nullptr;
+                    if (bias_op == GGML_OP_ADD) {
+                        if (bias_node->src[0] == mm_node) {
+                            bias_tensor = bias_node->src[1];
+                        } else if (bias_node->src[1] == mm_node) {
+                            bias_tensor = bias_node->src[0];
+                        } else {
+                            continue;
+                        }
+                    } else {
+                        if (bias_node->src[0] != mm_node) {
+                            continue;
+                        }
+                        bias_tensor = bias_node->src[1];
+                    }
+
+                    const ggml_tensor * src0 = mm_node->src[0];
+                    const ggml_tensor * src1 = mm_node->src[1];
+                    const ggml_tensor * ids = mm_node->src[2];
+
+                    if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
+                        continue;
+                    }
+
+                    if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+                        continue;
+                    }
+
+                    ggml_cuda_mm_fusion_args_host fusion_data{};
+                    fusion_data.x_bias = bias_tensor;
+
+                    if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
+                        ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
+                        fused_mul_mat_vec = true;
+                        fused_node_count = 2;
+                        break;
+                    }
+
+                    if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
+                        ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
+                        fused_mul_mat_vec = true;
+                        fused_node_count = 2;
+                        break;
+                    }
+                }
+
+                if (fused_mul_mat_vec) {
+                    i += fused_node_count - 1;
+                    continue;
+                }
 
                 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
                     ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
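Each fusion branch above ends with `i += fused_node_count - 1; continue;` so the dispatch loop steps past the graph nodes the fused kernel already covered (the loop's own `i++` supplies the final step). A toy sketch of that skip-ahead pattern, with made-up node counts purely for illustration:

```cpp
#include <cstdio>

int main() {
    const int n_nodes = 10;
    for (int i = 0; i < n_nodes; i++) {
        // Pretend every 4th node begins a 3-node fusable subgraph.
        const bool fused = (i % 4 == 0);
        const int fused_node_count = 3;
        if (fused && i + fused_node_count <= n_nodes) {
            std::printf("fused nodes %d..%d in one kernel\n", i, i + fused_node_count - 1);
            i += fused_node_count - 1; // loop increment moves past the last fused node
            continue;
        }
        std::printf("node %d executed individually\n", i);
    }
    return 0;
}
```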
@@ -3050,7 +3587,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (use_cuda_graph) {
             cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
 
-            use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
+            use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
 
             // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
             if (use_cuda_graph && cuda_graph_update_required) {
@@ -3077,10 +3614,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|
|
3077
3614
|
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
|
|
3078
3615
|
}
|
|
3079
3616
|
|
|
3080
|
-
if (!use_cuda_graph) {
|
|
3081
|
-
cuda_ctx->cuda_graph->use_cpy_indirection = false;
|
|
3082
|
-
}
|
|
3083
|
-
|
|
3084
3617
|
#else
|
|
3085
3618
|
bool use_cuda_graph = false;
|
|
3086
3619
|
bool cuda_graph_update_required = false;
|
|
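Both hunks above touch the CUDA-graphs path: the compatibility check becomes a plain query (`check_node_graph_compatibility`), and the removal of the `use_cpy_indirection` reset suggests the copy-op indirection bookkeeping it used to refresh is gone upstream. For orientation, a minimal standalone sketch of the capture / instantiate / replay workflow that the `cudaStreamBeginCapture` call in the second hunk belongs to; this is plain CUDA runtime API, not llama.cpp code:

    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void add_one(float * x) { x[threadIdx.x] += 1.0f; }

    int main() {
        float * d = nullptr;
        cudaMalloc(&d, 32 * sizeof(float));
        cudaMemset(d, 0, 32 * sizeof(float));

        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // Capture: the launches are recorded into a graph, not executed.
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
        add_one<<<1, 32, 0, stream>>>(d);
        add_one<<<1, 32, 0, stream>>>(d);
        cudaGraph_t graph;
        cudaStreamEndCapture(stream, &graph);

        // Instantiate once, then replay cheaply on each subsequent step.
        cudaGraphExec_t exec;
        cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
        cudaGraphLaunch(exec, stream);
        cudaStreamSynchronize(stream);

        float h0 = 0.0f;
        cudaMemcpy(&h0, d, sizeof(h0), cudaMemcpyDeviceToHost);
        std::printf("x[0] = %.1f\n", h0);  // 2.0: both captured kernels ran once

        cudaGraphExecDestroy(exec);
        cudaGraphDestroy(graph);
        cudaStreamDestroy(stream);
        cudaFree(d);
    }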
@@ -3132,6 +3665,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .event_record = */ ggml_backend_cuda_event_record,
     /* .event_wait = */ ggml_backend_cuda_event_wait,
+    /* .graph_optimize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cuda_guid() {
@@ -3201,6 +3735,7 @@ struct ggml_backend_cuda_device_context {
     int device;
     std::string name;
     std::string description;
+    std::string pci_bus_id;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3213,10 +3748,110 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
     return ctx->description.c_str();
 }
 
+#if defined(__linux__)
+// Helper function to get available memory from /proc/meminfo for UMA systems
+static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_kb, long * free_swap_kb) {
+    FILE * meminfo_file = nullptr;
+    // 2KB buffer for reading /proc/meminfo since it does not report size info, should be enough
+    const size_t BUFFER_SIZE = 2048;
+    auto file_buffer = std::make_unique<char[]>(BUFFER_SIZE);
+    size_t bytes_read = 0;
+    long huge_tlb_total_pages = -1;
+    long huge_tlb_free_pages = -1;
+    long huge_tlb_page_size = -1;
+
+    if (available_memory_kb == nullptr || free_swap_kb == nullptr) {
+        return false;
+    }
+
+    meminfo_file = fopen("/proc/meminfo", "r");
+    if (meminfo_file == nullptr) {
+        GGML_LOG_ERROR("%s: failed to open /proc/meminfo\n", __func__);
+        return false;
+    }
+
+    // Read file into buffer
+    bytes_read = fread(file_buffer.get(), 1, BUFFER_SIZE - 1, meminfo_file);
+    fclose(meminfo_file);
+
+    if (bytes_read == 0) {
+        GGML_LOG_ERROR("%s: failed to read from /proc/meminfo\n", __func__);
+        return false;
+    }
+    file_buffer[bytes_read] = '\0';
+
+    *available_memory_kb = -1;
+    *free_swap_kb = -1;
+
+    // Parse the file buffer line by line
+    char * line = file_buffer.get();
+    char * line_next;
+    while (line < file_buffer.get() + bytes_read) {
+        // Find the end of the current line
+        line_next = strchr(line, '\n');
+        if (line_next != nullptr) {
+            *line_next = '\0';
+            line_next++;
+        } else {
+            line_next = file_buffer.get() + bytes_read;
+        }
+
+        long value;
+        if (sscanf(line, "MemAvailable: %ld kB", &value) == 1) {
+            *available_memory_kb = value;
+        } else if (sscanf(line, "SwapFree: %ld kB", &value) == 1) {
+            *free_swap_kb = value;
+        } else if (sscanf(line, "HugePages_Total: %ld", &value) == 1) {
+            huge_tlb_total_pages = value;
+        } else if (sscanf(line, "HugePages_Free: %ld", &value) == 1) {
+            huge_tlb_free_pages = value;
+        } else if (sscanf(line, "Hugepagesize: %ld kB", &value) == 1) {
+            huge_tlb_page_size = value;
+        }
+
+        line = line_next;
+    }
+
+    if (huge_tlb_total_pages != 0 && huge_tlb_total_pages != -1) {
+        *available_memory_kb = huge_tlb_free_pages * huge_tlb_page_size;
+
+        // Hugetlbfs pages are not swappable.
+        *free_swap_kb = 0;
+    }
+
+    GGML_LOG_DEBUG("%s: final available_memory_kb: %ld\n", __func__, *available_memory_kb);
+    return true;
+}
+#endif // defined(__linux__)
+
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     ggml_cuda_set_device(ctx->device);
     CUDA_CHECK(cudaMemGetInfo(free, total));
+
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17368
+#if defined(__linux__)
+    // Check if this is a UMA (Unified Memory Architecture) system
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
+
+    // Check if UMA is explicitly enabled via environment variable
+    bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
+    bool is_uma = prop.unifiedAddressing > 0 || uma_env;
+
+    if (is_uma) {
+        // For UMA systems (like DGX Spark), use system memory info
+        long available_memory_kb = 0;
+        long free_swap_kb = 0;
+
+        if (ggml_backend_cuda_get_available_uma_memory(&available_memory_kb, &free_swap_kb) && available_memory_kb > 0) {
+            *free = (size_t)available_memory_kb * 1024;
+        } else {
+            GGML_LOG_ERROR("%s: /proc/meminfo reading failed, using cudaMemGetInfo\n", __func__);
+        }
+    }
+#endif // defined(__linux__)
+
 }
 
 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
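The new helper slurps /proc/meminfo with a single fread into a fixed 2 KB buffer and pattern-matches each line with sscanf; when hugetlbfs pages are configured it reports free huge pages instead and zeroes the swap figure, since hugetlbfs pages are not swappable. A simpler line-at-a-time sketch of the same parsing; `read_mem_available_kb` is an invented name, not part of ggml:

    #include <cstdio>

    // Return the kernel's MemAvailable estimate in kB, or -1 on failure.
    static long read_mem_available_kb(void) {
        FILE * f = std::fopen("/proc/meminfo", "r");
        if (f == nullptr) {
            return -1;
        }
        char line[256];
        long value = -1;
        while (std::fgets(line, sizeof(line), f) != nullptr) {
            if (std::sscanf(line, "MemAvailable: %ld kB", &value) == 1) {
                break;
            }
        }
        std::fclose(f);
        return value;
    }

    int main() {
        long kb = read_mem_available_kb();
        std::printf("MemAvailable: %ld kB (~%.1f GiB)\n", kb, kb / 1048576.0);
    }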
@@ -3225,9 +3860,12 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 }
 
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+
     props->name = ggml_backend_cuda_device_get_name(dev);
     props->description = ggml_backend_cuda_device_get_description(dev);
     props->type = ggml_backend_cuda_device_get_type(dev);
+    props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
 
     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
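`get_props` now also publishes the PCI bus id (stored on the device context at registration time, in the last hunk of this file) through the generic `device_id` field. A sketch of how a consumer might read it, assuming the usual ggml-backend device-enumeration entry points:

    #include "ggml-backend.h"
    #include <cstdio>

    // Enumerate registered backend devices and print the device_id
    // (the PCI bus id on CUDA; may be NULL on other backends).
    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            std::printf("%s: device_id=%s\n", props.name,
                        props.device_id ? props.device_id : "(none)");
        }
    }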
@@ -3301,7 +3939,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_EXPM1:
+                case GGML_UNARY_OP_SOFTPLUS:
                 case GGML_UNARY_OP_ELU:
+                case GGML_UNARY_OP_FLOOR:
+                case GGML_UNARY_OP_CEIL:
+                case GGML_UNARY_OP_ROUND:
+                case GGML_UNARY_OP_TRUNC:
                     return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
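This hunk widens the unary-op whitelist (EXPM1, SOFTPLUS, FLOOR, CEIL, ROUND, TRUNC), all gated on a contiguous source tensor. A sketch of building such a node, assuming the `ggml_floor` builder that conventionally accompanies a `GGML_UNARY_OP_FLOOR` enum entry:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
        ggml_context * ctx = ggml_init(params);

        ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_tensor * y = ggml_floor(ctx, x);  // creates a GGML_UNARY_OP_FLOOR node

        // Freshly created tensors are contiguous, so supports_op would accept y.
        std::printf("src contiguous: %d\n", (int) ggml_is_contiguous(x));
        ggml_free(ctx);
        (void) y;
    }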
@@ -3414,7 +4058,14 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
                     op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
                     op->src[0]->type == GGML_TYPE_F32 &&
-                    op->src[1]->type == GGML_TYPE_I64;
+                    (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
+            } break;
+        case GGML_OP_SET:
+            {
+                const ggml_type t = op->type;
+                return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
+                       t == op->src[0]->type &&
+                       t == op->src[1]->type;
             } break;
         case GGML_OP_CPY:
             {
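GGML_OP_SET gains CUDA support, but only when the destination and both sources share one type, F32 or I32. A sketch of creating a SET node with the long-standing `ggml_set_1d` helper (the offset is in bytes):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
        ggml_context * ctx = ggml_init(params);

        ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

        // Write b into a starting at element 8; types match, so the new
        // CUDA supports_op rule above would accept this node.
        ggml_tensor * r = ggml_set_1d(ctx, a, b, 8 * ggml_type_size(GGML_TYPE_F32));

        std::printf("result op = %s\n", ggml_op_name(r->op));  // SET
        ggml_free(ctx);
    }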
@@ -3458,6 +4109,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
                     return true;
                 }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_I32) {
+                    return true;
+                }
                 if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
                     return true;
                 }
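The extra CPY combinations amount to on-device casts between F32 and I32 (plus a plain I32 copy). A sketch using `ggml_cast`, which lowers to a GGML_OP_CPY node:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
        ggml_context * ctx = ggml_init(params);

        ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_tensor * i32 = ggml_cast(ctx, x, GGML_TYPE_I32);   // F32 -> I32
        ggml_tensor * f32 = ggml_cast(ctx, i32, GGML_TYPE_F32); // I32 -> F32

        std::printf("%s -> %s -> %s\n", ggml_type_name(x->type),
                    ggml_type_name(i32->type), ggml_type_name(f32->type));
        ggml_free(ctx);
    }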
@@ -3559,20 +4219,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
         }
         case GGML_OP_IM2COL:
+        case GGML_OP_IM2COL_3D:
         case GGML_OP_CONV_2D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_2D:
         case GGML_OP_POOL_2D:
+        case GGML_OP_ACC:
+            return true;
         case GGML_OP_SUM:
+            return ggml_is_contiguous_rows(op->src[0]);
         case GGML_OP_ARGSORT:
-
+#ifndef GGML_CUDA_USE_CUB
+            return op->src[0]->ne[0] <= 1024;
+#else
             return true;
+#endif
         case GGML_OP_SUM_ROWS:
         case GGML_OP_MEAN:
         case GGML_OP_GROUP_NORM:
+        case GGML_OP_PAD:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
@@ -3783,11 +4450,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
             dev_ctx->device = i;
             dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
 
-            ggml_cuda_set_device(i);
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
             dev_ctx->description = prop.name;
 
+            char pci_bus_id[16] = {};
+            snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
+            dev_ctx->pci_bus_id = pci_bus_id;
+
             ggml_backend_dev_t dev = new ggml_backend_device {
                 /* .iface = */ ggml_backend_cuda_device_interface,
                 /* .reg = */ &reg,
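Device registration now snapshots the PCI identifier in the conventional domain:bus:device.function form (the function digit is hard-coded to 0), and the `ggml_cuda_set_device(i)` call is dropped because `cudaGetDeviceProperties` takes an explicit device ordinal and does not depend on the current device. A standalone sketch of the formatting with sample values:

    #include <cstdio>

    int main() {
        // Sample values; the real code reads these from cudaDeviceProp.
        int pciDomainID = 0x0000, pciBusID = 0x65, pciDeviceID = 0x00;

        char pci_bus_id[16] = {};
        std::snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0",
                      pciDomainID, pciBusID, pciDeviceID);
        std::printf("%s\n", pci_bus_id);  // prints 0000:65:00.0
    }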