@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -0
- package/android/CMakeLists.txt +2 -0
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -31,6 +31,12 @@
|
|
|
31
31
|
#include <filesystem>
|
|
32
32
|
#include <algorithm>
|
|
33
33
|
|
|
34
|
+
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
|
|
35
|
+
|
|
36
|
+
#define LOG_DBG(...) \
|
|
37
|
+
do { if (RPC_DEBUG) GGML_LOG_DEBUG(__VA_ARGS__); } while (0)
|
|
38
|
+
|
|
39
|
+
|
|
34
40
|
namespace fs = std::filesystem;
|
|
35
41
|
|
|
36
42
|
static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
|
|
@@ -47,7 +53,7 @@ struct socket_t {
|
|
|
47
53
|
sockfd_t fd;
|
|
48
54
|
socket_t(sockfd_t fd) : fd(fd) {}
|
|
49
55
|
~socket_t() {
|
|
50
|
-
|
|
56
|
+
LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
|
|
51
57
|
#ifdef _WIN32
|
|
52
58
|
closesocket(this->fd);
|
|
53
59
|
#else
|
|
@@ -99,9 +105,12 @@ enum rpc_cmd {
|
|
|
99
105
|
RPC_CMD_INIT_TENSOR,
|
|
100
106
|
RPC_CMD_GET_ALLOC_SIZE,
|
|
101
107
|
RPC_CMD_HELLO,
|
|
108
|
+
RPC_CMD_DEVICE_COUNT,
|
|
102
109
|
RPC_CMD_COUNT,
|
|
103
110
|
};
|
|
104
111
|
|
|
112
|
+
static_assert(RPC_CMD_HELLO == 14, "RPC_CMD_HELLO must be always 14");
|
|
113
|
+
|
|
105
114
|
// Try RPC_CMD_SET_TENSOR_HASH first when data size is larger than this threshold
|
|
106
115
|
const size_t HASH_THRESHOLD = 10 * 1024 * 1024;
|
|
107
116
|
|
|
@@ -111,7 +120,12 @@ struct rpc_msg_hello_rsp {
|
|
|
111
120
|
uint8_t patch;
|
|
112
121
|
};
|
|
113
122
|
|
|
123
|
+
struct rpc_msg_device_count_rsp {
|
|
124
|
+
uint32_t device_count;
|
|
125
|
+
};
|
|
126
|
+
|
|
114
127
|
struct rpc_msg_get_alloc_size_req {
|
|
128
|
+
uint32_t device;
|
|
115
129
|
rpc_tensor tensor;
|
|
116
130
|
};
|
|
117
131
|
|
|
@@ -124,6 +138,7 @@ struct rpc_msg_init_tensor_req {
|
|
|
124
138
|
};
|
|
125
139
|
|
|
126
140
|
struct rpc_msg_alloc_buffer_req {
|
|
141
|
+
uint32_t device;
|
|
127
142
|
uint64_t size;
|
|
128
143
|
};
|
|
129
144
|
|
|
@@ -132,10 +147,18 @@ struct rpc_msg_alloc_buffer_rsp {
|
|
|
132
147
|
uint64_t remote_size;
|
|
133
148
|
};
|
|
134
149
|
|
|
150
|
+
struct rpc_msg_get_alignment_req {
|
|
151
|
+
uint32_t device;
|
|
152
|
+
};
|
|
153
|
+
|
|
135
154
|
struct rpc_msg_get_alignment_rsp {
|
|
136
155
|
uint64_t alignment;
|
|
137
156
|
};
|
|
138
157
|
|
|
158
|
+
struct rpc_msg_get_max_size_req {
|
|
159
|
+
uint32_t device;
|
|
160
|
+
};
|
|
161
|
+
|
|
139
162
|
struct rpc_msg_get_max_size_rsp {
|
|
140
163
|
uint64_t max_size;
|
|
141
164
|
};
|
|
@@ -186,6 +209,10 @@ struct rpc_msg_graph_compute_rsp {
|
|
|
186
209
|
uint8_t result;
|
|
187
210
|
};
|
|
188
211
|
|
|
212
|
+
struct rpc_msg_get_device_memory_req {
|
|
213
|
+
uint32_t device;
|
|
214
|
+
};
|
|
215
|
+
|
|
189
216
|
struct rpc_msg_get_device_memory_rsp {
|
|
190
217
|
uint64_t free_mem;
|
|
191
218
|
uint64_t total_mem;
|
|
@@ -201,13 +228,15 @@ static ggml_guid_t ggml_backend_rpc_guid() {
|
|
|
201
228
|
|
|
202
229
|
struct ggml_backend_rpc_buffer_type_context {
|
|
203
230
|
std::string endpoint;
|
|
231
|
+
uint32_t device;
|
|
204
232
|
std::string name;
|
|
205
|
-
size_t
|
|
206
|
-
size_t
|
|
233
|
+
size_t alignment;
|
|
234
|
+
size_t max_size;
|
|
207
235
|
};
|
|
208
236
|
|
|
209
237
|
struct ggml_backend_rpc_context {
|
|
210
238
|
std::string endpoint;
|
|
239
|
+
uint32_t device;
|
|
211
240
|
std::string name;
|
|
212
241
|
};
|
|
213
242
|
|
|
@@ -265,14 +294,14 @@ static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
|
|
265
294
|
return nullptr;
|
|
266
295
|
}
|
|
267
296
|
if (!set_no_delay(sockfd)) {
|
|
268
|
-
|
|
297
|
+
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
|
269
298
|
return nullptr;
|
|
270
299
|
}
|
|
271
300
|
addr.sin_family = AF_INET;
|
|
272
301
|
addr.sin_port = htons(port);
|
|
273
302
|
struct hostent * server = gethostbyname(host);
|
|
274
303
|
if (server == NULL) {
|
|
275
|
-
|
|
304
|
+
GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
|
|
276
305
|
return nullptr;
|
|
277
306
|
}
|
|
278
307
|
memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
|
|
@@ -289,7 +318,7 @@ static std::shared_ptr<socket_t> socket_accept(sockfd_t srv_sockfd) {
|
|
|
289
318
|
return nullptr;
|
|
290
319
|
}
|
|
291
320
|
if (!set_no_delay(client_socket_fd)) {
|
|
292
|
-
|
|
321
|
+
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
|
293
322
|
return nullptr;
|
|
294
323
|
}
|
|
295
324
|
return client_socket;
|
|
@@ -302,11 +331,11 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
|
|
302
331
|
return nullptr;
|
|
303
332
|
}
|
|
304
333
|
if (!set_reuse_addr(sockfd)) {
|
|
305
|
-
|
|
334
|
+
GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
|
|
306
335
|
return nullptr;
|
|
307
336
|
}
|
|
308
337
|
if (inet_addr(host) == INADDR_NONE) {
|
|
309
|
-
|
|
338
|
+
GGML_LOG_ERROR("Invalid host address: %s\n", host);
|
|
310
339
|
return nullptr;
|
|
311
340
|
}
|
|
312
341
|
struct sockaddr_in serv_addr;
|
|
@@ -349,7 +378,7 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
|
|
349
378
|
return false;
|
|
350
379
|
}
|
|
351
380
|
if (n == 0) {
|
|
352
|
-
|
|
381
|
+
LOG_DBG("recv returned 0 (peer closed?)\n");
|
|
353
382
|
return false;
|
|
354
383
|
}
|
|
355
384
|
bytes_recv += (size_t)n;
|
|
@@ -383,7 +412,7 @@ static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
|
|
|
383
412
|
try {
|
|
384
413
|
input.resize(size);
|
|
385
414
|
} catch (const std::bad_alloc & e) {
|
|
386
|
-
|
|
415
|
+
GGML_LOG_ERROR("Failed to allocate input buffer of size %" PRIu64 "\n", size);
|
|
387
416
|
return false;
|
|
388
417
|
}
|
|
389
418
|
return recv_data(sockfd, input.data(), size);
|
|
@@ -443,11 +472,11 @@ static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
|
|
443
472
|
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
|
444
473
|
RPC_STATUS_ASSERT(status);
|
|
445
474
|
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
|
446
|
-
|
|
475
|
+
GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
|
447
476
|
return false;
|
|
448
477
|
}
|
|
449
478
|
if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
|
|
450
|
-
|
|
479
|
+
GGML_LOG_INFO("WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
|
451
480
|
}
|
|
452
481
|
return true;
|
|
453
482
|
}
|
|
@@ -488,7 +517,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
|
|
488
517
|
if (!check_server_version(sock)) {
|
|
489
518
|
return nullptr;
|
|
490
519
|
}
|
|
491
|
-
|
|
520
|
+
LOG_DBG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
|
492
521
|
sockets[endpoint] = sock;
|
|
493
522
|
return sock;
|
|
494
523
|
}
|
|
@@ -602,23 +631,30 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
|
|
|
602
631
|
RPC_STATUS_ASSERT(status);
|
|
603
632
|
}
|
|
604
633
|
|
|
634
|
+
static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
|
|
635
|
+
return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
|
|
636
|
+
}
|
|
637
|
+
|
|
605
638
|
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
639
|
+
if (ggml_backend_buffer_is_rpc(src->buffer)) {
|
|
640
|
+
// check if src and dst are on the same server
|
|
641
|
+
ggml_backend_buffer_t src_buffer = src->buffer;
|
|
642
|
+
ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
|
|
643
|
+
ggml_backend_buffer_t dst_buffer = dst->buffer;
|
|
644
|
+
ggml_backend_rpc_buffer_context * dst_ctx = (ggml_backend_rpc_buffer_context *)dst_buffer->context;
|
|
645
|
+
if (src_ctx->sock != dst_ctx->sock) {
|
|
646
|
+
return false;
|
|
647
|
+
}
|
|
648
|
+
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
|
649
|
+
rpc_msg_copy_tensor_req request;
|
|
650
|
+
request.src = serialize_tensor(src);
|
|
651
|
+
request.dst = serialize_tensor(dst);
|
|
652
|
+
rpc_msg_copy_tensor_rsp response;
|
|
653
|
+
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
|
|
654
|
+
RPC_STATUS_ASSERT(status);
|
|
655
|
+
return response.result;
|
|
613
656
|
}
|
|
614
|
-
|
|
615
|
-
rpc_msg_copy_tensor_req request;
|
|
616
|
-
request.src = serialize_tensor(src);
|
|
617
|
-
request.dst = serialize_tensor(dst);
|
|
618
|
-
rpc_msg_copy_tensor_rsp response;
|
|
619
|
-
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
|
|
620
|
-
RPC_STATUS_ASSERT(status);
|
|
621
|
-
return response.result;
|
|
657
|
+
return false;
|
|
622
658
|
}
|
|
623
659
|
|
|
624
660
|
static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
@@ -647,7 +683,7 @@ static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t
|
|
|
647
683
|
|
|
648
684
|
static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
649
685
|
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
|
650
|
-
rpc_msg_alloc_buffer_req request = {size};
|
|
686
|
+
rpc_msg_alloc_buffer_req request = {buft_ctx->device, size};
|
|
651
687
|
rpc_msg_alloc_buffer_rsp response;
|
|
652
688
|
auto sock = get_socket(buft_ctx->endpoint);
|
|
653
689
|
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
|
|
@@ -663,9 +699,10 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
|
|
663
699
|
}
|
|
664
700
|
}
|
|
665
701
|
|
|
666
|
-
static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
|
|
702
|
+
static size_t get_alignment(const std::shared_ptr<socket_t> & sock, uint32_t device) {
|
|
703
|
+
rpc_msg_get_alignment_req request = {device};
|
|
667
704
|
rpc_msg_get_alignment_rsp response;
|
|
668
|
-
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT,
|
|
705
|
+
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, &request, sizeof(request), &response, sizeof(response));
|
|
669
706
|
RPC_STATUS_ASSERT(status);
|
|
670
707
|
return response.alignment;
|
|
671
708
|
}
|
|
@@ -675,9 +712,10 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
|
|
|
675
712
|
return buft_ctx->alignment;
|
|
676
713
|
}
|
|
677
714
|
|
|
678
|
-
static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
|
|
715
|
+
static size_t get_max_size(const std::shared_ptr<socket_t> & sock, uint32_t device) {
|
|
716
|
+
rpc_msg_get_max_size_req request = {device};
|
|
679
717
|
rpc_msg_get_max_size_rsp response;
|
|
680
|
-
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE,
|
|
718
|
+
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, &request, sizeof(request), &response, sizeof(response));
|
|
681
719
|
RPC_STATUS_ASSERT(status);
|
|
682
720
|
return response.max_size;
|
|
683
721
|
}
|
|
@@ -694,7 +732,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty
|
|
|
694
732
|
auto sock = get_socket(buft_ctx->endpoint);
|
|
695
733
|
|
|
696
734
|
rpc_msg_get_alloc_size_req request;
|
|
697
|
-
|
|
735
|
+
request.device = buft_ctx->device;
|
|
698
736
|
request.tensor = serialize_tensor(tensor);
|
|
699
737
|
|
|
700
738
|
rpc_msg_get_alloc_size_rsp response;
|
|
@@ -748,7 +786,7 @@ static void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors,
|
|
|
748
786
|
tensors.push_back(serialize_tensor(tensor));
|
|
749
787
|
}
|
|
750
788
|
|
|
751
|
-
static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
|
|
789
|
+
static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
|
|
752
790
|
uint32_t n_nodes = cgraph->n_nodes;
|
|
753
791
|
std::vector<rpc_tensor> tensors;
|
|
754
792
|
std::unordered_set<ggml_tensor*> visited;
|
|
@@ -756,24 +794,29 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
|
|
|
756
794
|
add_tensor(cgraph->nodes[i], tensors, visited);
|
|
757
795
|
}
|
|
758
796
|
// serialization format:
|
|
759
|
-
// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
|
|
797
|
+
// | device (4 bytes) | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
|
|
760
798
|
uint32_t n_tensors = tensors.size();
|
|
761
|
-
int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
|
|
799
|
+
int output_size = 2*sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
|
|
762
800
|
output.resize(output_size, 0);
|
|
763
|
-
|
|
801
|
+
uint8_t * dest = output.data();
|
|
802
|
+
memcpy(dest, &device, sizeof(device));
|
|
803
|
+
dest += sizeof(device);
|
|
804
|
+
memcpy(dest, &n_nodes, sizeof(n_nodes));
|
|
805
|
+
dest += sizeof(n_nodes);
|
|
764
806
|
for (uint32_t i = 0; i < n_nodes; i++) {
|
|
765
|
-
memcpy(
|
|
807
|
+
memcpy(dest + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
|
|
766
808
|
}
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
809
|
+
dest += n_nodes * sizeof(uint64_t);
|
|
810
|
+
memcpy(dest, &n_tensors, sizeof(n_tensors));
|
|
811
|
+
dest += sizeof(n_tensors);
|
|
812
|
+
rpc_tensor * out_tensors = (rpc_tensor *)dest;
|
|
770
813
|
memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
|
|
771
814
|
}
|
|
772
815
|
|
|
773
816
|
static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
774
817
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
|
775
818
|
std::vector<uint8_t> input;
|
|
776
|
-
serialize_graph(cgraph, input);
|
|
819
|
+
serialize_graph(rpc_ctx->device, cgraph, input);
|
|
777
820
|
rpc_msg_graph_compute_rsp response;
|
|
778
821
|
auto sock = get_socket(rpc_ctx->endpoint);
|
|
779
822
|
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
|
|
@@ -795,50 +838,55 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|
|
795
838
|
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
|
796
839
|
/* .event_record = */ NULL,
|
|
797
840
|
/* .event_wait = */ NULL,
|
|
841
|
+
/* .graph_optimize = */ NULL,
|
|
798
842
|
};
|
|
799
843
|
|
|
800
|
-
ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
|
844
|
+
ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device) {
|
|
801
845
|
static std::mutex mutex;
|
|
802
846
|
std::lock_guard<std::mutex> lock(mutex);
|
|
847
|
+
std::string buft_name = "RPC" + std::to_string(device) + "[" + std::string(endpoint) + "]";
|
|
803
848
|
// NOTE: buffer types are allocated and never freed; this is by design
|
|
804
849
|
static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
|
|
805
|
-
auto it = buft_map.find(
|
|
850
|
+
auto it = buft_map.find(buft_name);
|
|
806
851
|
if (it != buft_map.end()) {
|
|
807
852
|
return it->second;
|
|
808
853
|
}
|
|
809
854
|
auto sock = get_socket(endpoint);
|
|
810
855
|
if (sock == nullptr) {
|
|
811
|
-
|
|
856
|
+
GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
|
|
812
857
|
return nullptr;
|
|
813
858
|
}
|
|
814
|
-
size_t alignment = get_alignment(sock);
|
|
815
|
-
size_t max_size = get_max_size(sock);
|
|
859
|
+
size_t alignment = get_alignment(sock, device);
|
|
860
|
+
size_t max_size = get_max_size(sock, device);
|
|
816
861
|
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
|
817
862
|
/* .endpoint = */ endpoint,
|
|
818
|
-
/* .
|
|
863
|
+
/* .device = */ device,
|
|
864
|
+
/* .name = */ buft_name,
|
|
819
865
|
/* .alignment = */ alignment,
|
|
820
866
|
/* .max_size = */ max_size
|
|
821
867
|
};
|
|
822
|
-
|
|
868
|
+
auto reg = ggml_backend_rpc_add_server(endpoint);
|
|
823
869
|
ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
|
|
824
870
|
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
|
825
|
-
/* .device = */
|
|
871
|
+
/* .device = */ ggml_backend_reg_dev_get(reg, device),
|
|
826
872
|
/* .context = */ buft_ctx
|
|
827
873
|
};
|
|
828
|
-
buft_map[
|
|
874
|
+
buft_map[buft_name] = buft;
|
|
829
875
|
return buft;
|
|
830
876
|
}
|
|
831
877
|
|
|
832
|
-
ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
|
878
|
+
ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
|
|
879
|
+
std::string dev_name = "RPC" + std::to_string(device) + "[" + std::string(endpoint) + "]";
|
|
833
880
|
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
|
834
|
-
/* .endpoint
|
|
835
|
-
/* .
|
|
881
|
+
/* .endpoint = */ endpoint,
|
|
882
|
+
/* .device = */ device,
|
|
883
|
+
/* .name = */ dev_name
|
|
836
884
|
};
|
|
837
|
-
|
|
885
|
+
auto reg = ggml_backend_rpc_add_server(endpoint);
|
|
838
886
|
ggml_backend_t backend = new ggml_backend {
|
|
839
887
|
/* .guid = */ ggml_backend_rpc_guid(),
|
|
840
888
|
/* .iface = */ ggml_backend_rpc_interface,
|
|
841
|
-
/* .device = */
|
|
889
|
+
/* .device = */ ggml_backend_reg_dev_get(reg, device),
|
|
842
890
|
/* .context = */ ctx
|
|
843
891
|
};
|
|
844
892
|
return backend;
|
|
@@ -848,37 +896,39 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
|
|
848
896
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
|
|
849
897
|
}
|
|
850
898
|
|
|
851
|
-
static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
|
|
899
|
+
static void get_device_memory(const std::shared_ptr<socket_t> & sock, uint32_t device, size_t * free, size_t * total) {
|
|
900
|
+
rpc_msg_get_device_memory_req request;
|
|
901
|
+
request.device = device;
|
|
852
902
|
rpc_msg_get_device_memory_rsp response;
|
|
853
|
-
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY,
|
|
903
|
+
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, &request, sizeof(request), &response, sizeof(response));
|
|
854
904
|
RPC_STATUS_ASSERT(status);
|
|
855
905
|
*free = response.free_mem;
|
|
856
906
|
*total = response.total_mem;
|
|
857
907
|
}
|
|
858
908
|
|
|
859
|
-
void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
|
909
|
+
void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total) {
|
|
860
910
|
auto sock = get_socket(endpoint);
|
|
861
911
|
if (sock == nullptr) {
|
|
862
912
|
*free = 0;
|
|
863
913
|
*total = 0;
|
|
864
914
|
return;
|
|
865
915
|
}
|
|
866
|
-
get_device_memory(sock, free, total);
|
|
916
|
+
get_device_memory(sock, device, free, total);
|
|
867
917
|
}
|
|
868
918
|
|
|
869
919
|
// RPC server-side implementation
|
|
870
920
|
|
|
871
921
|
class rpc_server {
|
|
872
922
|
public:
|
|
873
|
-
rpc_server(ggml_backend_t
|
|
874
|
-
:
|
|
923
|
+
rpc_server(std::vector<ggml_backend_t> backends, const char * cache_dir)
|
|
924
|
+
: backends(std::move(backends)), cache_dir(cache_dir) {
|
|
875
925
|
}
|
|
876
926
|
~rpc_server();
|
|
877
927
|
|
|
878
928
|
void hello(rpc_msg_hello_rsp & response);
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
929
|
+
bool alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
|
|
930
|
+
bool get_alignment(const rpc_msg_get_alignment_req & request, rpc_msg_get_alignment_rsp & response);
|
|
931
|
+
bool get_max_size(const rpc_msg_get_max_size_req & request, rpc_msg_get_max_size_rsp & response);
|
|
882
932
|
bool buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response);
|
|
883
933
|
bool free_buffer(const rpc_msg_free_buffer_req & request);
|
|
884
934
|
bool buffer_clear(const rpc_msg_buffer_clear_req & request);
|
|
@@ -889,6 +939,7 @@ public:
|
|
|
889
939
|
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
|
|
890
940
|
bool init_tensor(const rpc_msg_init_tensor_req & request);
|
|
891
941
|
bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);
|
|
942
|
+
bool get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response);
|
|
892
943
|
|
|
893
944
|
private:
|
|
894
945
|
bool get_cached_file(uint64_t hash, std::vector<uint8_t> & data);
|
|
@@ -899,7 +950,7 @@ private:
|
|
|
899
950
|
std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);
|
|
900
951
|
|
|
901
952
|
|
|
902
|
-
ggml_backend_t
|
|
953
|
+
std::vector<ggml_backend_t> backends;
|
|
903
954
|
const char * cache_dir;
|
|
904
955
|
std::unordered_set<ggml_backend_buffer_t> buffers;
|
|
905
956
|
};
|
|
@@ -908,10 +959,14 @@ void rpc_server::hello(rpc_msg_hello_rsp & response) {
|
|
|
908
959
|
response.major = RPC_PROTO_MAJOR_VERSION;
|
|
909
960
|
response.minor = RPC_PROTO_MINOR_VERSION;
|
|
910
961
|
response.patch = RPC_PROTO_PATCH_VERSION;
|
|
911
|
-
|
|
962
|
+
LOG_DBG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
|
|
912
963
|
}
|
|
913
964
|
|
|
914
965
|
bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
|
|
966
|
+
uint32_t dev_id = request.device;
|
|
967
|
+
if (dev_id >= backends.size()) {
|
|
968
|
+
return false;
|
|
969
|
+
}
|
|
915
970
|
ggml_backend_buffer_type_t buft;
|
|
916
971
|
struct ggml_init_params params {
|
|
917
972
|
/*.mem_size =*/ ggml_tensor_overhead(),
|
|
@@ -928,50 +983,66 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
|
|
|
928
983
|
GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
|
|
929
984
|
return false;
|
|
930
985
|
}
|
|
931
|
-
|
|
986
|
+
LOG_DBG("[%s] device: %d, buffer: %p, data: %p\n", __func__, dev_id, (void*)tensor->buffer, tensor->data);
|
|
932
987
|
if (tensor->buffer == nullptr) {
|
|
933
988
|
//No buffer allocated.
|
|
934
|
-
buft = ggml_backend_get_default_buffer_type(
|
|
989
|
+
buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
|
|
935
990
|
} else {
|
|
936
991
|
buft = tensor->buffer->buft;
|
|
937
992
|
}
|
|
938
993
|
|
|
939
|
-
response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
|
|
994
|
+
response.alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
|
|
940
995
|
|
|
941
996
|
return true;
|
|
942
997
|
}
|
|
943
998
|
|
|
944
|
-
|
|
945
|
-
|
|
999
|
+
bool rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
|
|
1000
|
+
uint32_t dev_id = request.device;
|
|
1001
|
+
if (dev_id >= backends.size()) {
|
|
1002
|
+
return false;
|
|
1003
|
+
}
|
|
1004
|
+
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
|
|
946
1005
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
|
|
947
1006
|
response.remote_ptr = 0;
|
|
948
1007
|
response.remote_size = 0;
|
|
949
1008
|
if (buffer != nullptr) {
|
|
950
1009
|
response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
|
|
951
1010
|
response.remote_size = buffer->size;
|
|
952
|
-
|
|
1011
|
+
LOG_DBG("[%s] device: %d, size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n",
|
|
1012
|
+
__func__, dev_id, request.size, response.remote_ptr, response.remote_size);
|
|
953
1013
|
buffers.insert(buffer);
|
|
954
1014
|
} else {
|
|
955
|
-
|
|
1015
|
+
LOG_DBG("[%s] device: %d, size: %" PRIu64 " -> failed\n", __func__, dev_id, request.size);
|
|
956
1016
|
}
|
|
1017
|
+
return true;
|
|
957
1018
|
}
|
|
958
1019
|
|
|
959
|
-
|
|
960
|
-
|
|
1020
|
+
bool rpc_server::get_alignment(const rpc_msg_get_alignment_req & request, rpc_msg_get_alignment_rsp & response) {
|
|
1021
|
+
uint32_t dev_id = request.device;
|
|
1022
|
+
if (dev_id >= backends.size()) {
|
|
1023
|
+
return false;
|
|
1024
|
+
}
|
|
1025
|
+
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
|
|
961
1026
|
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
|
962
|
-
|
|
1027
|
+
LOG_DBG("[%s] device: %d, alignment: %lu\n", __func__, dev_id, alignment);
|
|
963
1028
|
response.alignment = alignment;
|
|
1029
|
+
return true;
|
|
964
1030
|
}
|
|
965
1031
|
|
|
966
|
-
|
|
967
|
-
|
|
1032
|
+
bool rpc_server::get_max_size(const rpc_msg_get_max_size_req & request, rpc_msg_get_max_size_rsp & response) {
|
|
1033
|
+
uint32_t dev_id = request.device;
|
|
1034
|
+
if (dev_id >= backends.size()) {
|
|
1035
|
+
return false;
|
|
1036
|
+
}
|
|
1037
|
+
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backends[dev_id]);
|
|
968
1038
|
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
|
969
|
-
|
|
1039
|
+
LOG_DBG("[%s] device: %d, max_size: %lu\n", __func__, dev_id, max_size);
|
|
970
1040
|
response.max_size = max_size;
|
|
1041
|
+
return true;
|
|
971
1042
|
}
|
|
972
1043
|
|
|
973
1044
|
bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
|
|
974
|
-
|
|
1045
|
+
LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
|
975
1046
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
|
976
1047
|
if (buffers.find(buffer) == buffers.end()) {
|
|
977
1048
|
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
|
@@ -983,7 +1054,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
|
|
|
983
1054
|
}
|
|
984
1055
|
|
|
985
1056
|
bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
|
986
|
-
|
|
1057
|
+
LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
|
987
1058
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
|
988
1059
|
if (buffers.find(buffer) == buffers.end()) {
|
|
989
1060
|
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
|
@@ -995,7 +1066,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
|
|
995
1066
|
}
|
|
996
1067
|
|
|
997
1068
|
bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
|
|
998
|
-
|
|
1069
|
+
LOG_DBG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
|
|
999
1070
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
|
1000
1071
|
if (buffers.find(buffer) == buffers.end()) {
|
|
1001
1072
|
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
|
@@ -1072,7 +1143,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
|
|
1072
1143
|
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
|
1073
1144
|
return false;
|
|
1074
1145
|
}
|
|
1075
|
-
|
|
1146
|
+
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
|
1076
1147
|
|
|
1077
1148
|
// sanitize tensor->data
|
|
1078
1149
|
{
|
|
@@ -1095,7 +1166,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
|
|
1095
1166
|
fs::path cache_file = fs::path(cache_dir) / hash_str;
|
|
1096
1167
|
std::ofstream ofs(cache_file, std::ios::binary);
|
|
1097
1168
|
ofs.write((const char *)data, size);
|
|
1098
|
-
|
|
1169
|
+
GGML_LOG_INFO("[%s] saved to '%s'\n", __func__, cache_file.c_str());
|
|
1099
1170
|
}
|
|
1100
1171
|
ggml_backend_tensor_set(tensor, data, offset, size);
|
|
1101
1172
|
return true;
|
|
@@ -1141,8 +1212,8 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
|
|
|
1141
1212
|
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
|
1142
1213
|
return false;
|
|
1143
1214
|
}
|
|
1144
|
-
|
|
1145
|
-
|
|
1215
|
+
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
|
|
1216
|
+
__func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
|
|
1146
1217
|
|
|
1147
1218
|
// sanitize tensor->data
|
|
1148
1219
|
{
|
|
@@ -1176,7 +1247,7 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
|
|
|
1176
1247
|
GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
|
|
1177
1248
|
return false;
|
|
1178
1249
|
}
|
|
1179
|
-
|
|
1250
|
+
LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
|
|
1180
1251
|
// Call the backend's buffer_init_tensor function
|
|
1181
1252
|
ggml_backend_buffer_t buffer = tensor->buffer;
|
|
1182
1253
|
if (buffer && buffer->iface.init_tensor) {
|
|
@@ -1209,7 +1280,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
|
|
1209
1280
|
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
|
1210
1281
|
return false;
|
|
1211
1282
|
}
|
|
1212
|
-
|
|
1283
|
+
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
|
|
1213
1284
|
|
|
1214
1285
|
// sanitize tensor->data
|
|
1215
1286
|
{
|
|
@@ -1253,7 +1324,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
|
|
1253
1324
|
uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
|
|
1254
1325
|
|
|
1255
1326
|
if (dst_data + src_size > dst_base + dst_buf_sz) {
|
|
1256
|
-
|
|
1327
|
+
GGML_LOG_ERROR("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
|
|
1257
1328
|
" write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
|
|
1258
1329
|
" buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
|
|
1259
1330
|
__func__,
|
|
@@ -1264,8 +1335,8 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
|
|
1264
1335
|
return false;
|
|
1265
1336
|
}
|
|
1266
1337
|
|
|
1267
|
-
|
|
1268
|
-
|
|
1338
|
+
LOG_DBG("[%s] src->buffer: %p, dst->buffer: %p\n",
|
|
1339
|
+
__func__, (void*) src->buffer, (void*) dst->buffer);
|
|
1269
1340
|
|
|
1270
1341
|
response.result = ggml_backend_buffer_copy_tensor(src, dst);
|
|
1271
1342
|
return true;
|
|
@@ -1325,23 +1396,33 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
|
|
|
1325
1396
|
|
|
1326
1397
|
bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response) {
|
|
1327
1398
|
// serialization format:
|
|
1328
|
-
// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
|
|
1329
|
-
if (input.size() < sizeof(uint32_t)) {
|
|
1399
|
+
// | device (4 bytes) | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
|
|
1400
|
+
if (input.size() < 2*sizeof(uint32_t)) {
|
|
1401
|
+
return false;
|
|
1402
|
+
}
|
|
1403
|
+
const uint8_t * src = input.data();
|
|
1404
|
+
uint32_t device;
|
|
1405
|
+
memcpy(&device, src, sizeof(device));
|
|
1406
|
+
src += sizeof(device);
|
|
1407
|
+
if (device >= backends.size()) {
|
|
1330
1408
|
return false;
|
|
1331
1409
|
}
|
|
1332
1410
|
uint32_t n_nodes;
|
|
1333
|
-
memcpy(&n_nodes,
|
|
1334
|
-
|
|
1411
|
+
memcpy(&n_nodes, src, sizeof(n_nodes));
|
|
1412
|
+
src += sizeof(n_nodes);
|
|
1413
|
+
if (input.size() < 2*sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
|
|
1335
1414
|
return false;
|
|
1336
1415
|
}
|
|
1337
|
-
const uint64_t * nodes = (const uint64_t *)
|
|
1416
|
+
const uint64_t * nodes = (const uint64_t *)src;
|
|
1417
|
+
src += n_nodes*sizeof(uint64_t);
|
|
1338
1418
|
uint32_t n_tensors;
|
|
1339
|
-
memcpy(&n_tensors,
|
|
1340
|
-
|
|
1419
|
+
memcpy(&n_tensors, src, sizeof(n_tensors));
|
|
1420
|
+
src += sizeof(n_tensors);
|
|
1421
|
+
if (input.size() < 2*sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
|
|
1341
1422
|
return false;
|
|
1342
1423
|
}
|
|
1343
|
-
const rpc_tensor * tensors = (const rpc_tensor *)
|
|
1344
|
-
|
|
1424
|
+
const rpc_tensor * tensors = (const rpc_tensor *)src;
|
|
1425
|
+
LOG_DBG("[%s] device: %u, n_nodes: %u, n_tensors: %u\n", __func__, device, n_nodes, n_tensors);
|
|
1345
1426
|
|
|
1346
1427
|
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
|
1347
1428
|
|
|
@@ -1373,27 +1454,41 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
|
|
|
1373
1454
|
return false;
|
|
1374
1455
|
}
|
|
1375
1456
|
}
|
|
1376
|
-
ggml_status status = ggml_backend_graph_compute(
|
|
1457
|
+
ggml_status status = ggml_backend_graph_compute(backends[device], graph);
|
|
1377
1458
|
response.result = status;
|
|
1378
1459
|
return true;
|
|
1379
1460
|
}
|
|
1380
1461
|
|
|
1462
|
+
bool rpc_server::get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response) {
|
|
1463
|
+
uint32_t dev_id = request.device;
|
|
1464
|
+
if (dev_id >= backends.size()) {
|
|
1465
|
+
return false;
|
|
1466
|
+
}
|
|
1467
|
+
size_t free, total;
|
|
1468
|
+
ggml_backend_dev_t dev = ggml_backend_get_device(backends[dev_id]);
|
|
1469
|
+
ggml_backend_dev_memory(dev, &free, &total);
|
|
1470
|
+
response.free_mem = free;
|
|
1471
|
+
response.total_mem = total;
|
|
1472
|
+
LOG_DBG("[%s] device: %u, free_mem: %" PRIu64 ", total_mem: %" PRIu64 "\n", __func__, dev_id, response.free_mem, response.total_mem);
|
|
1473
|
+
return true;
|
|
1474
|
+
}
|
|
1475
|
+
|
|
1381
1476
|
rpc_server::~rpc_server() {
|
|
1382
1477
|
for (auto buffer : buffers) {
|
|
1383
1478
|
ggml_backend_buffer_free(buffer);
|
|
1384
1479
|
}
|
|
1385
1480
|
}
|
|
1386
1481
|
|
|
1387
|
-
static void rpc_serve_client(ggml_backend_t
|
|
1388
|
-
sockfd_t sockfd
|
|
1389
|
-
rpc_server server(
|
|
1482
|
+
static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const char * cache_dir,
|
|
1483
|
+
sockfd_t sockfd) {
|
|
1484
|
+
rpc_server server(backends, cache_dir);
|
|
1390
1485
|
uint8_t cmd;
|
|
1391
1486
|
if (!recv_data(sockfd, &cmd, 1)) {
|
|
1392
1487
|
return;
|
|
1393
1488
|
}
|
|
1394
1489
|
// the first command sent by the client must be HELLO
|
|
1395
1490
|
if (cmd != RPC_CMD_HELLO) {
|
|
1396
|
-
|
|
1491
|
+
GGML_LOG_ERROR("Expected HELLO command, update client\n");
|
|
1397
1492
|
return;
|
|
1398
1493
|
}
|
|
1399
1494
|
if (!recv_msg(sockfd, nullptr, 0)) {
|
|
@@ -1410,7 +1505,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
|
|
1410
1505
|
}
|
|
1411
1506
|
if (cmd >= RPC_CMD_COUNT) {
|
|
1412
1507
|
// fail fast if the command is invalid
|
|
1413
|
-
|
|
1508
|
+
GGML_LOG_ERROR("Unknown command: %d\n", cmd);
|
|
1414
1509
|
break;
|
|
1415
1510
|
}
|
|
1416
1511
|
switch (cmd) {
|
|
@@ -1418,13 +1513,26 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
|
|
1418
1513
|
// HELLO command is handled above
|
|
1419
1514
|
return;
|
|
1420
1515
|
}
|
|
1516
|
+
case RPC_CMD_DEVICE_COUNT: {
|
|
1517
|
+
if (!recv_msg(sockfd, nullptr, 0)) {
|
|
1518
|
+
return;
|
|
1519
|
+
}
|
|
1520
|
+
rpc_msg_device_count_rsp response;
|
|
1521
|
+
response.device_count = backends.size();
|
|
1522
|
+
if (!send_msg(sockfd, &response, sizeof(response))) {
|
|
1523
|
+
return;
|
|
1524
|
+
}
|
|
1525
|
+
break;
|
|
1526
|
+
}
|
|
1421
1527
|
case RPC_CMD_ALLOC_BUFFER: {
|
|
1422
1528
|
rpc_msg_alloc_buffer_req request;
|
|
1423
1529
|
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
|
1424
1530
|
return;
|
|
1425
1531
|
}
|
|
1426
1532
|
rpc_msg_alloc_buffer_rsp response;
|
|
1427
|
-
server.alloc_buffer(request, response)
|
|
1533
|
+
if (!server.alloc_buffer(request, response)) {
|
|
1534
|
+
return;
|
|
1535
|
+
}
|
|
1428
1536
|
if (!send_msg(sockfd, &response, sizeof(response))) {
|
|
1429
1537
|
return;
|
|
1430
1538
|
}
|
|
@@ -1445,22 +1553,28 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
|
|
1445
1553
|
break;
|
|
1446
1554
|
}
|
|
1447
1555
|
case RPC_CMD_GET_ALIGNMENT: {
|
|
1448
|
-
|
|
1556
|
+
rpc_msg_get_alignment_req request;
|
|
1557
|
+
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
|
1449
1558
|
return;
|
|
1450
1559
|
}
|
|
1451
1560
|
rpc_msg_get_alignment_rsp response;
|
|
1452
|
-
server.get_alignment(response)
|
|
1561
|
+
if (!server.get_alignment(request, response)) {
|
|
1562
|
+
return;
|
|
1563
|
+
}
|
|
1453
1564
|
if (!send_msg(sockfd, &response, sizeof(response))) {
|
|
1454
1565
|
return;
|
|
1455
1566
|
}
|
|
1456
1567
|
break;
|
|
1457
1568
|
}
|
|
1458
1569
|
case RPC_CMD_GET_MAX_SIZE: {
|
|
1459
|
-
|
|
1570
|
+
rpc_msg_get_max_size_req request;
|
|
1571
|
+
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
|
1460
1572
|
return;
|
|
1461
1573
|
}
|
|
1462
1574
|
rpc_msg_get_max_size_rsp response;
|
|
1463
|
-
server.get_max_size(response)
|
|
1575
|
+
if (!server.get_max_size(request, response)) {
|
|
1576
|
+
return;
|
|
1577
|
+
}
|
|
1464
1578
|
if (!send_msg(sockfd, &response, sizeof(response))) {
|
|
1465
1579
|
return;
|
|
1466
1580
|
}
|
|
@@ -1586,35 +1700,61 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
|
|
1586
1700
|
break;
|
|
1587
1701
|
}
|
|
1588
1702
|
case RPC_CMD_GET_DEVICE_MEMORY: {
|
|
1589
|
-
|
|
1703
|
+
rpc_msg_get_device_memory_req request;
|
|
1704
|
+
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
|
1590
1705
|
return;
|
|
1591
1706
|
}
|
|
1592
1707
|
rpc_msg_get_device_memory_rsp response;
|
|
1593
|
-
|
|
1594
|
-
|
|
1708
|
+
if (!server.get_device_memory(request, response)) {
|
|
1709
|
+
return;
|
|
1710
|
+
}
|
|
1595
1711
|
if (!send_msg(sockfd, &response, sizeof(response))) {
|
|
1596
1712
|
return;
|
|
1597
1713
|
}
|
|
1598
1714
|
break;
|
|
1599
1715
|
}
|
|
1600
1716
|
default: {
|
|
1601
|
-
|
|
1717
|
+
GGML_LOG_ERROR("Unknown command: %d\n", cmd);
|
|
1602
1718
|
return;
|
|
1603
1719
|
}
|
|
1604
1720
|
}
|
|
1605
1721
|
}
|
|
1606
1722
|
}
|
|
1607
1723
|
|
|
1608
|
-
void ggml_backend_rpc_start_server(
|
|
1609
|
-
|
|
1610
|
-
|
|
1724
|
+
void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
|
|
1725
|
+
size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices) {
|
|
1726
|
+
if (n_devices == 0 || devices == nullptr) {
|
|
1727
|
+
fprintf(stderr, "Invalid arguments to ggml_backend_rpc_start_server\n");
|
|
1728
|
+
return;
|
|
1729
|
+
}
|
|
1730
|
+
std::vector<ggml_backend_t> backends;
|
|
1611
1731
|
printf("Starting RPC server v%d.%d.%d\n",
|
|
1612
1732
|
RPC_PROTO_MAJOR_VERSION,
|
|
1613
1733
|
RPC_PROTO_MINOR_VERSION,
|
|
1614
1734
|
RPC_PROTO_PATCH_VERSION);
|
|
1615
1735
|
printf(" endpoint : %s\n", endpoint);
|
|
1616
1736
|
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
|
|
1617
|
-
printf("
|
|
1737
|
+
printf("Devices:\n");
|
|
1738
|
+
for (size_t i = 0; i < n_devices; i++) {
|
|
1739
|
+
auto dev = devices[i];
|
|
1740
|
+
size_t free, total;
|
|
1741
|
+
ggml_backend_dev_memory(dev, &free, &total);
|
|
1742
|
+
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
|
1743
|
+
total / 1024 / 1024, free / 1024 / 1024);
|
|
1744
|
+
auto backend = ggml_backend_dev_init(dev, nullptr);
|
|
1745
|
+
if (!backend) {
|
|
1746
|
+
fprintf(stderr, "Failed to create backend for device %s\n", dev->iface.get_name(dev));
|
|
1747
|
+
return;
|
|
1748
|
+
}
|
|
1749
|
+
backends.push_back(backend);
|
|
1750
|
+
ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
|
|
1751
|
+
if (reg) {
|
|
1752
|
+
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
|
1753
|
+
if (ggml_backend_set_n_threads_fn) {
|
|
1754
|
+
ggml_backend_set_n_threads_fn(backend, n_threads);
|
|
1755
|
+
}
|
|
1756
|
+
}
|
|
1757
|
+
}
|
|
1618
1758
|
|
|
1619
1759
|
std::string host;
|
|
1620
1760
|
int port;
|
|
@@ -1642,22 +1782,27 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
|
|
|
1642
1782
|
fprintf(stderr, "Failed to accept client connection\n");
|
|
1643
1783
|
return;
|
|
1644
1784
|
}
|
|
1645
|
-
printf("Accepted client connection
|
|
1785
|
+
printf("Accepted client connection\n");
|
|
1646
1786
|
fflush(stdout);
|
|
1647
|
-
rpc_serve_client(
|
|
1787
|
+
rpc_serve_client(backends, cache_dir, client_socket->fd);
|
|
1648
1788
|
printf("Client connection closed\n");
|
|
1649
1789
|
fflush(stdout);
|
|
1650
1790
|
}
|
|
1651
1791
|
#ifdef _WIN32
|
|
1652
1792
|
WSACleanup();
|
|
1653
1793
|
#endif
|
|
1794
|
+
for (auto backend : backends) {
|
|
1795
|
+
ggml_backend_free(backend);
|
|
1796
|
+
}
|
|
1654
1797
|
}
|
|
1655
1798
|
|
|
1656
1799
|
// device interface
|
|
1657
1800
|
|
|
1658
1801
|
struct ggml_backend_rpc_device_context {
|
|
1659
1802
|
std::string endpoint;
|
|
1803
|
+
uint32_t device;
|
|
1660
1804
|
std::string name;
|
|
1805
|
+
std::string description;
|
|
1661
1806
|
};
|
|
1662
1807
|
|
|
1663
1808
|
static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
|
|
@@ -1669,15 +1814,13 @@ static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
|
|
|
1669
1814
|
static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
|
|
1670
1815
|
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
|
1671
1816
|
|
|
1672
|
-
return ctx->
|
|
1817
|
+
return ctx->description.c_str();
|
|
1673
1818
|
}
|
|
1674
1819
|
|
|
1675
1820
|
static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
|
1676
1821
|
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
|
1677
1822
|
|
|
1678
|
-
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
|
|
1679
|
-
|
|
1680
|
-
GGML_UNUSED(dev);
|
|
1823
|
+
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), ctx->device, free, total);
|
|
1681
1824
|
}
|
|
1682
1825
|
|
|
1683
1826
|
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
|
@@ -1703,7 +1846,7 @@ static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggm
|
|
|
1703
1846
|
static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
|
|
1704
1847
|
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
|
1705
1848
|
|
|
1706
|
-
return ggml_backend_rpc_init(ctx->endpoint.c_str());
|
|
1849
|
+
return ggml_backend_rpc_init(ctx->endpoint.c_str(), ctx->device);
|
|
1707
1850
|
|
|
1708
1851
|
GGML_UNUSED(params);
|
|
1709
1852
|
}
|
|
@@ -1711,7 +1854,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const
|
|
|
1711
1854
|
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
|
|
1712
1855
|
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
|
1713
1856
|
|
|
1714
|
-
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
|
1857
|
+
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str(), ctx->device);
|
|
1715
1858
|
|
|
1716
1859
|
GGML_UNUSED(dev);
|
|
1717
1860
|
}
|
|
@@ -1729,7 +1872,7 @@ static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_b
|
|
|
1729
1872
|
}
|
|
1730
1873
|
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
|
1731
1874
|
ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
|
|
1732
|
-
return buft_ctx->endpoint == dev_ctx->endpoint;
|
|
1875
|
+
return buft_ctx->endpoint == dev_ctx->endpoint && buft_ctx->device == dev_ctx->device;
|
|
1733
1876
|
}
|
|
1734
1877
|
|
|
1735
1878
|
static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
|
@@ -1752,28 +1895,34 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
|
|
1752
1895
|
|
|
1753
1896
|
// backend reg interface
|
|
1754
1897
|
|
|
1755
|
-
|
|
1756
|
-
|
|
1898
|
+
struct ggml_backend_rpc_reg_context {
|
|
1899
|
+
std::string name;
|
|
1900
|
+
std::vector<ggml_backend_dev_t> devices;
|
|
1901
|
+
};
|
|
1757
1902
|
|
|
1758
|
-
|
|
1903
|
+
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
|
|
1904
|
+
ggml_backend_rpc_reg_context * ctx = (ggml_backend_rpc_reg_context *)reg->context;
|
|
1905
|
+
return ctx ? ctx->name.c_str() : "RPC";
|
|
1759
1906
|
}
|
|
1760
1907
|
|
|
1761
1908
|
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
GGML_UNUSED(reg);
|
|
1909
|
+
ggml_backend_rpc_reg_context * ctx = (ggml_backend_rpc_reg_context *)reg->context;
|
|
1910
|
+
return ctx ? ctx->devices.size() : 0;
|
|
1765
1911
|
}
|
|
1766
1912
|
|
|
1767
1913
|
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1914
|
+
ggml_backend_rpc_reg_context * ctx = (ggml_backend_rpc_reg_context *)reg->context;
|
|
1915
|
+
if (ctx == nullptr) {
|
|
1916
|
+
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_rpc_add_server instead");
|
|
1917
|
+
} else {
|
|
1918
|
+
GGML_ASSERT(index < ctx->devices.size());
|
|
1919
|
+
return ctx->devices[index];
|
|
1920
|
+
}
|
|
1772
1921
|
}
|
|
1773
1922
|
|
|
1774
1923
|
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
1775
|
-
if (std::strcmp(name, "
|
|
1776
|
-
return (void *)
|
|
1924
|
+
if (std::strcmp(name, "ggml_backend_rpc_add_server") == 0) {
|
|
1925
|
+
return (void *)ggml_backend_rpc_add_server;
|
|
1777
1926
|
}
|
|
1778
1927
|
if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
|
|
1779
1928
|
return (void *)ggml_backend_rpc_start_server;
|
|
@@ -1800,30 +1949,61 @@ ggml_backend_reg_t ggml_backend_rpc_reg(void) {
|
|
|
1800
1949
|
return &ggml_backend_rpc_reg;
|
|
1801
1950
|
}
|
|
1802
1951
|
|
|
1803
|
-
|
|
1804
|
-
|
|
1952
|
+
static uint32_t ggml_backend_rpc_get_device_count(const char * endpoint) {
|
|
1953
|
+
auto sock = get_socket(endpoint);
|
|
1954
|
+
rpc_msg_device_count_rsp response;
|
|
1955
|
+
bool status = send_rpc_cmd(sock, RPC_CMD_DEVICE_COUNT, nullptr, 0, &response, sizeof(response));
|
|
1956
|
+
RPC_STATUS_ASSERT(status);
|
|
1957
|
+
return response.device_count;
|
|
1958
|
+
}
|
|
1959
|
+
|
|
1960
|
+
static const ggml_backend_reg_i ggml_backend_rpc_reg_interface = {
|
|
1961
|
+
/* .get_name = */ ggml_backend_rpc_reg_get_name,
|
|
1962
|
+
/* .get_device_count = */ ggml_backend_rpc_reg_get_device_count,
|
|
1963
|
+
/* .get_device = */ ggml_backend_rpc_reg_get_device,
|
|
1964
|
+
/* .get_proc_address = */ ggml_backend_rpc_get_proc_address,
|
|
1965
|
+
};
|
|
1805
1966
|
|
|
1967
|
+
ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) {
|
|
1968
|
+
static std::unordered_map<std::string, ggml_backend_reg_t> reg_map;
|
|
1806
1969
|
static std::mutex mutex;
|
|
1970
|
+
static uint32_t dev_id = 0;
|
|
1807
1971
|
std::lock_guard<std::mutex> lock(mutex);
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
return dev_map[endpoint];
|
|
1972
|
+
if (reg_map.find(endpoint) != reg_map.end()) {
|
|
1973
|
+
return reg_map[endpoint];
|
|
1811
1974
|
}
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1975
|
+
uint32_t dev_count = ggml_backend_rpc_get_device_count(endpoint);
|
|
1976
|
+
if (dev_count == 0) {
|
|
1977
|
+
return nullptr;
|
|
1978
|
+
}
|
|
1979
|
+
ggml_backend_rpc_reg_context * ctx = new ggml_backend_rpc_reg_context;
|
|
1980
|
+
ctx->name = "RPC[" + std::string(endpoint) + "]";
|
|
1981
|
+
for (uint32_t ind = 0; ind < dev_count; ind++) {
|
|
1982
|
+
std::string dev_name = "RPC" + std::to_string(dev_id);
|
|
1983
|
+
std::string dev_desc = std::string(endpoint);
|
|
1984
|
+
ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context {
|
|
1985
|
+
/* .endpoint = */ endpoint,
|
|
1986
|
+
/* .device = */ ind,
|
|
1987
|
+
/* .name = */ dev_name,
|
|
1988
|
+
/* .description = */ dev_desc
|
|
1989
|
+
};
|
|
1990
|
+
|
|
1991
|
+
ggml_backend_dev_t dev = new ggml_backend_device {
|
|
1992
|
+
/* .iface = */ ggml_backend_rpc_device_i,
|
|
1993
|
+
/* .reg = */ ggml_backend_rpc_reg(),
|
|
1994
|
+
/* .context = */ dev_ctx,
|
|
1995
|
+
};
|
|
1996
|
+
ctx->devices.push_back(dev);
|
|
1997
|
+
dev_id++;
|
|
1998
|
+
}
|
|
1999
|
+
ggml_backend_reg_t reg = new ggml_backend_reg {
|
|
2000
|
+
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
|
2001
|
+
/* .iface = */ ggml_backend_rpc_reg_interface,
|
|
2002
|
+
/* .context = */ ctx
|
|
1822
2003
|
};
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
return dev;
|
|
2004
|
+
reg_map[endpoint] = reg;
|
|
2005
|
+
return reg;
|
|
1827
2006
|
}
|
|
1828
2007
|
|
|
2008
|
+
|
|
1829
2009
|
GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
|