@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -0
- package/android/CMakeLists.txt +2 -0
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
#include "chat.h"
|
|
4
4
|
#include "common.h"
|
|
5
|
-
#include "gguf.h" // for reading GGUF splits
|
|
6
5
|
#include "json-schema-to-grammar.h"
|
|
7
6
|
#include "log.h"
|
|
8
7
|
#include "sampling.h"
|
|
8
|
+
#include "download.h"
|
|
9
9
|
|
|
10
10
|
// fix problem with std::min and std::max
|
|
11
11
|
#if defined(_WIN32)
|
|
@@ -22,26 +22,30 @@
|
|
|
22
22
|
#include <algorithm>
|
|
23
23
|
#include <climits>
|
|
24
24
|
#include <cstdarg>
|
|
25
|
-
#include <filesystem>
|
|
26
25
|
#include <fstream>
|
|
27
26
|
#include <list>
|
|
28
27
|
#include <regex>
|
|
29
28
|
#include <set>
|
|
30
29
|
#include <string>
|
|
31
|
-
#include <thread>
|
|
30
|
+
#include <thread> // for hardware_concurrency
|
|
32
31
|
#include <vector>
|
|
33
32
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
#
|
|
37
|
-
#
|
|
38
|
-
#
|
|
39
|
-
#
|
|
33
|
+
#ifdef __linux__
|
|
34
|
+
#include <linux/limits.h>
|
|
35
|
+
#elif defined(_WIN32)
|
|
36
|
+
# if !defined(PATH_MAX)
|
|
37
|
+
# define PATH_MAX MAX_PATH
|
|
38
|
+
# endif
|
|
39
|
+
#elif defined(_AIX)
|
|
40
|
+
#include <sys/limits.h>
|
|
41
|
+
#else
|
|
42
|
+
#include <sys/syslimits.h>
|
|
40
43
|
#endif
|
|
44
|
+
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
|
41
45
|
|
|
42
46
|
using json = nlohmann::ordered_json;
|
|
43
47
|
|
|
44
|
-
std::initializer_list<enum llama_example> mmproj_examples = {
|
|
48
|
+
static std::initializer_list<enum llama_example> mmproj_examples = {
|
|
45
49
|
LLAMA_EXAMPLE_MTMD,
|
|
46
50
|
LLAMA_EXAMPLE_SERVER,
|
|
47
51
|
};
|
|
@@ -56,22 +60,13 @@ static std::string read_file(const std::string & fname) {
|
|
|
56
60
|
return content;
|
|
57
61
|
}
|
|
58
62
|
|
|
59
|
-
static void write_file(const std::string & fname, const std::string & content) {
|
|
60
|
-
std::ofstream file(fname);
|
|
61
|
-
if (!file) {
|
|
62
|
-
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
|
63
|
-
}
|
|
64
|
-
file << content;
|
|
65
|
-
file.close();
|
|
66
|
-
}
|
|
67
|
-
|
|
68
63
|
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
|
69
|
-
this->examples =
|
|
64
|
+
this->examples = examples;
|
|
70
65
|
return *this;
|
|
71
66
|
}
|
|
72
67
|
|
|
73
68
|
common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
|
|
74
|
-
this->excludes =
|
|
69
|
+
this->excludes = excludes;
|
|
75
70
|
return *this;
|
|
76
71
|
}
|
|
77
72
|
|
|
@@ -94,7 +89,7 @@ bool common_arg::is_exclude(enum llama_example ex) {
|
|
|
94
89
|
return excludes.find(ex) != excludes.end();
|
|
95
90
|
}
|
|
96
91
|
|
|
97
|
-
bool common_arg::get_value_from_env(std::string & output) {
|
|
92
|
+
bool common_arg::get_value_from_env(std::string & output) const {
|
|
98
93
|
if (env == nullptr) return false;
|
|
99
94
|
char * value = std::getenv(env);
|
|
100
95
|
if (value) {
|
|
@@ -104,7 +99,7 @@ bool common_arg::get_value_from_env(std::string & output) {
|
|
|
104
99
|
return false;
|
|
105
100
|
}
|
|
106
101
|
|
|
107
|
-
bool common_arg::has_value_from_env() {
|
|
102
|
+
bool common_arg::has_value_from_env() const {
|
|
108
103
|
return env != nullptr && std::getenv(env);
|
|
109
104
|
}
|
|
110
105
|
|
|
@@ -172,579 +167,6 @@ std::string common_arg::to_string() {
|
|
|
172
167
|
return ss.str();
|
|
173
168
|
}
|
|
174
169
|
|
|
175
|
-
//
|
|
176
|
-
// downloader
|
|
177
|
-
//
|
|
178
|
-
|
|
179
|
-
struct common_hf_file_res {
|
|
180
|
-
std::string repo; // repo name with ":tag" removed
|
|
181
|
-
std::string ggufFile;
|
|
182
|
-
std::string mmprojFile;
|
|
183
|
-
};
|
|
184
|
-
|
|
185
|
-
#ifdef LLAMA_USE_CURL
|
|
186
|
-
|
|
187
|
-
bool common_has_curl() {
|
|
188
|
-
return true;
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
#ifdef __linux__
|
|
192
|
-
#include <linux/limits.h>
|
|
193
|
-
#elif defined(_WIN32)
|
|
194
|
-
# if !defined(PATH_MAX)
|
|
195
|
-
# define PATH_MAX MAX_PATH
|
|
196
|
-
# endif
|
|
197
|
-
#elif defined(_AIX)
|
|
198
|
-
#include <sys/limits.h>
|
|
199
|
-
#else
|
|
200
|
-
#include <sys/syslimits.h>
|
|
201
|
-
#endif
|
|
202
|
-
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
|
203
|
-
|
|
204
|
-
//
|
|
205
|
-
// CURL utils
|
|
206
|
-
//
|
|
207
|
-
|
|
208
|
-
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
|
209
|
-
|
|
210
|
-
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
|
|
211
|
-
struct curl_slist_ptr {
|
|
212
|
-
struct curl_slist * ptr = nullptr;
|
|
213
|
-
~curl_slist_ptr() {
|
|
214
|
-
if (ptr) {
|
|
215
|
-
curl_slist_free_all(ptr);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
};
|
|
219
|
-
|
|
220
|
-
#define CURL_MAX_RETRY 3
|
|
221
|
-
#define CURL_RETRY_DELAY_SECONDS 2
|
|
222
|
-
|
|
223
|
-
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
|
|
224
|
-
int remaining_attempts = max_attempts;
|
|
225
|
-
|
|
226
|
-
while (remaining_attempts > 0) {
|
|
227
|
-
LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
|
228
|
-
|
|
229
|
-
CURLcode res = curl_easy_perform(curl);
|
|
230
|
-
if (res == CURLE_OK) {
|
|
231
|
-
return true;
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
|
235
|
-
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
|
236
|
-
|
|
237
|
-
remaining_attempts--;
|
|
238
|
-
if (remaining_attempts == 0) break;
|
|
239
|
-
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
|
243
|
-
|
|
244
|
-
return false;
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
// download one single file from remote URL to local path
|
|
248
|
-
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
|
|
249
|
-
// Check if the file already exists locally
|
|
250
|
-
auto file_exists = std::filesystem::exists(path);
|
|
251
|
-
|
|
252
|
-
// If the file exists, check its JSON metadata companion file.
|
|
253
|
-
std::string metadata_path = path + ".json";
|
|
254
|
-
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
|
255
|
-
std::string etag;
|
|
256
|
-
std::string last_modified;
|
|
257
|
-
|
|
258
|
-
if (file_exists) {
|
|
259
|
-
if (offline) {
|
|
260
|
-
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
|
|
261
|
-
return true; // skip verification/downloading
|
|
262
|
-
}
|
|
263
|
-
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
|
264
|
-
std::ifstream metadata_in(metadata_path);
|
|
265
|
-
if (metadata_in.good()) {
|
|
266
|
-
try {
|
|
267
|
-
metadata_in >> metadata;
|
|
268
|
-
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
|
269
|
-
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
|
270
|
-
etag = metadata.at("etag");
|
|
271
|
-
}
|
|
272
|
-
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
|
273
|
-
last_modified = metadata.at("lastModified");
|
|
274
|
-
}
|
|
275
|
-
} catch (const nlohmann::json::exception & e) {
|
|
276
|
-
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
|
277
|
-
}
|
|
278
|
-
}
|
|
279
|
-
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
|
280
|
-
} else {
|
|
281
|
-
if (offline) {
|
|
282
|
-
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
|
|
283
|
-
return false;
|
|
284
|
-
}
|
|
285
|
-
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
// Send a HEAD request to retrieve the etag and last-modified headers
|
|
289
|
-
struct common_load_model_from_url_headers {
|
|
290
|
-
std::string etag;
|
|
291
|
-
std::string last_modified;
|
|
292
|
-
};
|
|
293
|
-
|
|
294
|
-
common_load_model_from_url_headers headers;
|
|
295
|
-
bool head_request_ok = false;
|
|
296
|
-
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
|
297
|
-
|
|
298
|
-
// Initialize libcurl
|
|
299
|
-
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
|
300
|
-
curl_slist_ptr http_headers;
|
|
301
|
-
if (!curl) {
|
|
302
|
-
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
|
303
|
-
return false;
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
// Set the URL, allow to follow http redirection
|
|
307
|
-
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
|
308
|
-
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
|
309
|
-
|
|
310
|
-
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
|
311
|
-
// Check if hf-token or bearer-token was specified
|
|
312
|
-
if (!bearer_token.empty()) {
|
|
313
|
-
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
|
314
|
-
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
|
315
|
-
}
|
|
316
|
-
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
|
317
|
-
|
|
318
|
-
#if defined(_WIN32)
|
|
319
|
-
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
|
320
|
-
// operating system. Currently implemented under MS-Windows.
|
|
321
|
-
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
|
322
|
-
#endif
|
|
323
|
-
|
|
324
|
-
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
|
325
|
-
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
|
326
|
-
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
|
327
|
-
|
|
328
|
-
static std::regex header_regex("([^:]+): (.*)\r\n");
|
|
329
|
-
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
|
330
|
-
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
|
331
|
-
|
|
332
|
-
std::string header(buffer, n_items);
|
|
333
|
-
std::smatch match;
|
|
334
|
-
if (std::regex_match(header, match, header_regex)) {
|
|
335
|
-
const std::string & key = match[1];
|
|
336
|
-
const std::string & value = match[2];
|
|
337
|
-
if (std::regex_match(key, match, etag_regex)) {
|
|
338
|
-
headers->etag = value;
|
|
339
|
-
} else if (std::regex_match(key, match, last_modified_regex)) {
|
|
340
|
-
headers->last_modified = value;
|
|
341
|
-
}
|
|
342
|
-
}
|
|
343
|
-
return n_items;
|
|
344
|
-
};
|
|
345
|
-
|
|
346
|
-
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
|
347
|
-
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
|
348
|
-
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
|
349
|
-
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
|
350
|
-
|
|
351
|
-
// we only allow retrying once for HEAD requests
|
|
352
|
-
// this is for the use case of using running offline (no internet), retrying can be annoying
|
|
353
|
-
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
|
|
354
|
-
if (!was_perform_successful) {
|
|
355
|
-
head_request_ok = false;
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
long http_code = 0;
|
|
359
|
-
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
|
360
|
-
if (http_code == 200) {
|
|
361
|
-
head_request_ok = true;
|
|
362
|
-
} else {
|
|
363
|
-
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
|
364
|
-
head_request_ok = false;
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
// if head_request_ok is false, we don't have the etag or last-modified headers
|
|
368
|
-
// we leave should_download as-is, which is true if the file does not exist
|
|
369
|
-
if (head_request_ok) {
|
|
370
|
-
// check if ETag or Last-Modified headers are different
|
|
371
|
-
// if it is, we need to download the file again
|
|
372
|
-
if (!etag.empty() && etag != headers.etag) {
|
|
373
|
-
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
|
374
|
-
should_download = true;
|
|
375
|
-
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
|
376
|
-
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
|
377
|
-
should_download = true;
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
if (should_download) {
|
|
382
|
-
std::string path_temporary = path + ".downloadInProgress";
|
|
383
|
-
if (file_exists) {
|
|
384
|
-
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
|
385
|
-
if (remove(path.c_str()) != 0) {
|
|
386
|
-
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
|
387
|
-
return false;
|
|
388
|
-
}
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
// Set the output file
|
|
392
|
-
|
|
393
|
-
struct FILE_deleter {
|
|
394
|
-
void operator()(FILE * f) const {
|
|
395
|
-
fclose(f);
|
|
396
|
-
}
|
|
397
|
-
};
|
|
398
|
-
|
|
399
|
-
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
|
400
|
-
if (!outfile) {
|
|
401
|
-
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
|
402
|
-
return false;
|
|
403
|
-
}
|
|
404
|
-
|
|
405
|
-
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
|
406
|
-
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
|
|
407
|
-
return fwrite(data, size, nmemb, (FILE *)fd);
|
|
408
|
-
};
|
|
409
|
-
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
|
|
410
|
-
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
|
411
|
-
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
|
|
412
|
-
|
|
413
|
-
// display download progress
|
|
414
|
-
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
|
415
|
-
|
|
416
|
-
// helper function to hide password in URL
|
|
417
|
-
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
|
418
|
-
std::size_t protocol_pos = url.find("://");
|
|
419
|
-
if (protocol_pos == std::string::npos) {
|
|
420
|
-
return url; // Malformed URL
|
|
421
|
-
}
|
|
422
|
-
|
|
423
|
-
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
|
424
|
-
if (at_pos == std::string::npos) {
|
|
425
|
-
return url; // No password in URL
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
|
429
|
-
};
|
|
430
|
-
|
|
431
|
-
// start the download
|
|
432
|
-
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
|
433
|
-
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
|
434
|
-
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
|
|
435
|
-
if (!was_perform_successful) {
|
|
436
|
-
return false;
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
long http_code = 0;
|
|
440
|
-
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
|
441
|
-
if (http_code < 200 || http_code >= 400) {
|
|
442
|
-
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
|
443
|
-
return false;
|
|
444
|
-
}
|
|
445
|
-
|
|
446
|
-
// Causes file to be closed explicitly here before we rename it.
|
|
447
|
-
outfile.reset();
|
|
448
|
-
|
|
449
|
-
// Write the updated JSON metadata file.
|
|
450
|
-
metadata.update({
|
|
451
|
-
{"url", url},
|
|
452
|
-
{"etag", headers.etag},
|
|
453
|
-
{"lastModified", headers.last_modified}
|
|
454
|
-
});
|
|
455
|
-
write_file(metadata_path, metadata.dump(4));
|
|
456
|
-
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
|
457
|
-
|
|
458
|
-
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
|
459
|
-
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
|
460
|
-
return false;
|
|
461
|
-
}
|
|
462
|
-
} else {
|
|
463
|
-
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
|
464
|
-
}
|
|
465
|
-
|
|
466
|
-
return true;
|
|
467
|
-
}
|
|
468
|
-
|
|
469
|
-
// download multiple files from remote URLs to local paths
|
|
470
|
-
// the input is a vector of pairs <url, path>
|
|
471
|
-
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
|
|
472
|
-
// Prepare download in parallel
|
|
473
|
-
std::vector<std::future<bool>> futures_download;
|
|
474
|
-
for (auto const & item : urls) {
|
|
475
|
-
futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
|
|
476
|
-
return common_download_file_single(it.first, it.second, bearer_token, offline);
|
|
477
|
-
}, item));
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
// Wait for all downloads to complete
|
|
481
|
-
for (auto & f : futures_download) {
|
|
482
|
-
if (!f.get()) {
|
|
483
|
-
return false;
|
|
484
|
-
}
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
return true;
|
|
488
|
-
}
|
|
489
|
-
|
|
490
|
-
static bool common_download_model(
|
|
491
|
-
const common_params_model & model,
|
|
492
|
-
const std::string & bearer_token,
|
|
493
|
-
bool offline) {
|
|
494
|
-
// Basic validation of the model.url
|
|
495
|
-
if (model.url.empty()) {
|
|
496
|
-
LOG_ERR("%s: invalid model url\n", __func__);
|
|
497
|
-
return false;
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
|
|
501
|
-
return false;
|
|
502
|
-
}
|
|
503
|
-
|
|
504
|
-
// check for additional GGUFs split to download
|
|
505
|
-
int n_split = 0;
|
|
506
|
-
{
|
|
507
|
-
struct gguf_init_params gguf_params = {
|
|
508
|
-
/*.no_alloc = */ true,
|
|
509
|
-
/*.ctx = */ NULL,
|
|
510
|
-
};
|
|
511
|
-
auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
|
|
512
|
-
if (!ctx_gguf) {
|
|
513
|
-
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
|
|
514
|
-
return false;
|
|
515
|
-
}
|
|
516
|
-
|
|
517
|
-
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
|
518
|
-
if (key_n_split >= 0) {
|
|
519
|
-
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
gguf_free(ctx_gguf);
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
if (n_split > 1) {
|
|
526
|
-
char split_prefix[PATH_MAX] = {0};
|
|
527
|
-
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
|
528
|
-
|
|
529
|
-
// Verify the first split file format
|
|
530
|
-
// and extract split URL and PATH prefixes
|
|
531
|
-
{
|
|
532
|
-
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
|
|
533
|
-
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
|
|
534
|
-
return false;
|
|
535
|
-
}
|
|
536
|
-
|
|
537
|
-
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
|
|
538
|
-
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
|
|
539
|
-
return false;
|
|
540
|
-
}
|
|
541
|
-
}
|
|
542
|
-
|
|
543
|
-
std::vector<std::pair<std::string, std::string>> urls;
|
|
544
|
-
for (int idx = 1; idx < n_split; idx++) {
|
|
545
|
-
char split_path[PATH_MAX] = {0};
|
|
546
|
-
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
|
|
547
|
-
|
|
548
|
-
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
|
549
|
-
llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
|
|
550
|
-
|
|
551
|
-
if (std::string(split_path) == model.path) {
|
|
552
|
-
continue; // skip the already downloaded file
|
|
553
|
-
}
|
|
554
|
-
|
|
555
|
-
urls.push_back({split_url, split_path});
|
|
556
|
-
}
|
|
557
|
-
|
|
558
|
-
// Download in parallel
|
|
559
|
-
common_download_file_multiple(urls, bearer_token, offline);
|
|
560
|
-
}
|
|
561
|
-
|
|
562
|
-
return true;
|
|
563
|
-
}
|
|
564
|
-
|
|
565
|
-
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
|
|
566
|
-
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
|
567
|
-
curl_slist_ptr http_headers;
|
|
568
|
-
std::vector<char> res_buffer;
|
|
569
|
-
|
|
570
|
-
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
|
571
|
-
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
|
572
|
-
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
|
573
|
-
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
|
574
|
-
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
|
575
|
-
auto data_vec = static_cast<std::vector<char> *>(data);
|
|
576
|
-
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
|
|
577
|
-
return size * nmemb;
|
|
578
|
-
};
|
|
579
|
-
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
|
580
|
-
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
|
|
581
|
-
#if defined(_WIN32)
|
|
582
|
-
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
|
583
|
-
#endif
|
|
584
|
-
if (params.timeout > 0) {
|
|
585
|
-
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
|
|
586
|
-
}
|
|
587
|
-
if (params.max_size > 0) {
|
|
588
|
-
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
|
|
589
|
-
}
|
|
590
|
-
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
|
591
|
-
for (const auto & header : params.headers) {
|
|
592
|
-
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
|
|
593
|
-
}
|
|
594
|
-
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
|
595
|
-
|
|
596
|
-
CURLcode res = curl_easy_perform(curl.get());
|
|
597
|
-
|
|
598
|
-
if (res != CURLE_OK) {
|
|
599
|
-
std::string error_msg = curl_easy_strerror(res);
|
|
600
|
-
throw std::runtime_error("error: cannot make GET request: " + error_msg);
|
|
601
|
-
}
|
|
602
|
-
|
|
603
|
-
long res_code;
|
|
604
|
-
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
|
605
|
-
|
|
606
|
-
return { res_code, std::move(res_buffer) };
|
|
607
|
-
}
|
|
608
|
-
|
|
609
|
-
/**
|
|
610
|
-
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
|
611
|
-
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
|
612
|
-
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
|
|
613
|
-
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
|
|
614
|
-
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
|
|
615
|
-
*
|
|
616
|
-
* Return pair of <repo, file> (with "repo" already having tag removed)
|
|
617
|
-
*
|
|
618
|
-
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
|
619
|
-
*/
|
|
620
|
-
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
|
|
621
|
-
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
|
622
|
-
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
|
623
|
-
std::string hf_repo = parts[0];
|
|
624
|
-
if (string_split<std::string>(hf_repo, '/').size() != 2) {
|
|
625
|
-
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
|
626
|
-
}
|
|
627
|
-
|
|
628
|
-
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
|
|
629
|
-
|
|
630
|
-
// headers
|
|
631
|
-
std::vector<std::string> headers;
|
|
632
|
-
headers.push_back("Accept: application/json");
|
|
633
|
-
if (!bearer_token.empty()) {
|
|
634
|
-
headers.push_back("Authorization: Bearer " + bearer_token);
|
|
635
|
-
}
|
|
636
|
-
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
|
637
|
-
// User-Agent header is already set in common_remote_get_content, no need to set it here
|
|
638
|
-
|
|
639
|
-
// we use "=" to avoid clashing with other component, while still being allowed on windows
|
|
640
|
-
std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
|
|
641
|
-
string_replace_all(cached_response_fname, "/", "_");
|
|
642
|
-
std::string cached_response_path = fs_get_cache_file(cached_response_fname);
|
|
643
|
-
|
|
644
|
-
// make the request
|
|
645
|
-
common_remote_params params;
|
|
646
|
-
params.headers = headers;
|
|
647
|
-
long res_code = 0;
|
|
648
|
-
std::string res_str;
|
|
649
|
-
bool use_cache = false;
|
|
650
|
-
if (!offline) {
|
|
651
|
-
try {
|
|
652
|
-
auto res = common_remote_get_content(url, params);
|
|
653
|
-
res_code = res.first;
|
|
654
|
-
res_str = std::string(res.second.data(), res.second.size());
|
|
655
|
-
} catch (const std::exception & e) {
|
|
656
|
-
LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
|
|
657
|
-
}
|
|
658
|
-
}
|
|
659
|
-
if (res_code == 0) {
|
|
660
|
-
if (std::filesystem::exists(cached_response_path)) {
|
|
661
|
-
LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
|
|
662
|
-
res_str = read_file(cached_response_path);
|
|
663
|
-
res_code = 200;
|
|
664
|
-
use_cache = true;
|
|
665
|
-
} else {
|
|
666
|
-
throw std::runtime_error(
|
|
667
|
-
offline ? "error: failed to get manifest (offline mode)"
|
|
668
|
-
: "error: failed to get manifest (check your internet connection)");
|
|
669
|
-
}
|
|
670
|
-
}
|
|
671
|
-
std::string ggufFile;
|
|
672
|
-
std::string mmprojFile;
|
|
673
|
-
|
|
674
|
-
if (res_code == 200 || res_code == 304) {
|
|
675
|
-
// extract ggufFile.rfilename in json, using regex
|
|
676
|
-
{
|
|
677
|
-
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
|
678
|
-
std::smatch match;
|
|
679
|
-
if (std::regex_search(res_str, match, pattern)) {
|
|
680
|
-
ggufFile = match[1].str();
|
|
681
|
-
}
|
|
682
|
-
}
|
|
683
|
-
// extract mmprojFile.rfilename in json, using regex
|
|
684
|
-
{
|
|
685
|
-
std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
|
686
|
-
std::smatch match;
|
|
687
|
-
if (std::regex_search(res_str, match, pattern)) {
|
|
688
|
-
mmprojFile = match[1].str();
|
|
689
|
-
}
|
|
690
|
-
}
|
|
691
|
-
if (!use_cache) {
|
|
692
|
-
// if not using cached response, update the cache file
|
|
693
|
-
write_file(cached_response_path, res_str);
|
|
694
|
-
}
|
|
695
|
-
} else if (res_code == 401) {
|
|
696
|
-
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
|
697
|
-
} else {
|
|
698
|
-
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
|
699
|
-
}
|
|
700
|
-
|
|
701
|
-
// check response
|
|
702
|
-
if (ggufFile.empty()) {
|
|
703
|
-
throw std::runtime_error("error: model does not have ggufFile");
|
|
704
|
-
}
|
|
705
|
-
|
|
706
|
-
return { hf_repo, ggufFile, mmprojFile };
|
|
707
|
-
}
|
|
708
|
-
|
|
709
|
-
#else
|
|
710
|
-
|
|
711
|
-
bool common_has_curl() {
|
|
712
|
-
return false;
|
|
713
|
-
}
|
|
714
|
-
|
|
715
|
-
static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
|
|
716
|
-
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
|
717
|
-
return false;
|
|
718
|
-
}
|
|
719
|
-
|
|
720
|
-
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
|
|
721
|
-
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
722
|
-
return false;
|
|
723
|
-
}
|
|
724
|
-
|
|
725
|
-
static bool common_download_model(
|
|
726
|
-
const common_params_model &,
|
|
727
|
-
const std::string &,
|
|
728
|
-
bool) {
|
|
729
|
-
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
730
|
-
return false;
|
|
731
|
-
}
|
|
732
|
-
|
|
733
|
-
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
|
|
734
|
-
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
735
|
-
return {};
|
|
736
|
-
}
|
|
737
|
-
|
|
738
|
-
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
|
|
739
|
-
if (!url.empty()) {
|
|
740
|
-
throw std::runtime_error("error: built without CURL, cannot download model from the internet");
|
|
741
|
-
}
|
|
742
|
-
|
|
743
|
-
return {};
|
|
744
|
-
}
|
|
745
|
-
|
|
746
|
-
#endif // LLAMA_USE_CURL
|
|
747
|
-
|
|
748
170
|
//
|
|
749
171
|
// utils
|
|
750
172
|
//
|
|
@@ -795,7 +217,9 @@ static handle_model_result common_params_handle_model(
|
|
|
795
217
|
handle_model_result result;
|
|
796
218
|
// handle pre-fill default model path and url based on hf_repo and hf_file
|
|
797
219
|
{
|
|
798
|
-
if (!model.
|
|
220
|
+
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
|
|
221
|
+
model.path = common_docker_resolve_model(model.docker_repo);
|
|
222
|
+
} else if (!model.hf_repo.empty()) {
|
|
799
223
|
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
800
224
|
if (model.hf_file.empty()) {
|
|
801
225
|
if (model.path.empty()) {
|
|
@@ -884,8 +308,6 @@ static std::string get_all_kv_cache_types() {
|
|
|
884
308
|
//
|
|
885
309
|
|
|
886
310
|
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
|
887
|
-
std::string arg;
|
|
888
|
-
const std::string arg_prefix = "--";
|
|
889
311
|
common_params & params = ctx_arg.params;
|
|
890
312
|
|
|
891
313
|
std::unordered_map<std::string, common_arg *> arg_to_options;
|
|
@@ -1184,7 +606,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
|
|
|
1184
606
|
} else {
|
|
1185
607
|
for (const auto & device : dev_names) {
|
|
1186
608
|
auto * dev = ggml_backend_dev_by_name(device.c_str());
|
|
1187
|
-
if (!dev || ggml_backend_dev_type(dev)
|
|
609
|
+
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
|
1188
610
|
throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
|
|
1189
611
|
}
|
|
1190
612
|
devices.push_back(dev);
|
|
@@ -1194,7 +616,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
|
|
|
1194
616
|
return devices;
|
|
1195
617
|
}
|
|
1196
618
|
|
|
1197
|
-
static void add_rpc_devices(std::string servers) {
|
|
619
|
+
static void add_rpc_devices(const std::string & servers) {
|
|
1198
620
|
auto rpc_servers = string_split<std::string>(servers, ',');
|
|
1199
621
|
if (rpc_servers.empty()) {
|
|
1200
622
|
throw std::invalid_argument("no RPC servers specified");
|
|
@@ -1203,18 +625,14 @@ static void add_rpc_devices(std::string servers) {
|
|
|
1203
625
|
if (!rpc_reg) {
|
|
1204
626
|
throw std::invalid_argument("failed to find RPC backend");
|
|
1205
627
|
}
|
|
1206
|
-
typedef
|
|
1207
|
-
|
|
1208
|
-
if (!
|
|
1209
|
-
throw std::invalid_argument("failed to find RPC
|
|
628
|
+
typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
|
|
629
|
+
ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
|
|
630
|
+
if (!ggml_backend_rpc_add_server_fn) {
|
|
631
|
+
throw std::invalid_argument("failed to find RPC add server function");
|
|
1210
632
|
}
|
|
1211
633
|
for (const auto & server : rpc_servers) {
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
ggml_backend_device_register(dev);
|
|
1215
|
-
} else {
|
|
1216
|
-
throw std::invalid_argument("failed to register RPC device");
|
|
1217
|
-
}
|
|
634
|
+
auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
|
|
635
|
+
ggml_backend_register(reg);
|
|
1218
636
|
}
|
|
1219
637
|
}
|
|
1220
638
|
|
|
@@ -1263,6 +681,18 @@ static std::string list_builtin_chat_templates() {
|
|
|
1263
681
|
return msg.str();
|
|
1264
682
|
}
|
|
1265
683
|
|
|
684
|
+
static bool is_truthy(const std::string & value) {
|
|
685
|
+
return value == "on" || value == "enabled" || value == "1";
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
static bool is_falsey(const std::string & value) {
|
|
689
|
+
return value == "off" || value == "disabled" || value == "0";
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
static bool is_autoy(const std::string & value) {
|
|
693
|
+
return value == "auto" || value == "-1";
|
|
694
|
+
}
|
|
695
|
+
|
|
1266
696
|
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
1267
697
|
// load dynamic backends
|
|
1268
698
|
ggml_backend_load_all();
|
|
@@ -1310,6 +740,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1310
740
|
exit(0);
|
|
1311
741
|
}
|
|
1312
742
|
));
|
|
743
|
+
add_opt(common_arg(
|
|
744
|
+
{"-cl", "--cache-list"},
|
|
745
|
+
"show list of models in cache",
|
|
746
|
+
[](common_params &) {
|
|
747
|
+
printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
|
|
748
|
+
auto models = common_list_cached_models();
|
|
749
|
+
printf("number of models in cache: %zu\n", models.size());
|
|
750
|
+
for (size_t i = 0; i < models.size(); i++) {
|
|
751
|
+
auto & model = models[i];
|
|
752
|
+
printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
|
|
753
|
+
}
|
|
754
|
+
exit(0);
|
|
755
|
+
}
|
|
756
|
+
));
|
|
1313
757
|
add_opt(common_arg(
|
|
1314
758
|
{"--completion-bash"},
|
|
1315
759
|
"print source-able bash completion script for llama.cpp",
|
|
@@ -1340,7 +784,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1340
784
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
1341
785
|
add_opt(common_arg(
|
|
1342
786
|
{"-t", "--threads"}, "N",
|
|
1343
|
-
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
|
787
|
+
string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
|
1344
788
|
[](common_params & params, int value) {
|
|
1345
789
|
params.cpuparams.n_threads = value;
|
|
1346
790
|
if (params.cpuparams.n_threads <= 0) {
|
|
@@ -1508,13 +952,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1508
952
|
}
|
|
1509
953
|
).set_env("LLAMA_ARG_SWA_FULL"));
|
|
1510
954
|
add_opt(common_arg(
|
|
1511
|
-
{"--swa-checkpoints"}, "N",
|
|
1512
|
-
string_format("max number of
|
|
1513
|
-
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.
|
|
955
|
+
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
|
|
956
|
+
string_format("max number of context checkpoints to create per slot (default: %d)\n"
|
|
957
|
+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
|
|
958
|
+
[](common_params & params, int value) {
|
|
959
|
+
params.n_ctx_checkpoints = value;
|
|
960
|
+
}
|
|
961
|
+
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
962
|
+
add_opt(common_arg(
|
|
963
|
+
{"--cache-ram", "-cram"}, "N",
|
|
964
|
+
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
|
|
965
|
+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
|
|
1514
966
|
[](common_params & params, int value) {
|
|
1515
|
-
params.
|
|
967
|
+
params.cache_ram_mib = value;
|
|
1516
968
|
}
|
|
1517
|
-
).set_env("
|
|
969
|
+
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
1518
970
|
add_opt(common_arg(
|
|
1519
971
|
{"--kv-unified", "-kvu"},
|
|
1520
972
|
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
|
@@ -1544,13 +996,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1544
996
|
params.n_chunks = value;
|
|
1545
997
|
}
|
|
1546
998
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1547
|
-
add_opt(common_arg(
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
999
|
+
add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
|
|
1000
|
+
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
|
|
1001
|
+
llama_flash_attn_type_name(params.flash_attn_type)),
|
|
1002
|
+
[](common_params & params, const std::string & value) {
|
|
1003
|
+
if (is_truthy(value)) {
|
|
1004
|
+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
|
1005
|
+
} else if (is_falsey(value)) {
|
|
1006
|
+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
|
1007
|
+
} else if (is_autoy(value)) {
|
|
1008
|
+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
|
|
1009
|
+
} else {
|
|
1010
|
+
throw std::runtime_error(
|
|
1011
|
+
string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
|
|
1012
|
+
}
|
|
1013
|
+
}).set_env("LLAMA_ARG_FLASH_ATTN"));
|
|
1554
1014
|
add_opt(common_arg(
|
|
1555
1015
|
{"-p", "--prompt"}, "PROMPT",
|
|
1556
1016
|
"prompt to start generation with; for system message, use -sys",
|
|
@@ -1564,7 +1024,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1564
1024
|
[](common_params & params, const std::string & value) {
|
|
1565
1025
|
params.system_prompt = value;
|
|
1566
1026
|
}
|
|
1567
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1027
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
|
|
1568
1028
|
add_opt(common_arg(
|
|
1569
1029
|
{"--no-perf"},
|
|
1570
1030
|
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
|
@@ -1594,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1594
1054
|
params.system_prompt.pop_back();
|
|
1595
1055
|
}
|
|
1596
1056
|
}
|
|
1597
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1057
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
|
|
1598
1058
|
add_opt(common_arg(
|
|
1599
1059
|
{"--in-file"}, "FNAME",
|
|
1600
1060
|
"an input file (repeat to specify multiple files)",
|
|
@@ -2156,6 +1616,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2156
1616
|
params.no_extra_bufts = true;
|
|
2157
1617
|
}
|
|
2158
1618
|
).set_env("LLAMA_ARG_NO_REPACK"));
|
|
1619
|
+
add_opt(common_arg(
|
|
1620
|
+
{"--no-host"},
|
|
1621
|
+
"bypass host buffer allowing extra buffers to be used",
|
|
1622
|
+
[](common_params & params) {
|
|
1623
|
+
params.no_host = true;
|
|
1624
|
+
}
|
|
1625
|
+
).set_env("LLAMA_ARG_NO_HOST"));
|
|
2159
1626
|
add_opt(common_arg(
|
|
2160
1627
|
{"-ctk", "--cache-type-k"}, "TYPE",
|
|
2161
1628
|
string_format(
|
|
@@ -2325,6 +1792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2325
1792
|
params.image.emplace_back(value);
|
|
2326
1793
|
}
|
|
2327
1794
|
).set_examples({LLAMA_EXAMPLE_MTMD}));
|
|
1795
|
+
add_opt(common_arg(
|
|
1796
|
+
{"--image-min-tokens"}, "N",
|
|
1797
|
+
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
|
|
1798
|
+
[](common_params & params, int value) {
|
|
1799
|
+
params.image_min_tokens = value;
|
|
1800
|
+
}
|
|
1801
|
+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
|
|
1802
|
+
add_opt(common_arg(
|
|
1803
|
+
{"--image-max-tokens"}, "N",
|
|
1804
|
+
"maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
|
|
1805
|
+
[](common_params & params, int value) {
|
|
1806
|
+
params.image_max_tokens = value;
|
|
1807
|
+
}
|
|
1808
|
+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
|
|
2328
1809
|
if (llama_supports_rpc()) {
|
|
2329
1810
|
add_opt(common_arg(
|
|
2330
1811
|
{"--rpc"}, "SERVERS",
|
|
@@ -2376,24 +1857,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2376
1857
|
{"--list-devices"},
|
|
2377
1858
|
"print list of available devices and exit",
|
|
2378
1859
|
[](common_params &) {
|
|
2379
|
-
std::vector<ggml_backend_dev_t>
|
|
2380
|
-
std::vector<ggml_backend_dev_t> all_devices;
|
|
1860
|
+
std::vector<ggml_backend_dev_t> devices;
|
|
2381
1861
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
2382
1862
|
auto * dev = ggml_backend_dev_get(i);
|
|
2383
|
-
if (ggml_backend_dev_type(dev)
|
|
2384
|
-
|
|
2385
|
-
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
|
2386
|
-
rpc_devices.push_back(dev);
|
|
2387
|
-
} else {
|
|
2388
|
-
all_devices.push_back(dev);
|
|
2389
|
-
}
|
|
1863
|
+
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
|
1864
|
+
devices.push_back(dev);
|
|
2390
1865
|
}
|
|
2391
1866
|
}
|
|
2392
|
-
// insert RPC devices in front
|
|
2393
|
-
all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
|
|
2394
1867
|
printf("Available devices:\n");
|
|
2395
|
-
for (
|
|
2396
|
-
auto * dev = all_devices[i];
|
|
1868
|
+
for (auto * dev : devices) {
|
|
2397
1869
|
size_t free, total;
|
|
2398
1870
|
ggml_backend_dev_memory(dev, &free, &total);
|
|
2399
1871
|
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
|
@@ -2417,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2417
1889
|
{"--cpu-moe", "-cmoe"},
|
|
2418
1890
|
"keep all Mixture of Experts (MoE) weights in the CPU",
|
|
2419
1891
|
[](common_params & params) {
|
|
2420
|
-
params.tensor_buft_overrides.push_back(
|
|
1892
|
+
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
|
|
2421
1893
|
}
|
|
2422
1894
|
).set_env("LLAMA_ARG_CPU_MOE"));
|
|
2423
1895
|
add_opt(common_arg(
|
|
@@ -2430,7 +1902,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2430
1902
|
for (int i = 0; i < value; ++i) {
|
|
2431
1903
|
// keep strings alive and avoid leaking memory by storing them in a static vector
|
|
2432
1904
|
static std::list<std::string> buft_overrides;
|
|
2433
|
-
buft_overrides.push_back(
|
|
1905
|
+
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
|
|
2434
1906
|
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
|
|
2435
1907
|
}
|
|
2436
1908
|
}
|
|
@@ -2439,7 +1911,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2439
1911
|
{"--cpu-moe-draft", "-cmoed"},
|
|
2440
1912
|
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
|
|
2441
1913
|
[](common_params & params) {
|
|
2442
|
-
params.speculative.tensor_buft_overrides.push_back(
|
|
1914
|
+
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
|
|
2443
1915
|
}
|
|
2444
1916
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
|
2445
1917
|
add_opt(common_arg(
|
|
@@ -2451,14 +1923,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2451
1923
|
}
|
|
2452
1924
|
for (int i = 0; i < value; ++i) {
|
|
2453
1925
|
static std::list<std::string> buft_overrides_draft;
|
|
2454
|
-
buft_overrides_draft.push_back(
|
|
1926
|
+
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
|
|
2455
1927
|
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
|
|
2456
1928
|
}
|
|
2457
1929
|
}
|
|
2458
1930
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
|
2459
1931
|
add_opt(common_arg(
|
|
2460
1932
|
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
|
2461
|
-
"number of layers to store in VRAM",
|
|
1933
|
+
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
|
|
2462
1934
|
[](common_params & params, int value) {
|
|
2463
1935
|
params.n_gpu_layers = value;
|
|
2464
1936
|
if (!llama_supports_gpu_offload()) {
|
|
@@ -2616,6 +2088,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2616
2088
|
params.model.url = value;
|
|
2617
2089
|
}
|
|
2618
2090
|
).set_env("LLAMA_ARG_MODEL_URL"));
|
|
2091
|
+
add_opt(common_arg(
|
|
2092
|
+
{ "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
|
|
2093
|
+
"Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
|
|
2094
|
+
"example: gemma3\n"
|
|
2095
|
+
"(default: unused)",
|
|
2096
|
+
[](common_params & params, const std::string & value) {
|
|
2097
|
+
params.model.docker_repo = value;
|
|
2098
|
+
}
|
|
2099
|
+
).set_env("LLAMA_ARG_DOCKER_REPO"));
|
|
2619
2100
|
add_opt(common_arg(
|
|
2620
2101
|
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
|
|
2621
2102
|
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
|
|
@@ -2760,7 +2241,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2760
2241
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2761
2242
|
add_opt(common_arg(
|
|
2762
2243
|
{"--parse-special"},
|
|
2763
|
-
string_format("
|
|
2244
|
+
string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
|
2764
2245
|
[](common_params & params) {
|
|
2765
2246
|
params.parse_special = true;
|
|
2766
2247
|
}
|
|
@@ -2772,6 +2253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2772
2253
|
params.is_pp_shared = true;
|
|
2773
2254
|
}
|
|
2774
2255
|
).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
|
2256
|
+
add_opt(common_arg(
|
|
2257
|
+
{"-tgs"},
|
|
2258
|
+
string_format("is the text generation separated across the different sequences (default: %s)", params.is_tg_separate ? "true" : "false"),
|
|
2259
|
+
[](common_params & params) {
|
|
2260
|
+
params.is_tg_separate = true;
|
|
2261
|
+
}
|
|
2262
|
+
).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
|
2775
2263
|
add_opt(common_arg(
|
|
2776
2264
|
{"-npp"}, "n0,n1,...",
|
|
2777
2265
|
"number of prompt tokens",
|
|
@@ -2805,7 +2293,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2805
2293
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2806
2294
|
add_opt(common_arg(
|
|
2807
2295
|
{"--embd-output-format"}, "FORMAT",
|
|
2808
|
-
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
|
|
2296
|
+
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
|
|
2809
2297
|
[](common_params & params, const std::string & value) {
|
|
2810
2298
|
params.embd_out = value;
|
|
2811
2299
|
}
|
|
@@ -2915,7 +2403,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2915
2403
|
add_opt(common_arg(
|
|
2916
2404
|
{"--chat-template-kwargs"}, "STRING",
|
|
2917
2405
|
string_format("sets additional params for the json template parser"),
|
|
2918
|
-
[](common_params & params, const std::string &
|
|
2406
|
+
[](common_params & params, const std::string & value) {
|
|
2919
2407
|
auto parsed = json::parse(value);
|
|
2920
2408
|
for (const auto & item : parsed.items()) {
|
|
2921
2409
|
params.default_template_kwargs[item.key()] = item.value().dump();
|
|
@@ -2954,13 +2442,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2954
2442
|
params.endpoint_metrics = true;
|
|
2955
2443
|
}
|
|
2956
2444
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
|
2957
|
-
add_opt(common_arg(
|
|
2958
|
-
{"--slots"},
|
|
2959
|
-
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
|
2960
|
-
[](common_params & params) {
|
|
2961
|
-
params.endpoint_slots = true;
|
|
2962
|
-
}
|
|
2963
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
|
2964
2445
|
add_opt(common_arg(
|
|
2965
2446
|
{"--props"},
|
|
2966
2447
|
string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
|
|
@@ -2968,6 +2449,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2968
2449
|
params.endpoint_props = true;
|
|
2969
2450
|
}
|
|
2970
2451
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
|
|
2452
|
+
add_opt(common_arg(
|
|
2453
|
+
{"--slots"},
|
|
2454
|
+
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
|
2455
|
+
[](common_params & params) {
|
|
2456
|
+
params.endpoint_slots = true;
|
|
2457
|
+
}
|
|
2458
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
|
2971
2459
|
add_opt(common_arg(
|
|
2972
2460
|
{"--no-slots"},
|
|
2973
2461
|
"disables slots monitoring endpoint",
|
|
@@ -2992,12 +2480,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2992
2480
|
[](common_params & params) {
|
|
2993
2481
|
params.use_jinja = true;
|
|
2994
2482
|
}
|
|
2995
|
-
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
|
|
2483
|
+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
|
|
2996
2484
|
add_opt(common_arg(
|
|
2997
2485
|
{"--reasoning-format"}, "FORMAT",
|
|
2998
2486
|
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
|
|
2999
2487
|
"- none: leaves thoughts unparsed in `message.content`\n"
|
|
3000
|
-
"- deepseek: puts thoughts in `message.reasoning_content
|
|
2488
|
+
"- deepseek: puts thoughts in `message.reasoning_content`\n"
|
|
2489
|
+
"- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
|
|
3001
2490
|
"(default: auto)",
|
|
3002
2491
|
[](common_params & params, const std::string & value) {
|
|
3003
2492
|
params.reasoning_format = common_reasoning_format_from_name(value);
|
|
@@ -3127,10 +2616,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3127
2616
|
}
|
|
3128
2617
|
));
|
|
3129
2618
|
add_opt(common_arg(
|
|
3130
|
-
{"--log-colors"},
|
|
3131
|
-
"
|
|
3132
|
-
|
|
3133
|
-
|
|
2619
|
+
{"--log-colors"}, "[on|off|auto]",
|
|
2620
|
+
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
|
|
2621
|
+
"'auto' enables colors when output is to a terminal",
|
|
2622
|
+
[](common_params &, const std::string & value) {
|
|
2623
|
+
if (is_truthy(value)) {
|
|
2624
|
+
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
|
|
2625
|
+
} else if (is_falsey(value)) {
|
|
2626
|
+
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
|
|
2627
|
+
} else if (is_autoy(value)) {
|
|
2628
|
+
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
|
|
2629
|
+
} else {
|
|
2630
|
+
throw std::invalid_argument(
|
|
2631
|
+
string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
|
|
2632
|
+
}
|
|
3134
2633
|
}
|
|
3135
2634
|
).set_env("LLAMA_LOG_COLORS"));
|
|
3136
2635
|
add_opt(common_arg(
|
|
@@ -3398,7 +2897,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3398
2897
|
}
|
|
3399
2898
|
).set_examples({LLAMA_EXAMPLE_TTS}));
|
|
3400
2899
|
|
|
3401
|
-
|
|
2900
|
+
add_opt(common_arg(
|
|
2901
|
+
{"--diffusion-steps"}, "N",
|
|
2902
|
+
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
|
|
2903
|
+
[](common_params & params, int value) { params.diffusion.steps = value; }
|
|
2904
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2905
|
+
add_opt(common_arg(
|
|
2906
|
+
{"--diffusion-visual"},
|
|
2907
|
+
string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
|
|
2908
|
+
[](common_params & params) { params.diffusion.visual_mode = true; }
|
|
2909
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2910
|
+
add_opt(common_arg(
|
|
2911
|
+
{"--diffusion-eps"}, "F",
|
|
2912
|
+
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
|
|
2913
|
+
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
|
|
2914
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2915
|
+
add_opt(common_arg(
|
|
2916
|
+
{"--diffusion-algorithm"}, "N",
|
|
2917
|
+
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
|
|
2918
|
+
[](common_params & params, int value) { params.diffusion.algorithm = value; }
|
|
2919
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2920
|
+
add_opt(common_arg(
|
|
2921
|
+
{"--diffusion-alg-temp"}, "F",
|
|
2922
|
+
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
|
|
2923
|
+
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
|
|
2924
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2925
|
+
add_opt(common_arg(
|
|
2926
|
+
{"--diffusion-block-length"}, "N",
|
|
2927
|
+
string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
|
|
2928
|
+
[](common_params & params, int value) { params.diffusion.block_length = value; }
|
|
2929
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2930
|
+
add_opt(common_arg(
|
|
2931
|
+
{"--diffusion-cfg-scale"}, "F",
|
|
2932
|
+
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
|
|
2933
|
+
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
|
|
2934
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2935
|
+
add_opt(common_arg(
|
|
2936
|
+
{"--diffusion-add-gumbel-noise"}, "F",
|
|
2937
|
+
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
|
|
2938
|
+
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
|
|
2939
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
2940
|
+
add_opt(common_arg(
|
|
2941
|
+
{ "-lr", "--learning-rate" }, "ALPHA",
|
|
2942
|
+
string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
|
|
2943
|
+
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
|
|
2944
|
+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
2945
|
+
add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
|
|
2946
|
+
string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
|
|
2947
|
+
(double) params.lr.lr_min),
|
|
2948
|
+
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
|
|
2949
|
+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
2950
|
+
add_opt(common_arg(
|
|
2951
|
+
{"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
|
|
2952
|
+
string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
|
|
2953
|
+
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
|
|
2954
|
+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
2955
|
+
add_opt(common_arg(
|
|
2956
|
+
{"-wd", "--weight-decay"}, "WD",
|
|
2957
|
+
string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
|
|
2958
|
+
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
|
|
2959
|
+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
2960
|
+
add_opt(common_arg(
|
|
2961
|
+
{"-val-split", "--val-split"}, "FRACTION",
|
|
2962
|
+
string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
|
|
2963
|
+
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
|
|
2964
|
+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
2965
|
+
add_opt(common_arg(
|
|
2966
|
+
{"-epochs", "--epochs"}, "N",
|
|
2967
|
+
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
|
|
2968
|
+
[](common_params & params, int epochs) { params.lr.epochs = epochs; }
|
|
2969
|
+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
2970
|
+
add_opt(common_arg(
|
|
2971
|
+
{"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
|
|
2972
|
+
[](common_params & params, const std::string & name) {
|
|
2973
|
+
params.optimizer = common_opt_get_optimizer(name.c_str());
|
|
2974
|
+
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
|
|
2975
|
+
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
|
|
2976
|
+
}
|
|
2977
|
+
}
|
|
2978
|
+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
2979
|
+
|
|
2980
|
+
// presets
|
|
3402
2981
|
add_opt(common_arg(
|
|
3403
2982
|
{"--tts-oute-default"},
|
|
3404
2983
|
string_format("use default OuteTTS models (note: can download weights from the internet)"),
|
|
@@ -3411,42 +2990,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3411
2990
|
).set_examples({LLAMA_EXAMPLE_TTS}));
|
|
3412
2991
|
|
|
3413
2992
|
add_opt(common_arg(
|
|
3414
|
-
{"--embd-
|
|
3415
|
-
string_format("use default
|
|
3416
|
-
[](common_params & params) {
|
|
3417
|
-
params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
|
|
3418
|
-
params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
|
|
3419
|
-
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
3420
|
-
params.embd_normalize = 2;
|
|
3421
|
-
params.n_ctx = 512;
|
|
3422
|
-
params.verbose_prompt = true;
|
|
3423
|
-
params.embedding = true;
|
|
3424
|
-
}
|
|
3425
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
|
|
3426
|
-
|
|
3427
|
-
add_opt(common_arg(
|
|
3428
|
-
{"--embd-e5-small-en-default"},
|
|
3429
|
-
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
|
|
3430
|
-
[](common_params & params) {
|
|
3431
|
-
params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
|
|
3432
|
-
params.model.hf_file = "e5-small-v2-q8_0.gguf";
|
|
3433
|
-
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
3434
|
-
params.embd_normalize = 2;
|
|
3435
|
-
params.n_ctx = 512;
|
|
3436
|
-
params.verbose_prompt = true;
|
|
3437
|
-
params.embedding = true;
|
|
3438
|
-
}
|
|
3439
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
|
|
3440
|
-
|
|
3441
|
-
add_opt(common_arg(
|
|
3442
|
-
{"--embd-gte-small-default"},
|
|
3443
|
-
string_format("use default gte-small model (note: can download weights from the internet)"),
|
|
2993
|
+
{"--embd-gemma-default"},
|
|
2994
|
+
string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
|
|
3444
2995
|
[](common_params & params) {
|
|
3445
|
-
params.model.hf_repo = "ggml-org/
|
|
3446
|
-
params.model.hf_file = "
|
|
3447
|
-
params.
|
|
3448
|
-
params.
|
|
3449
|
-
params.
|
|
2996
|
+
params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
|
|
2997
|
+
params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
|
|
2998
|
+
params.port = 8011;
|
|
2999
|
+
params.n_ubatch = 2048;
|
|
3000
|
+
params.n_batch = 2048;
|
|
3001
|
+
params.n_parallel = 32;
|
|
3002
|
+
params.n_ctx = 2048*params.n_parallel;
|
|
3450
3003
|
params.verbose_prompt = true;
|
|
3451
3004
|
params.embedding = true;
|
|
3452
3005
|
}
|
|
@@ -3459,8 +3012,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3459
3012
|
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
|
|
3460
3013
|
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
|
|
3461
3014
|
params.port = 8012;
|
|
3462
|
-
params.n_gpu_layers = 99;
|
|
3463
|
-
params.flash_attn = true;
|
|
3464
3015
|
params.n_ubatch = 1024;
|
|
3465
3016
|
params.n_batch = 1024;
|
|
3466
3017
|
params.n_ctx = 0;
|
|
@@ -3475,8 +3026,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3475
3026
|
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
|
|
3476
3027
|
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
|
|
3477
3028
|
params.port = 8012;
|
|
3478
|
-
params.n_gpu_layers = 99;
|
|
3479
|
-
params.flash_attn = true;
|
|
3480
3029
|
params.n_ubatch = 1024;
|
|
3481
3030
|
params.n_batch = 1024;
|
|
3482
3031
|
params.n_ctx = 0;
|
|
@@ -3491,8 +3040,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3491
3040
|
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
|
3492
3041
|
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
|
3493
3042
|
params.port = 8012;
|
|
3494
|
-
params.n_gpu_layers = 99;
|
|
3495
|
-
params.flash_attn = true;
|
|
3496
3043
|
params.n_ubatch = 1024;
|
|
3497
3044
|
params.n_batch = 1024;
|
|
3498
3045
|
params.n_ctx = 0;
|
|
@@ -3508,10 +3055,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3508
3055
|
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
|
3509
3056
|
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
|
3510
3057
|
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
|
3511
|
-
params.speculative.n_gpu_layers = 99;
|
|
3512
3058
|
params.port = 8012;
|
|
3513
|
-
params.n_gpu_layers = 99;
|
|
3514
|
-
params.flash_attn = true;
|
|
3515
3059
|
params.n_ubatch = 1024;
|
|
3516
3060
|
params.n_batch = 1024;
|
|
3517
3061
|
params.n_ctx = 0;
|
|
@@ -3527,10 +3071,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3527
3071
|
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
|
3528
3072
|
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
|
3529
3073
|
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
|
3530
|
-
params.speculative.n_gpu_layers = 99;
|
|
3531
3074
|
params.port = 8012;
|
|
3532
|
-
params.n_gpu_layers = 99;
|
|
3533
|
-
params.flash_attn = true;
|
|
3534
3075
|
params.n_ubatch = 1024;
|
|
3535
3076
|
params.n_batch = 1024;
|
|
3536
3077
|
params.n_ctx = 0;
|
|
@@ -3545,8 +3086,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3545
3086
|
params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
|
|
3546
3087
|
params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
|
|
3547
3088
|
params.port = 8012;
|
|
3548
|
-
params.n_gpu_layers = 99;
|
|
3549
|
-
params.flash_attn = true;
|
|
3550
3089
|
params.n_ubatch = 1024;
|
|
3551
3090
|
params.n_batch = 1024;
|
|
3552
3091
|
params.n_ctx = 0;
|
|
@@ -3555,96 +3094,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3555
3094
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
3556
3095
|
|
|
3557
3096
|
add_opt(common_arg(
|
|
3558
|
-
{
|
|
3559
|
-
string_format("
|
|
3560
|
-
[](common_params & params
|
|
3561
|
-
|
|
3562
|
-
|
|
3563
|
-
|
|
3564
|
-
|
|
3565
|
-
|
|
3566
|
-
|
|
3567
|
-
|
|
3097
|
+
{"--gpt-oss-20b-default"},
|
|
3098
|
+
string_format("use gpt-oss-20b (note: can download weights from the internet)"),
|
|
3099
|
+
[](common_params & params) {
|
|
3100
|
+
params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
|
|
3101
|
+
params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
|
|
3102
|
+
params.port = 8013;
|
|
3103
|
+
params.n_ubatch = 2048;
|
|
3104
|
+
params.n_batch = 32768;
|
|
3105
|
+
params.n_parallel = 2;
|
|
3106
|
+
params.n_ctx = 131072*params.n_parallel;
|
|
3107
|
+
params.sampling.temp = 1.0f;
|
|
3108
|
+
params.sampling.top_p = 1.0f;
|
|
3109
|
+
params.sampling.top_k = 0;
|
|
3110
|
+
params.sampling.min_p = 0.01f;
|
|
3111
|
+
params.use_jinja = true;
|
|
3112
|
+
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
|
|
3113
|
+
}
|
|
3114
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
3568
3115
|
|
|
3569
3116
|
add_opt(common_arg(
|
|
3570
|
-
{
|
|
3571
|
-
string_format("
|
|
3572
|
-
[](common_params & params
|
|
3573
|
-
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
|
|
3581
|
-
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3117
|
+
{"--gpt-oss-120b-default"},
|
|
3118
|
+
string_format("use gpt-oss-120b (note: can download weights from the internet)"),
|
|
3119
|
+
[](common_params & params) {
|
|
3120
|
+
params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
|
|
3121
|
+
params.port = 8013;
|
|
3122
|
+
params.n_ubatch = 2048;
|
|
3123
|
+
params.n_batch = 32768;
|
|
3124
|
+
params.n_parallel = 2;
|
|
3125
|
+
params.n_ctx = 131072*params.n_parallel;
|
|
3126
|
+
params.sampling.temp = 1.0f;
|
|
3127
|
+
params.sampling.top_p = 1.0f;
|
|
3128
|
+
params.sampling.top_k = 0;
|
|
3129
|
+
params.sampling.min_p = 0.01f;
|
|
3130
|
+
params.use_jinja = true;
|
|
3131
|
+
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
|
|
3132
|
+
}
|
|
3133
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
3585
3134
|
|
|
3586
3135
|
add_opt(common_arg(
|
|
3587
|
-
{
|
|
3588
|
-
string_format("
|
|
3589
|
-
[](common_params & params
|
|
3590
|
-
|
|
3591
|
-
|
|
3592
|
-
|
|
3593
|
-
|
|
3594
|
-
|
|
3595
|
-
).set_examples({
|
|
3596
|
-
add_opt(common_arg(
|
|
3597
|
-
{ "--diffusion-add-gumbel-noise" }, "F",
|
|
3598
|
-
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
|
|
3599
|
-
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
|
|
3600
|
-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3601
|
-
|
|
3136
|
+
{"--vision-gemma-4b-default"},
|
|
3137
|
+
string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
|
|
3138
|
+
[](common_params & params) {
|
|
3139
|
+
params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
|
|
3140
|
+
params.port = 8014;
|
|
3141
|
+
params.n_ctx = 0;
|
|
3142
|
+
params.use_jinja = true;
|
|
3143
|
+
}
|
|
3144
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
3602
3145
|
|
|
3603
|
-
add_opt(
|
|
3604
|
-
|
|
3605
|
-
|
|
3606
|
-
|
|
3607
|
-
|
|
3608
|
-
|
|
3609
|
-
.
|
|
3610
|
-
|
|
3611
|
-
|
|
3612
|
-
|
|
3613
|
-
"(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
|
|
3614
|
-
(double) params.lr.lr_min),
|
|
3615
|
-
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
|
|
3616
|
-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
3617
|
-
add_opt(
|
|
3618
|
-
common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
|
|
3619
|
-
string_format(
|
|
3620
|
-
"(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
|
|
3621
|
-
(double) params.lr.decay_epochs),
|
|
3622
|
-
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
|
|
3623
|
-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
3624
|
-
add_opt(common_arg(
|
|
3625
|
-
{ "-wd", "--weight-decay" }, "WD",
|
|
3626
|
-
string_format(
|
|
3627
|
-
"adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
|
|
3628
|
-
(double) params.lr.wd),
|
|
3629
|
-
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
|
|
3630
|
-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
3631
|
-
add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
|
|
3632
|
-
string_format("fraction of data to use as validation set for training (default: %.2g).",
|
|
3633
|
-
(double) params.val_split),
|
|
3634
|
-
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
|
|
3635
|
-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
3636
|
-
add_opt(common_arg({ "-epochs", "--epochs" }, "N",
|
|
3637
|
-
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
|
|
3638
|
-
[](common_params & params, int epochs) { params.lr.epochs = epochs; })
|
|
3639
|
-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
3640
|
-
add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
|
|
3641
|
-
[](common_params & params, const std::string & name) {
|
|
3642
|
-
params.optimizer = common_opt_get_optimizer(name.c_str());
|
|
3643
|
-
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
|
|
3644
|
-
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
|
|
3645
|
-
}
|
|
3646
|
-
})
|
|
3647
|
-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
3146
|
+
add_opt(common_arg(
|
|
3147
|
+
{"--vision-gemma-12b-default"},
|
|
3148
|
+
string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
|
|
3149
|
+
[](common_params & params) {
|
|
3150
|
+
params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
|
|
3151
|
+
params.port = 8014;
|
|
3152
|
+
params.n_ctx = 0;
|
|
3153
|
+
params.use_jinja = true;
|
|
3154
|
+
}
|
|
3155
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
3648
3156
|
|
|
3649
3157
|
return ctx_arg;
|
|
3650
3158
|
}
|