@novastera-oss/llamarn 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +4 -2
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
using json = nlohmann::ordered_json;
|
|
40
40
|
|
|
41
41
|
std::initializer_list<enum llama_example> mmproj_examples = {
|
|
42
|
-
|
|
42
|
+
LLAMA_EXAMPLE_MTMD,
|
|
43
43
|
LLAMA_EXAMPLE_SERVER,
|
|
44
44
|
};
|
|
45
45
|
|
|
@@ -242,33 +242,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
|
|
|
242
242
|
}
|
|
243
243
|
|
|
244
244
|
// download one single file from remote URL to local path
|
|
245
|
-
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
|
|
246
|
-
// Initialize libcurl
|
|
247
|
-
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
|
248
|
-
curl_slist_ptr http_headers;
|
|
249
|
-
if (!curl) {
|
|
250
|
-
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
|
251
|
-
return false;
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
// Set the URL, allow to follow http redirection
|
|
255
|
-
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
|
256
|
-
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
|
257
|
-
|
|
258
|
-
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
|
259
|
-
// Check if hf-token or bearer-token was specified
|
|
260
|
-
if (!bearer_token.empty()) {
|
|
261
|
-
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
|
262
|
-
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
|
263
|
-
}
|
|
264
|
-
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
|
265
|
-
|
|
266
|
-
#if defined(_WIN32)
|
|
267
|
-
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
|
268
|
-
// operating system. Currently implemented under MS-Windows.
|
|
269
|
-
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
|
270
|
-
#endif
|
|
271
|
-
|
|
245
|
+
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
|
|
272
246
|
// Check if the file already exists locally
|
|
273
247
|
auto file_exists = std::filesystem::exists(path);
|
|
274
248
|
|
|
@@ -279,6 +253,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
|
|
279
253
|
std::string last_modified;
|
|
280
254
|
|
|
281
255
|
if (file_exists) {
|
|
256
|
+
if (offline) {
|
|
257
|
+
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
|
|
258
|
+
return true; // skip verification/downloading
|
|
259
|
+
}
|
|
282
260
|
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
|
283
261
|
std::ifstream metadata_in(metadata_path);
|
|
284
262
|
if (metadata_in.good()) {
|
|
@@ -297,6 +275,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
|
|
297
275
|
}
|
|
298
276
|
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
|
299
277
|
} else {
|
|
278
|
+
if (offline) {
|
|
279
|
+
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
|
|
280
|
+
return false;
|
|
281
|
+
}
|
|
300
282
|
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
|
301
283
|
}
|
|
302
284
|
|
|
@@ -310,50 +292,73 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
|
|
310
292
|
bool head_request_ok = false;
|
|
311
293
|
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
|
312
294
|
|
|
313
|
-
//
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
295
|
+
// Initialize libcurl
|
|
296
|
+
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
|
297
|
+
curl_slist_ptr http_headers;
|
|
298
|
+
if (!curl) {
|
|
299
|
+
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
|
300
|
+
return false;
|
|
301
|
+
}
|
|
318
302
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
303
|
+
// Set the URL, allow to follow http redirection
|
|
304
|
+
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
|
305
|
+
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
|
322
306
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
307
|
+
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
|
308
|
+
// Check if hf-token or bearer-token was specified
|
|
309
|
+
if (!bearer_token.empty()) {
|
|
310
|
+
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
|
311
|
+
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
|
312
|
+
}
|
|
313
|
+
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
|
314
|
+
|
|
315
|
+
#if defined(_WIN32)
|
|
316
|
+
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
|
317
|
+
// operating system. Currently implemented under MS-Windows.
|
|
318
|
+
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
|
319
|
+
#endif
|
|
336
320
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
|
321
|
+
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
|
322
|
+
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
|
323
|
+
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
|
341
324
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
if (!was_perform_successful) {
|
|
346
|
-
head_request_ok = false;
|
|
347
|
-
}
|
|
325
|
+
static std::regex header_regex("([^:]+): (.*)\r\n");
|
|
326
|
+
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
|
327
|
+
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
|
348
328
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
if (
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
329
|
+
std::string header(buffer, n_items);
|
|
330
|
+
std::smatch match;
|
|
331
|
+
if (std::regex_match(header, match, header_regex)) {
|
|
332
|
+
const std::string & key = match[1];
|
|
333
|
+
const std::string & value = match[2];
|
|
334
|
+
if (std::regex_match(key, match, etag_regex)) {
|
|
335
|
+
headers->etag = value;
|
|
336
|
+
} else if (std::regex_match(key, match, last_modified_regex)) {
|
|
337
|
+
headers->last_modified = value;
|
|
338
|
+
}
|
|
356
339
|
}
|
|
340
|
+
return n_items;
|
|
341
|
+
};
|
|
342
|
+
|
|
343
|
+
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
|
344
|
+
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
|
345
|
+
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
|
346
|
+
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
|
347
|
+
|
|
348
|
+
// we only allow retrying once for HEAD requests
|
|
349
|
+
// this is for the use case of running offline (no internet), where retrying can be annoying
|
|
350
|
+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
|
|
351
|
+
if (!was_perform_successful) {
|
|
352
|
+
head_request_ok = false;
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
long http_code = 0;
|
|
356
|
+
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
|
357
|
+
if (http_code == 200) {
|
|
358
|
+
head_request_ok = true;
|
|
359
|
+
} else {
|
|
360
|
+
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
|
361
|
+
head_request_ok = false;
|
|
357
362
|
}
|
|
358
363
|
|
|
359
364
|
// if head_request_ok is false, we don't have the etag or last-modified headers
|
|
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
|
|
460
465
|
|
|
461
466
|
// download multiple files from remote URLs to local paths
|
|
462
467
|
// the input is a vector of pairs <url, path>
|
|
463
|
-
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
|
|
468
|
+
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
|
|
464
469
|
// Prepare download in parallel
|
|
465
470
|
std::vector<std::future<bool>> futures_download;
|
|
466
471
|
for (auto const & item : urls) {
|
|
467
|
-
futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
|
|
468
|
-
return common_download_file_single(it.first, it.second, bearer_token);
|
|
472
|
+
futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
|
|
473
|
+
return common_download_file_single(it.first, it.second, bearer_token, offline);
|
|
469
474
|
}, item));
|
|
470
475
|
}
|
|
471
476
|
|
|
@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
|
|
|
481
486
|
|
|
482
487
|
static bool common_download_model(
|
|
483
488
|
const common_params_model & model,
|
|
484
|
-
const std::string & bearer_token
|
|
489
|
+
const std::string & bearer_token,
|
|
490
|
+
bool offline) {
|
|
485
491
|
// Basic validation of the model.url
|
|
486
492
|
if (model.url.empty()) {
|
|
487
493
|
LOG_ERR("%s: invalid model url\n", __func__);
|
|
488
494
|
return false;
|
|
489
495
|
}
|
|
490
496
|
|
|
491
|
-
if (!common_download_file_single(model.url, model.path, bearer_token)) {
|
|
497
|
+
if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
|
|
492
498
|
return false;
|
|
493
499
|
}
|
|
494
500
|
|
|
@@ -547,7 +553,7 @@ static bool common_download_model(
|
|
|
547
553
|
}
|
|
548
554
|
|
|
549
555
|
// Download in parallel
|
|
550
|
-
common_download_file_multiple(urls, bearer_token);
|
|
556
|
+
common_download_file_multiple(urls, bearer_token, offline);
|
|
551
557
|
}
|
|
552
558
|
|
|
553
559
|
return true;
|
|
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
|
|
608
614
|
*
|
|
609
615
|
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
|
610
616
|
*/
|
|
611
|
-
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
|
|
617
|
+
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
|
|
612
618
|
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
|
613
619
|
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
|
614
620
|
std::string hf_repo = parts[0];
|
|
@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
|
|
638
644
|
long res_code = 0;
|
|
639
645
|
std::string res_str;
|
|
640
646
|
bool use_cache = false;
|
|
641
|
-
|
|
642
|
-
auto res = common_remote_get_content(url, params);
|
|
643
|
-
res_code = res.first;
|
|
644
|
-
res_str = std::string(res.second.data(), res.second.size());
|
|
645
|
-
} catch (const std::exception & e) {
|
|
646
|
-
LOG_WRN("error: failed to get manifest: %s\n", e.what());
|
|
647
|
-
LOG_WRN("try reading from cache\n");
|
|
648
|
-
// try to read from cache
|
|
647
|
+
if (!offline) {
|
|
649
648
|
try {
|
|
649
|
+
auto res = common_remote_get_content(url, params);
|
|
650
|
+
res_code = res.first;
|
|
651
|
+
res_str = std::string(res.second.data(), res.second.size());
|
|
652
|
+
} catch (const std::exception & e) {
|
|
653
|
+
LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
if (res_code == 0) {
|
|
657
|
+
if (std::filesystem::exists(cached_response_path)) {
|
|
658
|
+
LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
|
|
650
659
|
res_str = read_file(cached_response_path);
|
|
651
660
|
res_code = 200;
|
|
652
661
|
use_cache = true;
|
|
653
|
-
}
|
|
654
|
-
throw std::runtime_error(
|
|
662
|
+
} else {
|
|
663
|
+
throw std::runtime_error(
|
|
664
|
+
offline ? "error: failed to get manifest (offline mode)"
|
|
665
|
+
: "error: failed to get manifest (check your internet connection)");
|
|
655
666
|
}
|
|
656
667
|
}
|
|
657
668
|
std::string ggufFile;
|
|
@@ -698,24 +709,25 @@ bool common_has_curl() {
|
|
|
698
709
|
return false;
|
|
699
710
|
}
|
|
700
711
|
|
|
701
|
-
static bool common_download_file_single(const std::string &, const std::string &, const std::string
|
|
712
|
+
static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
|
|
702
713
|
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
|
703
714
|
return false;
|
|
704
715
|
}
|
|
705
716
|
|
|
706
|
-
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string
|
|
717
|
+
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
|
|
707
718
|
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
708
719
|
return false;
|
|
709
720
|
}
|
|
710
721
|
|
|
711
722
|
static bool common_download_model(
|
|
712
723
|
const common_params_model &,
|
|
713
|
-
const std::string
|
|
724
|
+
const std::string &,
|
|
725
|
+
bool) {
|
|
714
726
|
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
715
727
|
return false;
|
|
716
728
|
}
|
|
717
729
|
|
|
718
|
-
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string
|
|
730
|
+
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
|
|
719
731
|
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
|
720
732
|
return {};
|
|
721
733
|
}
|
|
@@ -742,7 +754,8 @@ struct handle_model_result {
|
|
|
742
754
|
static handle_model_result common_params_handle_model(
|
|
743
755
|
struct common_params_model & model,
|
|
744
756
|
const std::string & bearer_token,
|
|
745
|
-
const std::string & model_path_default
|
|
757
|
+
const std::string & model_path_default,
|
|
758
|
+
bool offline) {
|
|
746
759
|
handle_model_result result;
|
|
747
760
|
// handle pre-fill default model path and url based on hf_repo and hf_file
|
|
748
761
|
{
|
|
@@ -750,7 +763,7 @@ static handle_model_result common_params_handle_model(
|
|
|
750
763
|
// short-hand to avoid specifying --hf-file -> default it to --model
|
|
751
764
|
if (model.hf_file.empty()) {
|
|
752
765
|
if (model.path.empty()) {
|
|
753
|
-
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
|
|
766
|
+
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
|
|
754
767
|
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
|
|
755
768
|
exit(1); // built without CURL, error message already printed
|
|
756
769
|
}
|
|
@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(
|
|
|
791
804
|
|
|
792
805
|
// then, download it if needed
|
|
793
806
|
if (!model.url.empty()) {
|
|
794
|
-
bool ok = common_download_model(model, bearer_token);
|
|
807
|
+
bool ok = common_download_model(model, bearer_token, offline);
|
|
795
808
|
if (!ok) {
|
|
796
809
|
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
|
|
797
810
|
exit(1);
|
|
@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
934
947
|
|
|
935
948
|
// handle model and download
|
|
936
949
|
{
|
|
937
|
-
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
|
950
|
+
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
|
|
938
951
|
if (params.no_mmproj) {
|
|
939
952
|
params.mmproj = {};
|
|
940
953
|
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
|
@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
944
957
|
// only download mmproj if the current example is using it
|
|
945
958
|
for (auto & ex : mmproj_examples) {
|
|
946
959
|
if (ctx_arg.ex == ex) {
|
|
947
|
-
common_params_handle_model(params.mmproj, params.hf_token, "");
|
|
960
|
+
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
|
|
948
961
|
break;
|
|
949
962
|
}
|
|
950
963
|
}
|
|
951
|
-
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
|
952
|
-
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
|
964
|
+
common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
|
|
965
|
+
common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
|
|
953
966
|
}
|
|
954
967
|
|
|
955
968
|
if (params.escape) {
|
|
@@ -1445,6 +1458,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1445
1458
|
params.n_keep = value;
|
|
1446
1459
|
}
|
|
1447
1460
|
));
|
|
1461
|
+
add_opt(common_arg(
|
|
1462
|
+
{"--swa-full"},
|
|
1463
|
+
string_format("use full-size SWA cache (default: %s)\n"
|
|
1464
|
+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
|
|
1465
|
+
[](common_params & params) {
|
|
1466
|
+
params.swa_full = true;
|
|
1467
|
+
}
|
|
1468
|
+
).set_env("LLAMA_ARG_SWA_FULL"));
|
|
1448
1469
|
add_opt(common_arg(
|
|
1449
1470
|
{"--no-context-shift"},
|
|
1450
1471
|
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
@@ -1670,7 +1691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1670
1691
|
[](common_params & params) {
|
|
1671
1692
|
params.warmup = false;
|
|
1672
1693
|
}
|
|
1673
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
|
|
1694
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
|
|
1674
1695
|
add_opt(common_arg(
|
|
1675
1696
|
{"--spm-infill"},
|
|
1676
1697
|
string_format(
|
|
@@ -2057,13 +2078,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2057
2078
|
params.grp_attn_w = value;
|
|
2058
2079
|
}
|
|
2059
2080
|
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
2060
|
-
add_opt(common_arg(
|
|
2061
|
-
{"-dkvc", "--dump-kv-cache"},
|
|
2062
|
-
"verbose print of the KV cache",
|
|
2063
|
-
[](common_params & params) {
|
|
2064
|
-
params.dump_kv_cache = true;
|
|
2065
|
-
}
|
|
2066
|
-
));
|
|
2067
2081
|
add_opt(common_arg(
|
|
2068
2082
|
{"-nkvo", "--no-kv-offload"},
|
|
2069
2083
|
"disable KV offload",
|
|
@@ -2232,12 +2246,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2232
2246
|
}
|
|
2233
2247
|
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
|
|
2234
2248
|
add_opt(common_arg(
|
|
2235
|
-
{"--image"}, "FILE",
|
|
2236
|
-
"path to an image file. use with multimodal models
|
|
2249
|
+
{"--image", "--audio"}, "FILE",
|
|
2250
|
+
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
|
|
2237
2251
|
[](common_params & params, const std::string & value) {
|
|
2238
2252
|
params.image.emplace_back(value);
|
|
2239
2253
|
}
|
|
2240
|
-
).set_examples({
|
|
2254
|
+
).set_examples({LLAMA_EXAMPLE_MTMD}));
|
|
2241
2255
|
if (llama_supports_rpc()) {
|
|
2242
2256
|
add_opt(common_arg(
|
|
2243
2257
|
{"--rpc"}, "SERVERS",
|
|
@@ -2585,7 +2599,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2585
2599
|
[](common_params & params, int value) {
|
|
2586
2600
|
params.n_junk = value;
|
|
2587
2601
|
}
|
|
2588
|
-
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
|
2602
|
+
).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
|
|
2589
2603
|
add_opt(common_arg(
|
|
2590
2604
|
{"--pos"}, "N",
|
|
2591
2605
|
string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
|
@@ -2648,7 +2662,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2648
2662
|
[](common_params & params) {
|
|
2649
2663
|
params.is_pp_shared = true;
|
|
2650
2664
|
}
|
|
2651
|
-
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
2665
|
+
).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
|
2652
2666
|
add_opt(common_arg(
|
|
2653
2667
|
{"-npp"}, "n0,n1,...",
|
|
2654
2668
|
"number of prompt tokens",
|
|
@@ -2847,15 +2861,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2847
2861
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
|
|
2848
2862
|
add_opt(common_arg(
|
|
2849
2863
|
{"--reasoning-format"}, "FORMAT",
|
|
2850
|
-
"
|
|
2851
|
-
"
|
|
2852
|
-
"
|
|
2864
|
+
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
|
|
2865
|
+
"- none: leaves thoughts unparsed in `message.content`\n"
|
|
2866
|
+
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
|
|
2867
|
+
"(default: deepseek)",
|
|
2853
2868
|
[](common_params & params, const std::string & value) {
|
|
2854
2869
|
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
|
2855
2870
|
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
|
2856
|
-
else { std::invalid_argument("invalid value"); }
|
|
2871
|
+
else { throw std::invalid_argument("invalid value"); }
|
|
2857
2872
|
}
|
|
2858
2873
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
|
|
2874
|
+
add_opt(common_arg(
|
|
2875
|
+
{"--reasoning-budget"}, "N",
|
|
2876
|
+
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
|
|
2877
|
+
[](common_params & params, int value) {
|
|
2878
|
+
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
|
|
2879
|
+
params.reasoning_budget = value;
|
|
2880
|
+
}
|
|
2881
|
+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
|
|
2859
2882
|
add_opt(common_arg(
|
|
2860
2883
|
{"--chat-template"}, "JINJA_TEMPLATE",
|
|
2861
2884
|
string_format(
|
|
@@ -2867,7 +2890,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2867
2890
|
[](common_params & params, const std::string & value) {
|
|
2868
2891
|
params.chat_template = value;
|
|
2869
2892
|
}
|
|
2870
|
-
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER,
|
|
2893
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
|
2871
2894
|
add_opt(common_arg(
|
|
2872
2895
|
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
|
|
2873
2896
|
string_format(
|
|
@@ -2880,6 +2903,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2880
2903
|
params.chat_template = read_file(value);
|
|
2881
2904
|
}
|
|
2882
2905
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
|
|
2906
|
+
add_opt(common_arg(
|
|
2907
|
+
{"--no-prefill-assistant"},
|
|
2908
|
+
string_format(
|
|
2909
|
+
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
|
|
2910
|
+
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
|
|
2911
|
+
),
|
|
2912
|
+
[](common_params & params) {
|
|
2913
|
+
params.prefill_assistant = false;
|
|
2914
|
+
}
|
|
2915
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
|
|
2883
2916
|
add_opt(common_arg(
|
|
2884
2917
|
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
|
2885
2918
|
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
|
@@ -2944,7 +2977,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2944
2977
|
[](common_params & params, const std::string & value) {
|
|
2945
2978
|
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
|
|
2946
2979
|
else if (value == "md") { params.batched_bench_output_jsonl = false; }
|
|
2947
|
-
else { std::invalid_argument("invalid value"); }
|
|
2980
|
+
else { throw std::invalid_argument("invalid value"); }
|
|
2948
2981
|
}
|
|
2949
2982
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
|
2950
2983
|
add_opt(common_arg(
|
|
@@ -2976,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2976
3009
|
common_log_set_verbosity_thold(INT_MAX);
|
|
2977
3010
|
}
|
|
2978
3011
|
));
|
|
3012
|
+
add_opt(common_arg(
|
|
3013
|
+
{"--offline"},
|
|
3014
|
+
"Offline mode: forces use of cache, prevents network access",
|
|
3015
|
+
[](common_params & params) {
|
|
3016
|
+
params.offline = true;
|
|
3017
|
+
}
|
|
3018
|
+
).set_env("LLAMA_OFFLINE"));
|
|
2979
3019
|
add_opt(common_arg(
|
|
2980
3020
|
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
|
2981
3021
|
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
|