@novastera-oss/llamarn 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -2
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +24 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +5 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
- package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -43
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
- package/cpp/llama.cpp/src/llama-arch.h +36 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
- package/cpp/llama.cpp/src/llama-batch.h +105 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
- package/cpp/llama.cpp/src/llama-graph.h +78 -79
- package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
- package/cpp/llama.cpp/src/llama-hparams.h +11 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
- package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +21 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
- package/cpp/llama.cpp/src/llama-model.h +40 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
- package/cpp/llama.cpp/src/llama-vocab.h +42 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +5 -0
- package/ios/include/llama.h +8 -43
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -390,6 +390,7 @@ extern "C" {
|
|
|
390
390
|
void * imatrix; // pointer to importance matrix data
|
|
391
391
|
void * kv_overrides; // pointer to vector containing overrides
|
|
392
392
|
void * tensor_types; // pointer to vector containing tensor types
|
|
393
|
+
void * prune_layers; // pointer to vector containing layer indices to prune
|
|
393
394
|
} llama_model_quantize_params;
|
|
394
395
|
|
|
395
396
|
typedef struct llama_logit_bias {
|
|
@@ -943,12 +944,14 @@ extern "C" {
|
|
|
943
944
|
// Requires the context to have a memory.
|
|
944
945
|
// For encode-decoder contexts, processes the batch using the decoder.
|
|
945
946
|
// A positive return value does not mean a fatal error, but rather a warning.
|
|
946
|
-
// Upon
|
|
947
|
+
// Upon fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context
|
|
948
|
+
// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
|
|
949
|
+
// Upon other return values, the memory state is restored to the state before this call
|
|
947
950
|
// 0 - success
|
|
948
951
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
949
|
-
// 2 - aborted
|
|
952
|
+
// 2 - aborted (processed ubatches will remain in the context's memory)
|
|
950
953
|
// -1 - invalid input batch
|
|
951
|
-
// < -1 - error
|
|
954
|
+
// < -1 - fatal error (processed ubatches will remain in the context's memory)
|
|
952
955
|
LLAMA_API int32_t llama_decode(
|
|
953
956
|
struct llama_context * ctx,
|
|
954
957
|
struct llama_batch batch);
|
|
@@ -1044,6 +1047,7 @@ extern "C" {
|
|
|
1044
1047
|
|
|
1045
1048
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
|
1046
1049
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
|
1050
|
+
LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
|
|
1047
1051
|
|
|
1048
1052
|
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
|
|
1049
1053
|
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
|
|
@@ -1087,6 +1091,7 @@ extern "C" {
|
|
|
1087
1091
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
|
1088
1092
|
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
|
1089
1093
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
|
1094
|
+
/// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
|
|
1090
1095
|
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
|
|
1091
1096
|
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
|
1092
1097
|
/// as plaintext. Does not insert a leading space.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/LlamaCppModel.cpp
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include <cstdlib>
|
|
6
6
|
#include <ctime>
|
|
7
7
|
#include <chrono>
|
|
8
|
+
#include <thread>
|
|
8
9
|
#include <fstream>
|
|
9
10
|
#include <iostream>
|
|
10
11
|
#include <random>
|
|
@@ -50,33 +51,60 @@ LlamaCppModel::~LlamaCppModel() {
|
|
|
50
51
|
}
|
|
51
52
|
|
|
52
53
|
void LlamaCppModel::release() {
|
|
53
|
-
//
|
|
54
|
+
// Signal completion to stop and wait for it to finish gracefully
|
|
54
55
|
if (is_predicting_) {
|
|
55
56
|
should_stop_completion_ = true;
|
|
56
57
|
|
|
57
|
-
//
|
|
58
|
+
// Wait more patiently for completion to stop, with proper backoff
|
|
58
59
|
int retry = 0;
|
|
59
|
-
while (is_predicting_ && retry < 10) {
|
|
60
|
-
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
|
60
|
+
while (is_predicting_ && retry < 100) { // Increased from 10 to 100
|
|
61
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(retry < 50 ? 10 : 50));
|
|
61
62
|
retry++;
|
|
62
63
|
}
|
|
64
|
+
|
|
65
|
+
// Force stop if still predicting
|
|
66
|
+
if (is_predicting_) {
|
|
67
|
+
is_predicting_ = false;
|
|
68
|
+
}
|
|
63
69
|
}
|
|
64
70
|
|
|
65
|
-
// Clean up our resources
|
|
71
|
+
// Clean up our resources with proper mutex protection
|
|
66
72
|
if (rn_ctx_) {
|
|
73
|
+
std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
|
|
74
|
+
|
|
75
|
+
// Clear KV cache before freeing context (following server.cpp pattern)
|
|
67
76
|
if (rn_ctx_->ctx) {
|
|
77
|
+
try {
|
|
78
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
79
|
+
} catch (...) {
|
|
80
|
+
// Ignore errors during cache clearing
|
|
81
|
+
}
|
|
82
|
+
|
|
68
83
|
llama_free(rn_ctx_->ctx);
|
|
69
84
|
rn_ctx_->ctx = nullptr;
|
|
70
85
|
}
|
|
71
86
|
|
|
87
|
+
// Free model after context (following server.cpp cleanup order)
|
|
72
88
|
if (rn_ctx_->model) {
|
|
73
89
|
llama_model_free(rn_ctx_->model);
|
|
74
90
|
rn_ctx_->model = nullptr;
|
|
75
91
|
}
|
|
76
92
|
|
|
93
|
+
// Clean up additional resources
|
|
94
|
+
rn_ctx_->vocab = nullptr; // This is owned by the model, so just null it
|
|
95
|
+
rn_ctx_->chat_templates.reset(); // Clean up chat templates
|
|
96
|
+
rn_ctx_->lora_adapters.clear(); // Clear LoRA adapters
|
|
97
|
+
|
|
98
|
+
// Reset state flags
|
|
99
|
+
rn_ctx_->model_loaded = false;
|
|
100
|
+
|
|
77
101
|
// Note: rn_ctx_ itself is owned by the module, so we don't delete it here
|
|
78
102
|
rn_ctx_ = nullptr;
|
|
79
103
|
}
|
|
104
|
+
|
|
105
|
+
// Reset our internal state
|
|
106
|
+
should_stop_completion_ = false;
|
|
107
|
+
is_predicting_ = false;
|
|
80
108
|
}
|
|
81
109
|
|
|
82
110
|
int32_t LlamaCppModel::getVocabSize() const {
|
|
@@ -133,6 +161,10 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
|
|
|
133
161
|
options.min_p = obj.getProperty(rt, "min_p").asNumber();
|
|
134
162
|
}
|
|
135
163
|
|
|
164
|
+
if (obj.hasProperty(rt, "presence_penalty") && !obj.getProperty(rt, "presence_penalty").isUndefined()) {
|
|
165
|
+
options.presence_penalty = obj.getProperty(rt, "presence_penalty").asNumber();
|
|
166
|
+
}
|
|
167
|
+
|
|
136
168
|
if (obj.hasProperty(rt, "n_predict") && !obj.getProperty(rt, "n_predict").isUndefined()) {
|
|
137
169
|
options.n_predict = obj.getProperty(rt, "n_predict").asNumber();
|
|
138
170
|
} else if (obj.hasProperty(rt, "max_tokens") && !obj.getProperty(rt, "max_tokens").isUndefined()) {
|
|
@@ -365,13 +397,14 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
365
397
|
std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
|
|
366
398
|
|
|
367
399
|
// Clear the context KV cache
|
|
368
|
-
|
|
400
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
369
401
|
|
|
370
402
|
// Store original sampling parameters to restore later
|
|
371
403
|
float orig_temp = rn_ctx_->params.sampling.temp;
|
|
372
404
|
float orig_top_p = rn_ctx_->params.sampling.top_p;
|
|
373
405
|
float orig_top_k = rn_ctx_->params.sampling.top_k;
|
|
374
406
|
float orig_min_p = rn_ctx_->params.sampling.min_p;
|
|
407
|
+
float orig_presence_penalty = rn_ctx_->params.sampling.penalty_present;
|
|
375
408
|
int orig_n_predict = rn_ctx_->params.n_predict;
|
|
376
409
|
|
|
377
410
|
// Set sampling parameters from options
|
|
@@ -379,6 +412,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
379
412
|
rn_ctx_->params.sampling.top_p = options.top_p;
|
|
380
413
|
rn_ctx_->params.sampling.top_k = options.top_k;
|
|
381
414
|
rn_ctx_->params.sampling.min_p = options.min_p;
|
|
415
|
+
rn_ctx_->params.sampling.penalty_present = options.presence_penalty;
|
|
382
416
|
rn_ctx_->params.n_predict = options.n_predict;
|
|
383
417
|
|
|
384
418
|
// Check for a partial callback
|
|
@@ -426,6 +460,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
426
460
|
rn_ctx_->params.sampling.top_p = orig_top_p;
|
|
427
461
|
rn_ctx_->params.sampling.top_k = orig_top_k;
|
|
428
462
|
rn_ctx_->params.sampling.min_p = orig_min_p;
|
|
463
|
+
rn_ctx_->params.sampling.penalty_present = orig_presence_penalty;
|
|
429
464
|
rn_ctx_->params.n_predict = orig_n_predict;
|
|
430
465
|
|
|
431
466
|
return result;
|
|
@@ -885,29 +920,28 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
|
|
|
885
920
|
}
|
|
886
921
|
|
|
887
922
|
// Clear the context KV cache to ensure clean embedding
|
|
888
|
-
|
|
923
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
889
924
|
|
|
890
925
|
// Enable embedding mode
|
|
891
926
|
llama_set_embeddings(rn_ctx_->ctx, true);
|
|
892
927
|
|
|
893
|
-
//
|
|
928
|
+
// Create and populate batch using common_batch functions (following server.cpp pattern)
|
|
929
|
+
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
|
|
930
|
+
|
|
931
|
+
common_batch_clear(batch);
|
|
894
932
|
for (int i = 0; i < (int)tokens.size(); i++) {
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
/* token */ &token,
|
|
899
|
-
/* embd */ nullptr,
|
|
900
|
-
/* pos */ &i,
|
|
901
|
-
/* n_seq_id */ nullptr,
|
|
902
|
-
/* seq_id */ nullptr,
|
|
903
|
-
/* logits */ nullptr
|
|
904
|
-
};
|
|
905
|
-
|
|
906
|
-
if (llama_decode(rn_ctx_->ctx, batch) != 0) {
|
|
907
|
-
throw std::runtime_error("Failed to decode token for embedding");
|
|
908
|
-
}
|
|
933
|
+
// For embeddings, we typically need logits for the last token (for pooling)
|
|
934
|
+
bool needs_logits = (i == (int)tokens.size() - 1);
|
|
935
|
+
common_batch_add(batch, tokens[i], i, {0}, needs_logits);
|
|
909
936
|
}
|
|
910
937
|
|
|
938
|
+
if (llama_decode(rn_ctx_->ctx, batch) != 0) {
|
|
939
|
+
llama_batch_free(batch);
|
|
940
|
+
throw std::runtime_error("Failed to decode tokens for embedding");
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
llama_batch_free(batch);
|
|
944
|
+
|
|
911
945
|
// Get embedding size from the model
|
|
912
946
|
const int n_embd = llama_model_n_embd(rn_ctx_->model);
|
|
913
947
|
if (n_embd <= 0) {
|
package/cpp/build-info.cpp
CHANGED
|
@@ -95,7 +95,7 @@ endif()
|
|
|
95
95
|
if (NOT DEFINED LLAMA_BUILD_COMMIT)
|
|
96
96
|
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
|
97
97
|
endif()
|
|
98
|
-
set(LLAMA_INSTALL_VERSION 0.0.${
|
|
98
|
+
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
|
|
99
99
|
|
|
100
100
|
# override ggml options
|
|
101
101
|
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
|
@@ -120,7 +120,6 @@ endfunction()
|
|
|
120
120
|
|
|
121
121
|
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
|
122
122
|
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
|
123
|
-
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
|
124
123
|
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
|
125
124
|
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
|
126
125
|
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
package/cpp/llama.cpp/README.md
CHANGED
|
@@ -6,9 +6,9 @@
|
|
|
6
6
|
[](https://github.com/ggml-org/llama.cpp/releases)
|
|
7
7
|
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
|
8
8
|
|
|
9
|
-
[
|
|
9
|
+
[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
LLM inference in C/C++
|
|
12
12
|
|
|
13
13
|
## Recent API changes
|
|
14
14
|
|
|
@@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|
|
17
17
|
|
|
18
18
|
## Hot topics
|
|
19
19
|
|
|
20
|
-
-
|
|
21
|
-
-
|
|
20
|
+
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
|
21
|
+
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
|
22
22
|
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
|
23
|
-
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
|
24
23
|
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
|
25
24
|
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
|
26
25
|
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
|
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
|
|
|
86
86
|
endif()
|
|
87
87
|
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
|
|
88
88
|
include_directories(${CURL_INCLUDE_DIRS})
|
|
89
|
-
|
|
90
|
-
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
|
|
89
|
+
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
|
|
91
90
|
endif ()
|
|
92
91
|
|
|
93
92
|
if (LLAMA_LLGUIDANCE)
|
|
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
|
|
|
112
111
|
|
|
113
112
|
ExternalProject_Add(llguidance_ext
|
|
114
113
|
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
|
115
|
-
#
|
|
116
|
-
GIT_TAG
|
|
114
|
+
# v1.0.1:
|
|
115
|
+
GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
|
|
117
116
|
PREFIX ${CMAKE_BINARY_DIR}/llguidance
|
|
118
117
|
SOURCE_DIR ${LLGUIDANCE_SRC}
|
|
119
118
|
BUILD_IN_SOURCE TRUE
|
|
120
119
|
CONFIGURE_COMMAND ""
|
|
121
|
-
BUILD_COMMAND cargo build --release
|
|
120
|
+
BUILD_COMMAND cargo build --release --package llguidance
|
|
122
121
|
INSTALL_COMMAND ""
|
|
123
122
|
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
|
|
124
123
|
UPDATE_COMMAND ""
|
|
@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2706
2706
|
params.embd_sep = value;
|
|
2707
2707
|
}
|
|
2708
2708
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2709
|
+
add_opt(common_arg(
|
|
2710
|
+
{"--cls-separator"}, "STRING",
|
|
2711
|
+
"separator of classification sequences (default \\t) for example \"<#seq#>\"",
|
|
2712
|
+
[](common_params & params, const std::string & value) {
|
|
2713
|
+
params.cls_sep = value;
|
|
2714
|
+
}
|
|
2715
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2709
2716
|
add_opt(common_arg(
|
|
2710
2717
|
{"--host"}, "HOST",
|
|
2711
2718
|
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
|
|
@@ -2727,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2727
2734
|
params.public_path = value;
|
|
2728
2735
|
}
|
|
2729
2736
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
|
2737
|
+
add_opt(common_arg(
|
|
2738
|
+
{"--api-prefix"}, "PREFIX",
|
|
2739
|
+
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
|
|
2740
|
+
[](common_params & params, const std::string & value) {
|
|
2741
|
+
params.api_prefix = value;
|
|
2742
|
+
}
|
|
2743
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
|
2730
2744
|
add_opt(common_arg(
|
|
2731
2745
|
{"--no-webui"},
|
|
2732
2746
|
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
|
|
@@ -2787,6 +2801,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2787
2801
|
params.ssl_file_cert = value;
|
|
2788
2802
|
}
|
|
2789
2803
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
|
2804
|
+
add_opt(common_arg(
|
|
2805
|
+
{"--chat-template-kwargs"}, "STRING",
|
|
2806
|
+
string_format("sets additional params for the json template parser"),
|
|
2807
|
+
[](common_params & params, const std::string & value) {
|
|
2808
|
+
auto parsed = json::parse(value);
|
|
2809
|
+
for (const auto & item : parsed.items()) {
|
|
2810
|
+
params.default_template_kwargs[item.key()] = item.value().dump();
|
|
2811
|
+
}
|
|
2812
|
+
}
|
|
2813
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
|
2790
2814
|
add_opt(common_arg(
|
|
2791
2815
|
{"-to", "--timeout"}, "N",
|
|
2792
2816
|
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
|
@@ -17,6 +17,8 @@
|
|
|
17
17
|
#include <string>
|
|
18
18
|
#include <vector>
|
|
19
19
|
|
|
20
|
+
using json = nlohmann::ordered_json;
|
|
21
|
+
|
|
20
22
|
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
|
|
21
23
|
auto time = std::chrono::system_clock::to_time_t(now);
|
|
22
24
|
auto local_time = *std::localtime(&time);
|
|
@@ -140,6 +142,7 @@ struct templates_params {
|
|
|
140
142
|
bool add_generation_prompt = true;
|
|
141
143
|
bool enable_thinking = true;
|
|
142
144
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
145
|
+
json extra_context;
|
|
143
146
|
};
|
|
144
147
|
|
|
145
148
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
|
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
|
|
|
720
723
|
|
|
721
724
|
static std::string apply(
|
|
722
725
|
const common_chat_template & tmpl,
|
|
723
|
-
const
|
|
724
|
-
const
|
|
725
|
-
|
|
726
|
-
const
|
|
726
|
+
const struct templates_params & inputs,
|
|
727
|
+
const std::optional<json> & messages_override = std::nullopt,
|
|
728
|
+
const std::optional<json> & tools_override = std::nullopt,
|
|
729
|
+
const std::optional<json> & additional_context = std::nullopt)
|
|
727
730
|
{
|
|
728
731
|
minja::chat_template_inputs tmpl_inputs;
|
|
729
|
-
tmpl_inputs.messages = messages;
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
732
|
+
tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
|
|
733
|
+
if (tools_override) {
|
|
734
|
+
tmpl_inputs.tools = *tools_override;
|
|
735
|
+
} else {
|
|
736
|
+
tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
|
|
737
|
+
}
|
|
738
|
+
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
|
|
739
|
+
tmpl_inputs.extra_context = inputs.extra_context;
|
|
740
|
+
if (additional_context) {
|
|
741
|
+
tmpl_inputs.extra_context.merge_patch(*additional_context);
|
|
742
|
+
}
|
|
733
743
|
// TODO: add flag to control date/time, if only for testing purposes.
|
|
734
744
|
// tmpl_inputs.now = std::chrono::system_clock::now();
|
|
735
745
|
|
|
@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
|
|
|
828
838
|
inputs.messages,
|
|
829
839
|
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
|
|
830
840
|
|
|
831
|
-
data.prompt = apply(tmpl,
|
|
841
|
+
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
|
|
832
842
|
data.format = COMMON_CHAT_FORMAT_GENERIC;
|
|
833
843
|
return data;
|
|
834
844
|
}
|
|
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
|
|
|
904
914
|
data.preserved_tokens = {
|
|
905
915
|
"[TOOL_CALLS]",
|
|
906
916
|
};
|
|
907
|
-
data.prompt = apply(tmpl, inputs
|
|
917
|
+
data.prompt = apply(tmpl, inputs);
|
|
908
918
|
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
|
909
919
|
return data;
|
|
910
920
|
}
|
|
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
|
|
|
934
944
|
adjusted_messages.push_back(msg);
|
|
935
945
|
}
|
|
936
946
|
}
|
|
937
|
-
data.prompt = apply(tmpl,
|
|
947
|
+
data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
|
938
948
|
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
|
|
939
949
|
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
|
|
940
950
|
if (!inputs.enable_thinking) {
|
|
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
|
|
|
1122
1132
|
} else {
|
|
1123
1133
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1124
1134
|
}
|
|
1125
|
-
data.prompt = apply(tmpl, inputs
|
|
1135
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
|
|
1126
1136
|
{"date_string", format_time(inputs.now, "%d %b %Y")},
|
|
1127
1137
|
{"tools_in_user_message", false},
|
|
1128
1138
|
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
|
|
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
|
|
|
1187
1197
|
|
|
1188
1198
|
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1189
1199
|
common_chat_params data;
|
|
1190
|
-
auto prompt = apply(tmpl, inputs
|
|
1200
|
+
auto prompt = apply(tmpl, inputs);
|
|
1191
1201
|
|
|
1192
1202
|
// Hacks to fix the official (broken) prompt.
|
|
1193
1203
|
// It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
|
|
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
|
|
|
1282
1292
|
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1283
1293
|
LOG_DBG("%s\n", __func__);
|
|
1284
1294
|
common_chat_params data;
|
|
1285
|
-
data.prompt = apply(tmpl, inputs
|
|
1295
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
|
|
1286
1296
|
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
|
|
1287
1297
|
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
|
1288
1298
|
});
|
|
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
|
|
1338
1348
|
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
|
|
1339
1349
|
// If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
|
|
1340
1350
|
common_chat_params data;
|
|
1341
|
-
data.prompt = apply(tmpl, inputs
|
|
1351
|
+
data.prompt = apply(tmpl, inputs);
|
|
1342
1352
|
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
|
|
1343
1353
|
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
|
1344
1354
|
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
|
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
|
|
|
1465
1475
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1466
1476
|
}
|
|
1467
1477
|
|
|
1468
|
-
data.prompt = apply(tmpl, inputs
|
|
1478
|
+
data.prompt = apply(tmpl, inputs);
|
|
1469
1479
|
// TODO: if (has_raw_python)
|
|
1470
1480
|
return data;
|
|
1471
1481
|
}
|
|
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
|
|
|
1498
1508
|
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1499
1509
|
common_chat_params data;
|
|
1500
1510
|
|
|
1501
|
-
json
|
|
1511
|
+
json extra_context = json {
|
|
1502
1512
|
{"enable_thinking", inputs.enable_thinking},
|
|
1503
1513
|
};
|
|
1514
|
+
extra_context.update(inputs.extra_context);
|
|
1504
1515
|
|
|
1505
|
-
data.prompt = apply(tmpl, inputs
|
|
1516
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
|
|
1506
1517
|
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
|
|
1507
1518
|
if (string_ends_with(data.prompt, "<think>\n")) {
|
|
1508
|
-
if (!
|
|
1519
|
+
if (!extra_context["enable_thinking"]) {
|
|
1509
1520
|
data.prompt += "</think>";
|
|
1510
1521
|
} else {
|
|
1511
1522
|
data.thinking_forced_open = true;
|
|
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
|
|
1691
1702
|
|
|
1692
1703
|
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1693
1704
|
common_chat_params data;
|
|
1694
|
-
data.prompt = apply(tmpl, inputs
|
|
1705
|
+
data.prompt = apply(tmpl, inputs);
|
|
1695
1706
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1696
1707
|
data.grammar_lazy = false;
|
|
1697
1708
|
if (!inputs.json_schema.is_null()) {
|
|
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
1722
1733
|
params.enable_thinking = inputs.enable_thinking;
|
|
1723
1734
|
params.grammar = inputs.grammar;
|
|
1724
1735
|
params.now = inputs.now;
|
|
1736
|
+
|
|
1737
|
+
params.extra_context = json::object();
|
|
1738
|
+
for (auto el : inputs.chat_template_kwargs) {
|
|
1739
|
+
params.extra_context[el.first] = json::parse(el.second);
|
|
1740
|
+
}
|
|
1741
|
+
|
|
1725
1742
|
if (!inputs.json_schema.empty()) {
|
|
1726
1743
|
params.json_schema = json::parse(inputs.json_schema);
|
|
1727
1744
|
}
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
#include <chrono>
|
|
8
8
|
#include <string>
|
|
9
9
|
#include <vector>
|
|
10
|
+
#include <map>
|
|
10
11
|
|
|
11
12
|
struct common_chat_templates;
|
|
12
13
|
|
|
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
|
|
|
125
126
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
126
127
|
bool enable_thinking = true;
|
|
127
128
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
129
|
+
std::map<std::string, std::string> chat_template_kwargs;
|
|
128
130
|
};
|
|
129
131
|
|
|
130
132
|
struct common_chat_params {
|
|
@@ -1290,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
|
|
|
1290
1290
|
int n_tokens = text.length() + 2 * add_special;
|
|
1291
1291
|
std::vector<llama_token> result(n_tokens);
|
|
1292
1292
|
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
1293
|
+
if (n_tokens == std::numeric_limits<int32_t>::min()) {
|
|
1294
|
+
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
|
|
1295
|
+
}
|
|
1293
1296
|
if (n_tokens < 0) {
|
|
1294
1297
|
result.resize(-n_tokens);
|
|
1295
1298
|
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
#include <string>
|
|
9
9
|
#include <string_view>
|
|
10
10
|
#include <vector>
|
|
11
|
+
#include <map>
|
|
11
12
|
#include <sstream>
|
|
12
13
|
|
|
13
14
|
#ifdef _WIN32
|
|
@@ -358,6 +359,7 @@ struct common_params {
|
|
|
358
359
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
359
360
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
360
361
|
std::string embd_sep = "\n"; // separator of embeddings
|
|
362
|
+
std::string cls_sep = "\t"; // separator of classification sequences
|
|
361
363
|
|
|
362
364
|
// server params
|
|
363
365
|
int32_t port = 8080; // server listens on this network port
|
|
@@ -368,6 +370,7 @@ struct common_params {
|
|
|
368
370
|
|
|
369
371
|
std::string hostname = "127.0.0.1";
|
|
370
372
|
std::string public_path = ""; // NOLINT
|
|
373
|
+
std::string api_prefix = ""; // NOLINT
|
|
371
374
|
std::string chat_template = ""; // NOLINT
|
|
372
375
|
bool use_jinja = false; // NOLINT
|
|
373
376
|
bool enable_chat_template = true;
|
|
@@ -380,6 +383,8 @@ struct common_params {
|
|
|
380
383
|
std::string ssl_file_key = ""; // NOLINT
|
|
381
384
|
std::string ssl_file_cert = ""; // NOLINT
|
|
382
385
|
|
|
386
|
+
std::map<std::string, std::string> default_template_kwargs;
|
|
387
|
+
|
|
383
388
|
// "advanced" endpoints are disabled by default for better security
|
|
384
389
|
bool webui = true;
|
|
385
390
|
bool endpoint_slots = false;
|