@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/cpp/llama.cpp/Makefile
CHANGED
|
@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
|
|
|
367
367
|
endif
|
|
368
368
|
|
|
369
369
|
ifndef GGML_NO_CPU_AARCH64
|
|
370
|
-
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
|
|
370
|
+
MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
|
|
371
371
|
endif
|
|
372
372
|
|
|
373
373
|
# warnings
|
|
@@ -970,7 +970,7 @@ OBJ_GGML = \
|
|
|
970
970
|
$(DIR_GGML)/src/ggml-threading.o \
|
|
971
971
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
|
|
972
972
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
|
|
973
|
-
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
|
|
973
|
+
$(DIR_GGML)/src/ggml-cpu/repack.o \
|
|
974
974
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
|
|
975
975
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
|
|
976
976
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
|
package/cpp/llama.cpp/README.md
CHANGED
|
@@ -3,9 +3,10 @@
|
|
|
3
3
|

|
|
4
4
|
|
|
5
5
|
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
[](https://github.com/ggml-org/llama.cpp/releases)
|
|
6
7
|
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
|
7
8
|
|
|
8
|
-
[Roadmap](https://github.com/users/ggerganov/projects/7) / [
|
|
9
|
+
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
|
9
10
|
|
|
10
11
|
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
|
11
12
|
|
|
@@ -17,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|
|
17
18
|
## Hot topics
|
|
18
19
|
|
|
19
20
|
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
|
20
|
-
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
|
21
21
|
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
|
|
22
22
|
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
|
23
23
|
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
|
@@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|
|
28
28
|
|
|
29
29
|
----
|
|
30
30
|
|
|
31
|
+
## Quick start
|
|
32
|
+
|
|
33
|
+
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
|
|
34
|
+
|
|
35
|
+
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
|
|
36
|
+
- Run with Docker - see our [Docker documentation](docs/docker.md)
|
|
37
|
+
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
|
|
38
|
+
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
|
|
39
|
+
|
|
40
|
+
Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
|
|
41
|
+
|
|
42
|
+
Example command:
|
|
43
|
+
|
|
44
|
+
```sh
|
|
45
|
+
# Use a local model file
|
|
46
|
+
llama-cli -m my_model.gguf
|
|
47
|
+
|
|
48
|
+
# Or download and run a model directly from Hugging Face
|
|
49
|
+
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
|
|
50
|
+
|
|
51
|
+
# Launch OpenAI-compatible API server
|
|
52
|
+
llama-server -hf ggml-org/gemma-3-1b-it-GGUF
|
|
53
|
+
```
|
|
54
|
+
|
|
31
55
|
## Description
|
|
32
56
|
|
|
33
57
|
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
|
@@ -230,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
230
254
|
|
|
231
255
|
</details>
|
|
232
256
|
|
|
257
|
+
|
|
233
258
|
## Supported backends
|
|
234
259
|
|
|
235
260
|
| Backend | Target devices |
|
|
@@ -246,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
246
271
|
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
|
247
272
|
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
|
248
273
|
|
|
249
|
-
## Building the project
|
|
250
|
-
|
|
251
|
-
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
|
|
252
|
-
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
|
|
253
|
-
|
|
254
|
-
- Clone this repository and build locally, see [how to build](docs/build.md)
|
|
255
|
-
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
|
|
256
|
-
- Use a Docker image, see [documentation for Docker](docs/docker.md)
|
|
257
|
-
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
|
|
258
|
-
|
|
259
274
|
## Obtaining and quantizing models
|
|
260
275
|
|
|
261
276
|
The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
|
|
@@ -263,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
|
|
|
263
278
|
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
|
|
264
279
|
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
|
|
265
280
|
|
|
266
|
-
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
|
|
281
|
+
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
|
|
282
|
+
|
|
283
|
+
```sh
|
|
284
|
+
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
|
|
285
|
+
```
|
|
267
286
|
|
|
268
287
|
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
|
|
269
288
|
|
|
@@ -7,8 +7,8 @@ llama_add_compile_flags()
|
|
|
7
7
|
# Build info header
|
|
8
8
|
#
|
|
9
9
|
|
|
10
|
-
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
|
11
|
-
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
|
10
|
+
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
|
11
|
+
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
|
|
12
12
|
|
|
13
13
|
# Is git submodule
|
|
14
14
|
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
|
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
|
|
18
18
|
if (SLASH_POS EQUAL 0)
|
|
19
19
|
set(GIT_DIR "${REAL_GIT_DIR}")
|
|
20
20
|
else()
|
|
21
|
-
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
|
21
|
+
set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
|
|
22
22
|
endif()
|
|
23
23
|
endif()
|
|
24
24
|
|
|
25
25
|
if(EXISTS "${GIT_DIR}/index")
|
|
26
|
-
|
|
26
|
+
# For build-info.cpp below
|
|
27
|
+
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
|
|
27
28
|
else()
|
|
28
29
|
message(WARNING "Git index not found in git repository.")
|
|
29
|
-
set(GIT_INDEX "")
|
|
30
30
|
endif()
|
|
31
31
|
else()
|
|
32
32
|
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
|
33
|
-
set(GIT_INDEX "")
|
|
34
33
|
endif()
|
|
35
34
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
|
41
|
-
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
|
42
|
-
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
|
|
43
|
-
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
|
|
44
|
-
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
|
45
|
-
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
|
46
|
-
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
|
47
|
-
VERBATIM
|
|
48
|
-
)
|
|
35
|
+
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
|
|
36
|
+
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
|
|
37
|
+
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
|
38
|
+
|
|
49
39
|
set(TARGET build_info)
|
|
50
|
-
add_library(${TARGET} OBJECT
|
|
40
|
+
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
|
|
51
41
|
if (BUILD_SHARED_LIBS)
|
|
52
42
|
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
53
43
|
endif()
|
|
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
988
988
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
989
989
|
}
|
|
990
990
|
|
|
991
|
-
if (params.reranking && params.embedding) {
|
|
992
|
-
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
|
993
|
-
}
|
|
994
|
-
|
|
995
991
|
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
|
996
992
|
throw std::runtime_error(string_format(
|
|
997
993
|
"error: the supplied chat template is not supported: %s%s\n",
|
|
@@ -2710,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2710
2706
|
params.embd_sep = value;
|
|
2711
2707
|
}
|
|
2712
2708
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2709
|
+
add_opt(common_arg(
|
|
2710
|
+
{"--cls-separator"}, "STRING",
|
|
2711
|
+
"separator of classification sequences (default \\t) for example \"<#seq#>\"",
|
|
2712
|
+
[](common_params & params, const std::string & value) {
|
|
2713
|
+
params.cls_sep = value;
|
|
2714
|
+
}
|
|
2715
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2713
2716
|
add_opt(common_arg(
|
|
2714
2717
|
{"--host"}, "HOST",
|
|
2715
2718
|
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
|
|
@@ -2747,9 +2750,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2747
2750
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
|
2748
2751
|
add_opt(common_arg(
|
|
2749
2752
|
{"--reranking", "--rerank"},
|
|
2750
|
-
string_format("enable reranking endpoint on server (default: %s)",
|
|
2753
|
+
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
|
|
2751
2754
|
[](common_params & params) {
|
|
2752
|
-
params.reranking = true;
|
|
2755
|
+
params.embedding = true;
|
|
2756
|
+
params.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
2753
2757
|
}
|
|
2754
2758
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
|
2755
2759
|
add_opt(common_arg(
|
|
@@ -2869,6 +2873,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2869
2873
|
"(default: deepseek)",
|
|
2870
2874
|
[](common_params & params, const std::string & value) {
|
|
2871
2875
|
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
|
2876
|
+
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
|
|
2872
2877
|
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
|
2873
2878
|
else { throw std::invalid_argument("invalid value"); }
|
|
2874
2879
|
}
|
|
@@ -3212,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3212
3217
|
params.speculative.model.path = value;
|
|
3213
3218
|
}
|
|
3214
3219
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
|
3220
|
+
add_opt(common_arg(
|
|
3221
|
+
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
|
|
3222
|
+
string_format(
|
|
3223
|
+
"KV cache data type for K for the draft model\n"
|
|
3224
|
+
"allowed values: %s\n"
|
|
3225
|
+
"(default: %s)",
|
|
3226
|
+
get_all_kv_cache_types().c_str(),
|
|
3227
|
+
ggml_type_name(params.speculative.cache_type_k)
|
|
3228
|
+
),
|
|
3229
|
+
[](common_params & params, const std::string & value) {
|
|
3230
|
+
params.speculative.cache_type_k = kv_cache_type_from_str(value);
|
|
3231
|
+
}
|
|
3232
|
+
).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
|
|
3233
|
+
add_opt(common_arg(
|
|
3234
|
+
{"-ctvd", "--cache-type-v-draft"}, "TYPE",
|
|
3235
|
+
string_format(
|
|
3236
|
+
"KV cache data type for V for the draft model\n"
|
|
3237
|
+
"allowed values: %s\n"
|
|
3238
|
+
"(default: %s)",
|
|
3239
|
+
get_all_kv_cache_types().c_str(),
|
|
3240
|
+
ggml_type_name(params.speculative.cache_type_v)
|
|
3241
|
+
),
|
|
3242
|
+
[](common_params & params, const std::string & value) {
|
|
3243
|
+
params.speculative.cache_type_v = kv_cache_type_from_str(value);
|
|
3244
|
+
}
|
|
3245
|
+
).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
|
|
3215
3246
|
|
|
3216
3247
|
add_opt(common_arg(
|
|
3217
3248
|
{"-mv", "--model-vocoder"}, "FNAME",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
int LLAMA_BUILD_NUMBER = @
|
|
2
|
-
char const *LLAMA_COMMIT = "@
|
|
1
|
+
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
|
|
2
|
+
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
|
3
3
|
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
|
4
4
|
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
|
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
|
|
|
49
49
|
|
|
50
50
|
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
|
51
51
|
result_.tool_calls.emplace_back(tool_call);
|
|
52
|
+
|
|
52
53
|
return true;
|
|
53
54
|
}
|
|
54
55
|
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
|
@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
|
|
|
378
379
|
/* .is_partial = */ found_healing_marker,
|
|
379
380
|
};
|
|
380
381
|
}
|
|
382
|
+
|
|
383
|
+
void common_chat_msg_parser::clear_tools() {
|
|
384
|
+
result_.tool_calls.clear();
|
|
385
|
+
}
|
|
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
|
|
|
82
82
|
|
|
83
83
|
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
|
|
84
84
|
std::vector<common_chat_msg_diff> diffs;
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
85
|
+
if (previous_msg.reasoning_content != new_msg.reasoning_content) {
|
|
86
|
+
auto & diff = diffs.emplace_back();
|
|
87
|
+
diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
|
|
88
|
+
}
|
|
89
89
|
if (previous_msg.content != new_msg.content) {
|
|
90
90
|
auto & diff = diffs.emplace_back();
|
|
91
91
|
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
|
|
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
|
|
|
385
385
|
|
|
386
386
|
template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
|
387
387
|
json delta = json::object();
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
388
|
+
if (!diff.reasoning_content_delta.empty()) {
|
|
389
|
+
delta["reasoning_content"] = diff.reasoning_content_delta;
|
|
390
|
+
}
|
|
391
391
|
if (!diff.content_delta.empty()) {
|
|
392
392
|
delta["content"] = diff.content_delta;
|
|
393
393
|
}
|
|
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
|
|
|
598
598
|
switch (format) {
|
|
599
599
|
case COMMON_REASONING_FORMAT_NONE: return "none";
|
|
600
600
|
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
|
|
601
|
+
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
|
|
601
602
|
default:
|
|
602
603
|
throw std::runtime_error("Unknown reasoning format");
|
|
603
604
|
}
|
|
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
|
|
|
1837
1838
|
if (res < 0) {
|
|
1838
1839
|
// if the custom "tmpl" is not supported, we throw an error
|
|
1839
1840
|
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
|
1840
|
-
throw std::runtime_error("this custom template is not supported");
|
|
1841
|
+
throw std::runtime_error("this custom template is not supported, try using --jinja");
|
|
1841
1842
|
}
|
|
1842
1843
|
|
|
1843
1844
|
// if it turns out that our buffer is too small, we resize it
|
|
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
|
|
1920
1921
|
} catch (const common_chat_msg_partial_exception & ex) {
|
|
1921
1922
|
LOG_DBG("Partial parse: %s\n", ex.what());
|
|
1922
1923
|
if (!is_partial) {
|
|
1923
|
-
|
|
1924
|
+
builder.clear_tools();
|
|
1925
|
+
builder.move_to(0);
|
|
1926
|
+
common_chat_parse_content_only(builder);
|
|
1924
1927
|
}
|
|
1925
1928
|
}
|
|
1926
1929
|
auto msg = builder.result();
|
|
@@ -70,7 +70,7 @@ struct common_chat_msg {
|
|
|
70
70
|
};
|
|
71
71
|
|
|
72
72
|
struct common_chat_msg_diff {
|
|
73
|
-
|
|
73
|
+
std::string reasoning_content_delta;
|
|
74
74
|
std::string content_delta;
|
|
75
75
|
size_t tool_call_index = std::string::npos;
|
|
76
76
|
common_chat_tool_call tool_call_delta;
|
|
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
|
|
|
466
466
|
|
|
467
467
|
std::string regex_escape(const std::string & s) {
|
|
468
468
|
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
|
469
|
-
return std::regex_replace(s, special_chars, "
|
|
469
|
+
return std::regex_replace(s, special_chars, "\\$&");
|
|
470
470
|
}
|
|
471
471
|
|
|
472
472
|
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
|
|
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
|
|
|
706
706
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
|
707
707
|
# pragma clang diagnostic push
|
|
708
708
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
709
|
+
#elif defined(__GNUC__)
|
|
710
|
+
# pragma GCC diagnostic push
|
|
711
|
+
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
709
712
|
#endif
|
|
713
|
+
|
|
710
714
|
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
|
711
715
|
|
|
712
716
|
#if defined(__clang__)
|
|
713
717
|
# pragma clang diagnostic pop
|
|
718
|
+
#elif defined(__GNUC__)
|
|
719
|
+
# pragma GCC diagnostic pop
|
|
714
720
|
#endif
|
|
715
721
|
|
|
716
722
|
filename_utf32 = converter.from_bytes(filename);
|
|
@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
|
|
|
767
773
|
return true;
|
|
768
774
|
}
|
|
769
775
|
|
|
776
|
+
#include <iostream>
|
|
777
|
+
|
|
778
|
+
|
|
770
779
|
// returns true if successful, false otherwise
|
|
771
780
|
bool fs_create_directory_with_parents(const std::string & path) {
|
|
772
781
|
#ifdef _WIN32
|
|
@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|
|
784
793
|
// process path from front to back, procedurally creating directories
|
|
785
794
|
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
|
786
795
|
const std::wstring subpath = wpath.substr(0, pos_slash);
|
|
787
|
-
const wchar_t * test = subpath.c_str();
|
|
788
796
|
|
|
789
|
-
|
|
797
|
+
pos_slash += 1;
|
|
798
|
+
|
|
799
|
+
// skip the drive letter, in some systems it can return an access denied error
|
|
800
|
+
if (subpath.length() == 2 && subpath[1] == ':') {
|
|
801
|
+
continue;
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
const bool success = CreateDirectoryW(subpath.c_str(), NULL);
|
|
805
|
+
|
|
790
806
|
if (!success) {
|
|
791
807
|
const DWORD error = GetLastError();
|
|
792
808
|
|
|
@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|
|
800
816
|
return false;
|
|
801
817
|
}
|
|
802
818
|
}
|
|
803
|
-
|
|
804
|
-
pos_slash += 1;
|
|
805
819
|
}
|
|
806
820
|
|
|
807
821
|
return true;
|
|
@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
897
911
|
|
|
898
912
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
899
913
|
|
|
900
|
-
if (params.reranking) {
|
|
901
|
-
bool ok = true;
|
|
902
|
-
|
|
903
|
-
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
|
904
|
-
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
|
905
|
-
ok = false;
|
|
906
|
-
}
|
|
907
|
-
|
|
908
|
-
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
909
|
-
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
|
910
|
-
|
|
911
|
-
if (!has_eos && !has_sep) {
|
|
912
|
-
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
|
913
|
-
ok = false;
|
|
914
|
-
} else if (!has_eos) {
|
|
915
|
-
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
|
916
|
-
} else if (!has_sep) {
|
|
917
|
-
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
|
918
|
-
ok = false;
|
|
919
|
-
}
|
|
920
|
-
|
|
921
|
-
if (!ok) {
|
|
922
|
-
llama_model_free(model);
|
|
923
|
-
|
|
924
|
-
return iparams;
|
|
925
|
-
}
|
|
926
|
-
}
|
|
927
|
-
|
|
928
914
|
auto cparams = common_context_params_to_llama(params);
|
|
929
915
|
|
|
930
916
|
llama_context * lctx = llama_init_from_model(model, cparams);
|
|
@@ -934,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
934
920
|
return iparams;
|
|
935
921
|
}
|
|
936
922
|
|
|
937
|
-
if (params.ctx_shift && !
|
|
923
|
+
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
|
|
938
924
|
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
|
|
939
925
|
params.ctx_shift = false;
|
|
940
926
|
}
|
|
@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
966
952
|
}
|
|
967
953
|
}
|
|
968
954
|
|
|
955
|
+
if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
|
|
956
|
+
bool ok = true;
|
|
957
|
+
|
|
958
|
+
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
|
959
|
+
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
|
960
|
+
ok = false;
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
964
|
+
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
|
965
|
+
|
|
966
|
+
if (!has_eos && !has_sep) {
|
|
967
|
+
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
|
968
|
+
ok = false;
|
|
969
|
+
} else if (!has_eos) {
|
|
970
|
+
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
|
971
|
+
} else if (!has_sep) {
|
|
972
|
+
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
|
973
|
+
ok = false;
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
if (!ok) {
|
|
977
|
+
llama_free(lctx);
|
|
978
|
+
llama_model_free(model);
|
|
979
|
+
|
|
980
|
+
return iparams;
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
|
|
969
984
|
// load and optionally apply lora adapters
|
|
970
985
|
for (auto & la : params.lora_adapters) {
|
|
971
986
|
llama_adapter_lora_ptr lora;
|
|
@@ -1041,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
1041
1056
|
if (llama_model_has_decoder(model)) {
|
|
1042
1057
|
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
|
1043
1058
|
}
|
|
1044
|
-
|
|
1059
|
+
llama_memory_clear(llama_get_memory(lctx), true);
|
|
1045
1060
|
llama_synchronize(lctx);
|
|
1046
1061
|
llama_perf_context_reset(lctx);
|
|
1047
1062
|
llama_set_warmup(lctx, false);
|
|
@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
|
|
1143
1158
|
cparams.op_offload = !params.no_op_offload;
|
|
1144
1159
|
cparams.swa_full = params.swa_full;
|
|
1145
1160
|
|
|
1146
|
-
if (params.reranking) {
|
|
1147
|
-
cparams.embeddings = true;
|
|
1148
|
-
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
1149
|
-
}
|
|
1150
|
-
|
|
1151
1161
|
cparams.type_k = params.cache_type_k;
|
|
1152
1162
|
cparams.type_v = params.cache_type_v;
|
|
1153
1163
|
|
|
@@ -1280,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
|
|
|
1280
1290
|
int n_tokens = text.length() + 2 * add_special;
|
|
1281
1291
|
std::vector<llama_token> result(n_tokens);
|
|
1282
1292
|
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
1293
|
+
if (n_tokens == std::numeric_limits<int32_t>::min()) {
|
|
1294
|
+
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
|
|
1295
|
+
}
|
|
1283
1296
|
if (n_tokens < 0) {
|
|
1284
1297
|
result.resize(-n_tokens);
|
|
1285
1298
|
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
@@ -199,6 +199,9 @@ struct common_params_speculative {
|
|
|
199
199
|
float p_split = 0.1f; // speculative decoding split probability
|
|
200
200
|
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
|
201
201
|
|
|
202
|
+
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
|
203
|
+
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
|
204
|
+
|
|
202
205
|
struct cpu_params cpuparams;
|
|
203
206
|
struct cpu_params cpuparams_batch;
|
|
204
207
|
|
|
@@ -215,7 +218,8 @@ struct common_params_vocoder {
|
|
|
215
218
|
|
|
216
219
|
enum common_reasoning_format {
|
|
217
220
|
COMMON_REASONING_FORMAT_NONE,
|
|
218
|
-
|
|
221
|
+
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
|
222
|
+
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
|
219
223
|
};
|
|
220
224
|
|
|
221
225
|
struct common_params {
|
|
@@ -354,7 +358,7 @@ struct common_params {
|
|
|
354
358
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
355
359
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
356
360
|
std::string embd_sep = "\n"; // separator of embeddings
|
|
357
|
-
|
|
361
|
+
std::string cls_sep = "\t"; // separator of classification sequences
|
|
358
362
|
|
|
359
363
|
// server params
|
|
360
364
|
int32_t port = 8080; // server listens on this network port
|
|
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
|
|
41
41
|
return result;
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
|
|
45
|
-
class string_view {
|
|
46
|
-
const std::string & _str;
|
|
47
|
-
const size_t _start;
|
|
48
|
-
const size_t _end;
|
|
49
|
-
public:
|
|
50
|
-
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
|
|
51
|
-
|
|
52
|
-
size_t size() const {
|
|
53
|
-
return _end - _start;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
size_t length() const {
|
|
57
|
-
return size();
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
operator std::string() const {
|
|
61
|
-
return str();
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
std::string str() const {
|
|
65
|
-
return _str.substr(_start, _end - _start);
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
string_view substr(size_t pos, size_t len = std::string::npos) const {
|
|
69
|
-
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
char operator[](size_t pos) const {
|
|
73
|
-
auto index = _start + pos;
|
|
74
|
-
if (index >= _end) {
|
|
75
|
-
throw std::out_of_range("string_view index out of range");
|
|
76
|
-
}
|
|
77
|
-
return _str[_start + pos];
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
bool operator==(const string_view & other) const {
|
|
81
|
-
std::string this_str = *this;
|
|
82
|
-
std::string other_str = other;
|
|
83
|
-
return this_str == other_str;
|
|
84
|
-
}
|
|
85
|
-
};
|
|
86
|
-
|
|
87
44
|
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
|
|
88
45
|
auto has_min = min_value != std::numeric_limits<int>::min();
|
|
89
46
|
auto has_max = max_value != std::numeric_limits<int>::max();
|
|
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
|
|
|
112
69
|
}
|
|
113
70
|
out << "}";
|
|
114
71
|
};
|
|
115
|
-
std::function<void(const string_view &, const string_view &)> uniform_range =
|
|
116
|
-
[&](const string_view & from, const string_view & to) {
|
|
72
|
+
std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
|
|
73
|
+
[&](const std::string_view & from, const std::string_view & to) {
|
|
117
74
|
size_t i = 0;
|
|
118
75
|
while (i < from.length() && i < to.length() && from[i] == to[i]) {
|
|
119
76
|
i++;
|
|
120
77
|
}
|
|
121
78
|
if (i > 0) {
|
|
122
|
-
out << "\"" << from.substr(0, i)
|
|
79
|
+
out << "\"" << from.substr(0, i) << "\"";
|
|
123
80
|
}
|
|
124
81
|
if (i < from.length() && i < to.length()) {
|
|
125
82
|
if (i > 0) {
|
|
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
|
|
|
144
144
|
auto & smpl = spec->smpl;
|
|
145
145
|
auto & prompt = spec->prompt;
|
|
146
146
|
|
|
147
|
+
auto * mem = llama_get_memory(ctx);
|
|
148
|
+
|
|
147
149
|
int reuse_i = 0;
|
|
148
150
|
int reuse_n = 0;
|
|
149
151
|
|
|
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
|
|
|
173
175
|
result.reserve(params.n_draft);
|
|
174
176
|
|
|
175
177
|
if (reuse_n == 0) {
|
|
176
|
-
|
|
178
|
+
llama_memory_clear(mem, false);
|
|
177
179
|
|
|
178
180
|
prompt.clear();
|
|
179
181
|
} else {
|
|
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
|
|
|
192
194
|
}
|
|
193
195
|
|
|
194
196
|
if (reuse_i > 0) {
|
|
195
|
-
|
|
196
|
-
|
|
197
|
+
llama_memory_seq_rm (mem, 0, 0, reuse_i);
|
|
198
|
+
llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
|
|
197
199
|
|
|
198
200
|
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
|
|
199
201
|
}
|
|
200
202
|
|
|
201
203
|
if (reuse_n < (int) prompt.size()) {
|
|
202
|
-
|
|
204
|
+
llama_memory_seq_rm (mem, 0, reuse_n, -1);
|
|
203
205
|
|
|
204
206
|
prompt.erase(prompt.begin() + reuse_n, prompt.end());
|
|
205
207
|
}
|