@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -61,7 +61,10 @@ extern "C" {
|
|
|
61
61
|
struct llama_model;
|
|
62
62
|
struct llama_context;
|
|
63
63
|
struct llama_sampler;
|
|
64
|
-
|
|
64
|
+
|
|
65
|
+
typedef struct llama_memory_i * llama_memory_t;
|
|
66
|
+
|
|
67
|
+
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
|
|
65
68
|
|
|
66
69
|
typedef int32_t llama_pos;
|
|
67
70
|
typedef int32_t llama_token;
|
|
@@ -240,18 +243,21 @@ extern "C" {
|
|
|
240
243
|
|
|
241
244
|
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
|
242
245
|
|
|
243
|
-
// Input data for llama_decode
|
|
246
|
+
// Input data for llama_encode/llama_decode
|
|
244
247
|
// A llama_batch object can contain input about one or many sequences
|
|
245
248
|
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
|
246
249
|
//
|
|
247
250
|
// - token : the token ids of the input (used when embd is NULL)
|
|
248
251
|
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
|
249
252
|
// - pos : the positions of the respective token in the sequence
|
|
250
|
-
// (if set to NULL, the token position will be tracked automatically by llama_decode)
|
|
253
|
+
// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
|
|
251
254
|
// - seq_id : the sequence to which the respective token belongs
|
|
252
255
|
// (if set to NULL, the sequence ID will be assumed to be 0)
|
|
253
256
|
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
|
254
|
-
// (if set to NULL
|
|
257
|
+
// (if set to NULL:
|
|
258
|
+
// - if embeddings: all tokens are output
|
|
259
|
+
// - if not: only the last token is output
|
|
260
|
+
// )
|
|
255
261
|
//
|
|
256
262
|
typedef struct llama_batch {
|
|
257
263
|
int32_t n_tokens;
|
|
@@ -259,8 +265,8 @@ extern "C" {
|
|
|
259
265
|
llama_token * token;
|
|
260
266
|
float * embd;
|
|
261
267
|
llama_pos * pos;
|
|
262
|
-
int32_t * n_seq_id;
|
|
263
|
-
llama_seq_id ** seq_id;
|
|
268
|
+
int32_t * n_seq_id;
|
|
269
|
+
llama_seq_id ** seq_id;
|
|
264
270
|
int8_t * logits; // TODO: rename this to "output"
|
|
265
271
|
} llama_batch;
|
|
266
272
|
|
|
@@ -384,6 +390,7 @@ extern "C" {
|
|
|
384
390
|
void * imatrix; // pointer to importance matrix data
|
|
385
391
|
void * kv_overrides; // pointer to vector containing overrides
|
|
386
392
|
void * tensor_types; // pointer to vector containing tensor types
|
|
393
|
+
void * prune_layers; // pointer to vector containing layer indices to prune
|
|
387
394
|
} llama_model_quantize_params;
|
|
388
395
|
|
|
389
396
|
typedef struct llama_logit_bias {
|
|
@@ -493,9 +500,11 @@ extern "C" {
|
|
|
493
500
|
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
|
|
494
501
|
|
|
495
502
|
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
|
|
496
|
-
LLAMA_API
|
|
503
|
+
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
|
|
497
504
|
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
|
|
498
505
|
|
|
506
|
+
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
|
|
507
|
+
|
|
499
508
|
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
|
500
509
|
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
|
501
510
|
|
|
@@ -509,6 +518,13 @@ extern "C" {
|
|
|
509
518
|
// Get the model's RoPE frequency scaling factor
|
|
510
519
|
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
|
511
520
|
|
|
521
|
+
// Returns the number of classifier outputs (only valid for classifier models)
|
|
522
|
+
// Undefined behavior for non-classifier models
|
|
523
|
+
LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
|
|
524
|
+
|
|
525
|
+
// Returns the label of the classifier output at index i (i < n_cls_out). Returns nullptr if no label is provided
|
|
526
|
+
LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
|
|
527
|
+
|
|
512
528
|
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
|
|
513
529
|
|
|
514
530
|
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
|
|
@@ -609,7 +625,81 @@ extern "C" {
|
|
|
609
625
|
int32_t il_end);
|
|
610
626
|
|
|
611
627
|
//
|
|
612
|
-
//
|
|
628
|
+
// Memory
|
|
629
|
+
//
|
|
630
|
+
|
|
631
|
+
// Clear the memory contents
|
|
632
|
+
// If data == true, the data buffers will also be cleared together with the metadata
|
|
633
|
+
LLAMA_API void llama_memory_clear(
|
|
634
|
+
llama_memory_t mem,
|
|
635
|
+
bool data);
|
|
636
|
+
|
|
637
|
+
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
638
|
+
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
|
639
|
+
// seq_id < 0 : match any sequence
|
|
640
|
+
// p0 < 0 : [0, p1]
|
|
641
|
+
// p1 < 0 : [p0, inf)
|
|
642
|
+
LLAMA_API bool llama_memory_seq_rm(
|
|
643
|
+
llama_memory_t mem,
|
|
644
|
+
llama_seq_id seq_id,
|
|
645
|
+
llama_pos p0,
|
|
646
|
+
llama_pos p1);
|
|
647
|
+
|
|
648
|
+
// Copy all tokens that belong to the specified sequence to another sequence
|
|
649
|
+
// p0 < 0 : [0, p1]
|
|
650
|
+
// p1 < 0 : [p0, inf)
|
|
651
|
+
LLAMA_API void llama_memory_seq_cp(
|
|
652
|
+
llama_memory_t mem,
|
|
653
|
+
llama_seq_id seq_id_src,
|
|
654
|
+
llama_seq_id seq_id_dst,
|
|
655
|
+
llama_pos p0,
|
|
656
|
+
llama_pos p1);
|
|
657
|
+
|
|
658
|
+
// Removes all tokens that do not belong to the specified sequence
|
|
659
|
+
LLAMA_API void llama_memory_seq_keep(
|
|
660
|
+
llama_memory_t mem,
|
|
661
|
+
llama_seq_id seq_id);
|
|
662
|
+
|
|
663
|
+
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
664
|
+
// p0 < 0 : [0, p1]
|
|
665
|
+
// p1 < 0 : [p0, inf)
|
|
666
|
+
LLAMA_API void llama_memory_seq_add(
|
|
667
|
+
llama_memory_t mem,
|
|
668
|
+
llama_seq_id seq_id,
|
|
669
|
+
llama_pos p0,
|
|
670
|
+
llama_pos p1,
|
|
671
|
+
llama_pos delta);
|
|
672
|
+
|
|
673
|
+
// Integer division of the positions by factor of `d > 1`
|
|
674
|
+
// p0 < 0 : [0, p1]
|
|
675
|
+
// p1 < 0 : [p0, inf)
|
|
676
|
+
LLAMA_API void llama_memory_seq_div(
|
|
677
|
+
llama_memory_t mem,
|
|
678
|
+
llama_seq_id seq_id,
|
|
679
|
+
llama_pos p0,
|
|
680
|
+
llama_pos p1,
|
|
681
|
+
int d);
|
|
682
|
+
|
|
683
|
+
// Returns the smallest position present in the memory for the specified sequence
|
|
684
|
+
// This is typically non-zero only for SWA caches
|
|
685
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
|
686
|
+
// Return -1 if the sequence is empty
|
|
687
|
+
LLAMA_API llama_pos llama_memory_seq_pos_min(
|
|
688
|
+
llama_memory_t mem,
|
|
689
|
+
llama_seq_id seq_id);
|
|
690
|
+
|
|
691
|
+
// Returns the largest position present in the memory for the specified sequence
|
|
692
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
|
693
|
+
// Return -1 if the sequence is empty
|
|
694
|
+
LLAMA_API llama_pos llama_memory_seq_pos_max(
|
|
695
|
+
llama_memory_t mem,
|
|
696
|
+
llama_seq_id seq_id);
|
|
697
|
+
|
|
698
|
+
// Check if the memory supports shifting
|
|
699
|
+
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
|
|
700
|
+
|
|
701
|
+
//
|
|
702
|
+
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
|
|
613
703
|
//
|
|
614
704
|
|
|
615
705
|
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
|
@@ -622,86 +712,95 @@ extern "C" {
|
|
|
622
712
|
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
|
623
713
|
|
|
624
714
|
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
|
625
|
-
LLAMA_API void llama_kv_self_clear(
|
|
626
|
-
|
|
715
|
+
DEPRECATED(LLAMA_API void llama_kv_self_clear(
|
|
716
|
+
struct llama_context * ctx),
|
|
717
|
+
"Use llama_memory_clear() instead");
|
|
627
718
|
|
|
628
719
|
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
629
720
|
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
|
630
721
|
// seq_id < 0 : match any sequence
|
|
631
722
|
// p0 < 0 : [0, p1]
|
|
632
723
|
// p1 < 0 : [p0, inf)
|
|
633
|
-
LLAMA_API bool llama_kv_self_seq_rm(
|
|
724
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
|
|
634
725
|
struct llama_context * ctx,
|
|
635
726
|
llama_seq_id seq_id,
|
|
636
727
|
llama_pos p0,
|
|
637
|
-
llama_pos p1)
|
|
728
|
+
llama_pos p1),
|
|
729
|
+
"Use llama_memory_seq_rm() instead");
|
|
638
730
|
|
|
639
731
|
// Copy all tokens that belong to the specified sequence to another sequence
|
|
640
732
|
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
|
641
733
|
// p0 < 0 : [0, p1]
|
|
642
734
|
// p1 < 0 : [p0, inf)
|
|
643
|
-
LLAMA_API void llama_kv_self_seq_cp(
|
|
735
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
|
|
644
736
|
struct llama_context * ctx,
|
|
645
737
|
llama_seq_id seq_id_src,
|
|
646
738
|
llama_seq_id seq_id_dst,
|
|
647
739
|
llama_pos p0,
|
|
648
|
-
llama_pos p1)
|
|
740
|
+
llama_pos p1),
|
|
741
|
+
"Use llama_memory_seq_cp() instead");
|
|
649
742
|
|
|
650
743
|
// Removes all tokens that do not belong to the specified sequence
|
|
651
|
-
LLAMA_API void llama_kv_self_seq_keep(
|
|
744
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
|
|
652
745
|
struct llama_context * ctx,
|
|
653
|
-
llama_seq_id seq_id)
|
|
746
|
+
llama_seq_id seq_id),
|
|
747
|
+
"Use llama_memory_seq_keep() instead");
|
|
654
748
|
|
|
655
749
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
656
750
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
657
751
|
// - lazily on next llama_decode()
|
|
658
752
|
// p0 < 0 : [0, p1]
|
|
659
753
|
// p1 < 0 : [p0, inf)
|
|
660
|
-
LLAMA_API void llama_kv_self_seq_add(
|
|
754
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
|
|
661
755
|
struct llama_context * ctx,
|
|
662
756
|
llama_seq_id seq_id,
|
|
663
757
|
llama_pos p0,
|
|
664
758
|
llama_pos p1,
|
|
665
|
-
llama_pos delta)
|
|
759
|
+
llama_pos delta),
|
|
760
|
+
"Use llama_memory_seq_add() instead");
|
|
666
761
|
|
|
667
762
|
// Integer division of the positions by factor of `d > 1`
|
|
668
763
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
669
764
|
// - lazily on next llama_decode()
|
|
670
765
|
// p0 < 0 : [0, p1]
|
|
671
766
|
// p1 < 0 : [p0, inf)
|
|
672
|
-
|
|
767
|
+
DEPRECATED(void llama_kv_self_seq_div(
|
|
673
768
|
struct llama_context * ctx,
|
|
674
769
|
llama_seq_id seq_id,
|
|
675
770
|
llama_pos p0,
|
|
676
771
|
llama_pos p1,
|
|
677
|
-
int d)
|
|
772
|
+
int d),
|
|
773
|
+
"Use llama_memory_seq_div() instead");
|
|
678
774
|
|
|
679
775
|
// Returns the smallest position present in the KV cache for the specified sequence
|
|
680
776
|
// This is typically non-zero only for SWA caches
|
|
681
777
|
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
682
778
|
// Return -1 if the sequence is empty
|
|
683
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
779
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
684
780
|
struct llama_context * ctx,
|
|
685
|
-
llama_seq_id seq_id)
|
|
781
|
+
llama_seq_id seq_id),
|
|
782
|
+
"Use llama_memory_seq_pos_min() instead");
|
|
686
783
|
|
|
687
784
|
// Returns the largest position present in the KV cache for the specified sequence
|
|
688
785
|
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
689
786
|
// Return -1 if the sequence is empty
|
|
690
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
787
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
691
788
|
struct llama_context * ctx,
|
|
692
|
-
llama_seq_id seq_id)
|
|
789
|
+
llama_seq_id seq_id),
|
|
790
|
+
"Use llama_memory_seq_pos_max() instead");
|
|
693
791
|
|
|
694
792
|
// Defragment the KV cache
|
|
695
793
|
// This will be applied:
|
|
696
794
|
// - lazily on next llama_decode()
|
|
697
|
-
LLAMA_API
|
|
795
|
+
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
|
|
698
796
|
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
|
699
797
|
|
|
700
798
|
// Check if the context supports KV cache shifting
|
|
701
|
-
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx)
|
|
799
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
|
|
800
|
+
"use llama_memory_can_shift() instead");
|
|
702
801
|
|
|
703
802
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
704
|
-
LLAMA_API
|
|
803
|
+
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
|
|
705
804
|
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
|
706
805
|
|
|
707
806
|
//
|
|
@@ -709,7 +808,7 @@ extern "C" {
|
|
|
709
808
|
//
|
|
710
809
|
|
|
711
810
|
// Returns the *actual* size in bytes of the state
|
|
712
|
-
// (logits, embedding and
|
|
811
|
+
// (logits, embedding and memory)
|
|
713
812
|
// Only use when saving the state, not when restoring it, otherwise the size may be too small.
|
|
714
813
|
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
|
|
715
814
|
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
|
|
@@ -765,12 +864,12 @@ extern "C" {
|
|
|
765
864
|
size_t n_token_count),
|
|
766
865
|
"use llama_state_save_file instead");
|
|
767
866
|
|
|
768
|
-
// Get the exact size needed to copy the
|
|
867
|
+
// Get the exact size needed to copy the state of a single sequence
|
|
769
868
|
LLAMA_API size_t llama_state_seq_get_size(
|
|
770
869
|
struct llama_context * ctx,
|
|
771
870
|
llama_seq_id seq_id);
|
|
772
871
|
|
|
773
|
-
// Copy the
|
|
872
|
+
// Copy the state of a single sequence into the specified buffer
|
|
774
873
|
LLAMA_API size_t llama_state_seq_get_data(
|
|
775
874
|
struct llama_context * ctx,
|
|
776
875
|
uint8_t * dst,
|
|
@@ -836,21 +935,23 @@ extern "C" {
|
|
|
836
935
|
// For encoder-decoder contexts, processes the batch using the encoder.
|
|
837
936
|
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
|
|
838
937
|
// 0 - success
|
|
839
|
-
// < 0 - error. the
|
|
938
|
+
// < 0 - error. the memory state is restored to the state before this call
|
|
840
939
|
LLAMA_API int32_t llama_encode(
|
|
841
940
|
struct llama_context * ctx,
|
|
842
941
|
struct llama_batch batch);
|
|
843
942
|
|
|
844
943
|
// Process a batch of tokens.
|
|
845
|
-
// Requires
|
|
944
|
+
// Requires the context to have a memory.
|
|
846
945
|
// For encoder-decoder contexts, processes the batch using the decoder.
|
|
847
946
|
// Positive return values do not mean a fatal error, but rather a warning.
|
|
848
|
-
// Upon
|
|
947
|
+
// Upon fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context
|
|
948
|
+
// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
|
|
949
|
+
// Upon other return values, the memory state is restored to the state before this call
|
|
849
950
|
// 0 - success
|
|
850
951
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
851
|
-
// 2 - aborted
|
|
952
|
+
// 2 - aborted (processed ubatches will remain in the context's memory)
|
|
852
953
|
// -1 - invalid input batch
|
|
853
|
-
// < -1 - error
|
|
954
|
+
// < -1 - fatal error (processed ubatches will remain in the context's memory)
|
|
854
955
|
LLAMA_API int32_t llama_decode(
|
|
855
956
|
struct llama_context * ctx,
|
|
856
957
|
struct llama_batch batch);
|
|
@@ -866,8 +967,8 @@ extern "C" {
|
|
|
866
967
|
// Get the number of threads used for prompt and batch processing (multiple token).
|
|
867
968
|
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
|
868
969
|
|
|
869
|
-
// Set whether the
|
|
870
|
-
//
|
|
970
|
+
// Set whether the context outputs embeddings or not
|
|
971
|
+
// TODO: rename to avoid confusion with llama_get_embeddings()
|
|
871
972
|
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
|
872
973
|
|
|
873
974
|
// Set whether to use causal attention or not
|
|
@@ -916,7 +1017,7 @@ extern "C" {
|
|
|
916
1017
|
|
|
917
1018
|
// Get the embeddings for a sequence id
|
|
918
1019
|
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
|
919
|
-
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[
|
|
1020
|
+
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
|
|
920
1021
|
// otherwise: float[n_embd] (1-dimensional)
|
|
921
1022
|
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
|
922
1023
|
|
|
@@ -946,6 +1047,7 @@ extern "C" {
|
|
|
946
1047
|
|
|
947
1048
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
|
948
1049
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
|
1050
|
+
LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
|
|
949
1051
|
|
|
950
1052
|
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
|
|
951
1053
|
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
|
|
@@ -989,6 +1091,7 @@ extern "C" {
|
|
|
989
1091
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
|
990
1092
|
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
|
991
1093
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
|
1094
|
+
/// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
|
|
992
1095
|
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
|
|
993
1096
|
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
|
994
1097
|
/// as plaintext. Does not insert a leading space.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/LlamaCppModel.cpp
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include <cstdlib>
|
|
6
6
|
#include <ctime>
|
|
7
7
|
#include <chrono>
|
|
8
|
+
#include <thread>
|
|
8
9
|
#include <fstream>
|
|
9
10
|
#include <iostream>
|
|
10
11
|
#include <random>
|
|
@@ -17,8 +18,8 @@
|
|
|
17
18
|
#include <memory>
|
|
18
19
|
|
|
19
20
|
// Include rn-completion integration
|
|
20
|
-
#include "rn-utils.
|
|
21
|
-
#include "rn-llama.
|
|
21
|
+
#include "rn-utils.h"
|
|
22
|
+
#include "rn-llama.h"
|
|
22
23
|
|
|
23
24
|
// Include llama.cpp headers
|
|
24
25
|
#include "llama.h"
|
|
@@ -50,33 +51,60 @@ LlamaCppModel::~LlamaCppModel() {
|
|
|
50
51
|
}
|
|
51
52
|
|
|
52
53
|
void LlamaCppModel::release() {
|
|
53
|
-
//
|
|
54
|
+
// Signal completion to stop and wait for it to finish gracefully
|
|
54
55
|
if (is_predicting_) {
|
|
55
56
|
should_stop_completion_ = true;
|
|
56
57
|
|
|
57
|
-
//
|
|
58
|
+
// Wait more patiently for completion to stop, with proper backoff
|
|
58
59
|
int retry = 0;
|
|
59
|
-
while (is_predicting_ && retry <
|
|
60
|
-
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
|
60
|
+
while (is_predicting_ && retry < 100) { // Increased from 10 to 100
|
|
61
|
+
std::this_thread::sleep_for(std::chrono::milliseconds(retry < 50 ? 10 : 50));
|
|
61
62
|
retry++;
|
|
62
63
|
}
|
|
64
|
+
|
|
65
|
+
// Force stop if still predicting
|
|
66
|
+
if (is_predicting_) {
|
|
67
|
+
is_predicting_ = false;
|
|
68
|
+
}
|
|
63
69
|
}
|
|
64
70
|
|
|
65
|
-
// Clean up our resources
|
|
71
|
+
// Clean up our resources with proper mutex protection
|
|
66
72
|
if (rn_ctx_) {
|
|
73
|
+
std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
|
|
74
|
+
|
|
75
|
+
// Clear KV cache before freeing context (following server.cpp pattern)
|
|
67
76
|
if (rn_ctx_->ctx) {
|
|
77
|
+
try {
|
|
78
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
79
|
+
} catch (...) {
|
|
80
|
+
// Ignore errors during cache clearing
|
|
81
|
+
}
|
|
82
|
+
|
|
68
83
|
llama_free(rn_ctx_->ctx);
|
|
69
84
|
rn_ctx_->ctx = nullptr;
|
|
70
85
|
}
|
|
71
86
|
|
|
87
|
+
// Free model after context (following server.cpp cleanup order)
|
|
72
88
|
if (rn_ctx_->model) {
|
|
73
89
|
llama_model_free(rn_ctx_->model);
|
|
74
90
|
rn_ctx_->model = nullptr;
|
|
75
91
|
}
|
|
76
92
|
|
|
93
|
+
// Clean up additional resources
|
|
94
|
+
rn_ctx_->vocab = nullptr; // This is owned by the model, so just null it
|
|
95
|
+
rn_ctx_->chat_templates.reset(); // Clean up chat templates
|
|
96
|
+
rn_ctx_->lora_adapters.clear(); // Clear LoRA adapters
|
|
97
|
+
|
|
98
|
+
// Reset state flags
|
|
99
|
+
rn_ctx_->model_loaded = false;
|
|
100
|
+
|
|
77
101
|
// Note: rn_ctx_ itself is owned by the module, so we don't delete it here
|
|
78
102
|
rn_ctx_ = nullptr;
|
|
79
103
|
}
|
|
104
|
+
|
|
105
|
+
// Reset our internal state
|
|
106
|
+
should_stop_completion_ = false;
|
|
107
|
+
is_predicting_ = false;
|
|
80
108
|
}
|
|
81
109
|
|
|
82
110
|
int32_t LlamaCppModel::getVocabSize() const {
|
|
@@ -133,6 +161,10 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
|
|
|
133
161
|
options.min_p = obj.getProperty(rt, "min_p").asNumber();
|
|
134
162
|
}
|
|
135
163
|
|
|
164
|
+
if (obj.hasProperty(rt, "presence_penalty") && !obj.getProperty(rt, "presence_penalty").isUndefined()) {
|
|
165
|
+
options.presence_penalty = obj.getProperty(rt, "presence_penalty").asNumber();
|
|
166
|
+
}
|
|
167
|
+
|
|
136
168
|
if (obj.hasProperty(rt, "n_predict") && !obj.getProperty(rt, "n_predict").isUndefined()) {
|
|
137
169
|
options.n_predict = obj.getProperty(rt, "n_predict").asNumber();
|
|
138
170
|
} else if (obj.hasProperty(rt, "max_tokens") && !obj.getProperty(rt, "max_tokens").isUndefined()) {
|
|
@@ -365,13 +397,14 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
365
397
|
std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
|
|
366
398
|
|
|
367
399
|
// Clear the context KV cache
|
|
368
|
-
|
|
400
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
369
401
|
|
|
370
402
|
// Store original sampling parameters to restore later
|
|
371
403
|
float orig_temp = rn_ctx_->params.sampling.temp;
|
|
372
404
|
float orig_top_p = rn_ctx_->params.sampling.top_p;
|
|
373
405
|
float orig_top_k = rn_ctx_->params.sampling.top_k;
|
|
374
406
|
float orig_min_p = rn_ctx_->params.sampling.min_p;
|
|
407
|
+
float orig_presence_penalty = rn_ctx_->params.sampling.penalty_present;
|
|
375
408
|
int orig_n_predict = rn_ctx_->params.n_predict;
|
|
376
409
|
|
|
377
410
|
// Set sampling parameters from options
|
|
@@ -379,6 +412,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
379
412
|
rn_ctx_->params.sampling.top_p = options.top_p;
|
|
380
413
|
rn_ctx_->params.sampling.top_k = options.top_k;
|
|
381
414
|
rn_ctx_->params.sampling.min_p = options.min_p;
|
|
415
|
+
rn_ctx_->params.sampling.penalty_present = options.presence_penalty;
|
|
382
416
|
rn_ctx_->params.n_predict = options.n_predict;
|
|
383
417
|
|
|
384
418
|
// Check for a partial callback
|
|
@@ -426,6 +460,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
|
|
|
426
460
|
rn_ctx_->params.sampling.top_p = orig_top_p;
|
|
427
461
|
rn_ctx_->params.sampling.top_k = orig_top_k;
|
|
428
462
|
rn_ctx_->params.sampling.min_p = orig_min_p;
|
|
463
|
+
rn_ctx_->params.sampling.penalty_present = orig_presence_penalty;
|
|
429
464
|
rn_ctx_->params.n_predict = orig_n_predict;
|
|
430
465
|
|
|
431
466
|
return result;
|
|
@@ -885,29 +920,28 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
|
|
|
885
920
|
}
|
|
886
921
|
|
|
887
922
|
// Clear the context KV cache to ensure clean embedding
|
|
888
|
-
|
|
923
|
+
llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
|
|
889
924
|
|
|
890
925
|
// Enable embedding mode
|
|
891
926
|
llama_set_embeddings(rn_ctx_->ctx, true);
|
|
892
927
|
|
|
893
|
-
//
|
|
928
|
+
// Create and populate batch using common_batch functions (following server.cpp pattern)
|
|
929
|
+
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
|
|
930
|
+
|
|
931
|
+
common_batch_clear(batch);
|
|
894
932
|
for (int i = 0; i < (int)tokens.size(); i++) {
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
/* token */ &token,
|
|
899
|
-
/* embd */ nullptr,
|
|
900
|
-
/* pos */ &i,
|
|
901
|
-
/* n_seq_id */ nullptr,
|
|
902
|
-
/* seq_id */ nullptr,
|
|
903
|
-
/* logits */ nullptr
|
|
904
|
-
};
|
|
905
|
-
|
|
906
|
-
if (llama_decode(rn_ctx_->ctx, batch) != 0) {
|
|
907
|
-
throw std::runtime_error("Failed to decode token for embedding");
|
|
908
|
-
}
|
|
933
|
+
// For embeddings, we typically need logits for the last token (for pooling)
|
|
934
|
+
bool needs_logits = (i == (int)tokens.size() - 1);
|
|
935
|
+
common_batch_add(batch, tokens[i], i, {0}, needs_logits);
|
|
909
936
|
}
|
|
910
937
|
|
|
938
|
+
if (llama_decode(rn_ctx_->ctx, batch) != 0) {
|
|
939
|
+
llama_batch_free(batch);
|
|
940
|
+
throw std::runtime_error("Failed to decode tokens for embedding");
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
llama_batch_free(batch);
|
|
944
|
+
|
|
911
945
|
// Get embedding size from the model
|
|
912
946
|
const int n_embd = llama_model_n_embd(rn_ctx_->model);
|
|
913
947
|
if (n_embd <= 0) {
|
package/cpp/LlamaCppModel.h
CHANGED
|
@@ -17,9 +17,9 @@
|
|
|
17
17
|
#include "chat.h" // For chat format handling and templates
|
|
18
18
|
#include "json-schema-to-grammar.h"
|
|
19
19
|
|
|
20
|
-
// Include rn-utils.
|
|
21
|
-
#include "rn-utils.
|
|
22
|
-
#include "rn-llama.
|
|
20
|
+
// Include rn-utils.h which has the CompletionResult definition
|
|
21
|
+
#include "rn-utils.h"
|
|
22
|
+
#include "rn-llama.h"
|
|
23
23
|
|
|
24
24
|
// Include json.hpp for json handling
|
|
25
25
|
#include "nlohmann/json.hpp"
|
package/cpp/PureCppImpl.cpp
CHANGED
package/cpp/PureCppImpl.h
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
#include <mutex>
|
|
10
10
|
|
|
11
11
|
// Include the header with the full definition of rn_llama_context
|
|
12
|
-
#include "rn-llama.
|
|
12
|
+
#include "rn-llama.h"
|
|
13
13
|
|
|
14
14
|
// Forward declarations for C++ only
|
|
15
15
|
struct llama_model;
|
|
@@ -49,7 +49,7 @@ private:
|
|
|
49
49
|
jsi::Object createModelObject(jsi::Runtime& runtime, struct rn_llama_context* rn_ctx);
|
|
50
50
|
|
|
51
51
|
// Context for the currently loaded model, if any.
|
|
52
|
-
// The actual definition of rn_llama_context should be in "rn-llama.
|
|
52
|
+
// The actual definition of rn_llama_context should be in "rn-llama.h"
|
|
53
53
|
std::unique_ptr<struct rn_llama_context> rn_ctx_;
|
|
54
54
|
|
|
55
55
|
// Mutex for thread safety when accessing rn_ctx_ or other shared resources
|
package/cpp/build-info.cpp
CHANGED
|
@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
|
|
|
89
89
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
|
90
90
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
|
|
91
91
|
|
|
92
|
+
if (NOT DEFINED LLAMA_BUILD_NUMBER)
|
|
93
|
+
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
|
94
|
+
endif()
|
|
95
|
+
if (NOT DEFINED LLAMA_BUILD_COMMIT)
|
|
96
|
+
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
|
97
|
+
endif()
|
|
98
|
+
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
|
|
99
|
+
|
|
92
100
|
# override ggml options
|
|
93
101
|
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
|
94
102
|
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
|
@@ -155,10 +163,17 @@ if (LLAMA_USE_SYSTEM_GGML)
|
|
|
155
163
|
endif()
|
|
156
164
|
|
|
157
165
|
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
|
|
166
|
+
set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
|
|
167
|
+
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
|
|
158
168
|
add_subdirectory(ggml)
|
|
159
169
|
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
|
160
170
|
endif()
|
|
161
171
|
|
|
172
|
+
if (MINGW)
|
|
173
|
+
# Target Windows 8 for PrefetchVirtualMemory
|
|
174
|
+
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
|
175
|
+
endif()
|
|
176
|
+
|
|
162
177
|
#
|
|
163
178
|
# build the library
|
|
164
179
|
#
|
|
@@ -199,10 +214,6 @@ endif()
|
|
|
199
214
|
include(GNUInstallDirs)
|
|
200
215
|
include(CMakePackageConfigHelpers)
|
|
201
216
|
|
|
202
|
-
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
|
203
|
-
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
|
204
|
-
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
|
205
|
-
|
|
206
217
|
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
|
207
218
|
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
|
208
219
|
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|