@novastera-oss/llamarn 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/PureCppImpl.cpp +9 -27
- package/cpp/SystemUtils.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +60 -5
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
package/README.md
CHANGED
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
> ⚠️ **WORK IN PROGRESS**: This package is currently under active development. Community help and feedback are greatly appreciated, especially in the areas mentioned in What Needs Help.
|
|
4
4
|
|
|
5
|
-
A React Native wrapper for llama.cpp focused on providing a simple, reliable way to run LLMs on mobile devices. This project was inspired by and builds upon the excellent work of [llama.rn](https://github.com/mybigday/llama.rn).
|
|
6
|
-
|
|
7
5
|
## Goals
|
|
8
6
|
|
|
9
7
|
* Provide a thin, reliable wrapper around llama.cpp for React Native
|
|
@@ -25,11 +23,12 @@ A React Native wrapper for llama.cpp focused on providing a simple, reliable way
|
|
|
25
23
|
|
|
26
24
|
We welcome contributions, especially in these areas:
|
|
27
25
|
|
|
28
|
-
1. **Android GPU Testing**:
|
|
29
|
-
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
26
|
+
1. **Android GPU Testing and Detection**:
|
|
27
|
+
* Development of reliable GPU detection mechanism in React Native
|
|
28
|
+
* Implementation of proper backend initialization verification
|
|
29
|
+
* Creation of robust testing framework for GPU availability
|
|
30
|
+
* Integration of OpenCL and Vulkan acceleration once detection is stable
|
|
31
|
+
* Performance benchmarking and optimization for mobile GPUs
|
|
33
32
|
|
|
34
33
|
2. **CI Improvements**:
|
|
35
34
|
* Adding automated Android GPU tests to CI pipeline
|
|
@@ -62,6 +61,73 @@ If you're interested in helping with any of these areas, please check our Contri
|
|
|
62
61
|
npm install @novastera-oss/llamarn
|
|
63
62
|
```
|
|
64
63
|
|
|
64
|
+
## Developer Setup
|
|
65
|
+
|
|
66
|
+
If you're contributing to the library or running the example project, follow these setup steps:
|
|
67
|
+
|
|
68
|
+
### Prerequisites
|
|
69
|
+
|
|
70
|
+
1. Clone the repository and navigate to the project directory
|
|
71
|
+
2. Ensure you have a React Native development environment set up for your target platform(s)
|
|
72
|
+
|
|
73
|
+
### Initial Setup
|
|
74
|
+
|
|
75
|
+
```sh
|
|
76
|
+
# Install dependencies
|
|
77
|
+
npm install
|
|
78
|
+
|
|
79
|
+
# Optional: only needed if you already have a previous version of llama.cpp checked out
|
|
80
|
+
npm run clean-llama
|
|
81
|
+
|
|
82
|
+
# Initialize llama.cpp submodule and dependencies
|
|
83
|
+
npm run setup-llama-cpp
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Android Development
|
|
87
|
+
|
|
88
|
+
1. Build the native Android libraries:
|
|
89
|
+
```sh
|
|
90
|
+
# Build the external native libraries for Android
|
|
91
|
+
./scripts/build_android_external.sh
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
2. Run the example project:
|
|
95
|
+
```sh
|
|
96
|
+
cd example
|
|
97
|
+
npm run android
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### iOS Development
|
|
101
|
+
|
|
102
|
+
1. Navigate to the example project and install iOS dependencies:
|
|
103
|
+
```sh
|
|
104
|
+
cd example
|
|
105
|
+
cd ios
|
|
106
|
+
|
|
107
|
+
# Install CocoaPods dependencies
|
|
108
|
+
bundle exec pod install
|
|
109
|
+
|
|
110
|
+
# Or if not using Bundler:
|
|
111
|
+
# pod install
|
|
112
|
+
|
|
113
|
+
cd ..
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
2. Run the example project:
|
|
117
|
+
```sh
|
|
118
|
+
npm run ios
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Development Notes
|
|
122
|
+
|
|
123
|
+
- **Android**: The `build_android_external.sh` script compiles llama.cpp for Android architectures and sets up the necessary native libraries. This step is required before running the Android example.
|
|
124
|
+
|
|
125
|
+
- **iOS**: The iOS setup uses CocoaPods to manage native dependencies. The prebuilt llama.cpp framework is included in the repository.
|
|
126
|
+
|
|
127
|
+
- **Troubleshooting**: If you encounter build issues, try cleaning your build cache:
|
|
128
|
+
- Android: `cd android && ./gradlew clean`
|
|
129
|
+
- iOS: `cd example/ios && rm -rf build && rm Podfile.lock && pod install`
|
|
130
|
+
|
|
65
131
|
## Basic Usage
|
|
66
132
|
|
|
67
133
|
### Simple Completion
|
|
@@ -216,20 +282,20 @@ The module accepts different path formats depending on the platform:
|
|
|
216
282
|
|
|
217
283
|
## About
|
|
218
284
|
|
|
219
|
-
|
|
285
|
+
This library is currently being used in [Novastera's](https://novastera.com) mobile application, demonstrating its capabilities in production environments. We're committed to enabling on-device LLM inference with no data leaving the user's device, helping developers build AI-powered applications that respect user privacy.
|
|
220
286
|
|
|
221
287
|
## License
|
|
222
288
|
|
|
223
|
-
Apache 2.0
|
|
289
|
+
Apache 2.0
|
|
224
290
|
|
|
225
291
|
## Acknowledgments
|
|
226
292
|
|
|
227
|
-
We
|
|
293
|
+
We acknowledge the following projects and communities that have contributed to the development of this library:
|
|
228
294
|
|
|
229
|
-
* **[mybigday/llama.rn](https://github.com/mybigday/llama.rn)** - A
|
|
295
|
+
* **[mybigday/llama.rn](https://github.com/mybigday/llama.rn)** - A foundational React Native binding for llama.cpp that demonstrated the viability of on-device LLM inference in mobile applications.
|
|
230
296
|
|
|
231
|
-
* **[
|
|
297
|
+
* **[ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp)** - The core C++ library that enables efficient LLM inference, serving as the foundation for this project.
|
|
232
298
|
|
|
233
|
-
*
|
|
299
|
+
* The test implementation of the Android Turbo Module ([react-native-pure-cpp-turbo-module-library](https://github.com/Zach-Dean-Attractions-io/react-native-pure-cpp-turbo-module-library)) provided valuable insights for our C++ integration.
|
|
234
300
|
|
|
235
|
-
|
|
301
|
+
These projects have significantly contributed to the open-source ecosystem, and we are committed to building upon their work while maintaining the same spirit of collaboration and innovation.
|
package/RNLlamaCpp.podspec
CHANGED
|
@@ -27,11 +27,17 @@ Pod::Spec.new do |s|
|
|
|
27
27
|
# If these are compiled directly, their paths need to be relative to the podspec (e.g., "cpp/llama.cpp/common/common.{h,cpp}")
|
|
28
28
|
"cpp/llama.cpp/common/common.{h,cpp}",
|
|
29
29
|
"cpp/llama.cpp/common/log.{h,cpp}",
|
|
30
|
+
"cpp/llama.cpp/common/arg.{h,cpp}",
|
|
30
31
|
"cpp/llama.cpp/common/sampling.{h,cpp}",
|
|
31
32
|
"cpp/llama.cpp/common/chat.{h,cpp}",
|
|
33
|
+
"cpp/llama.cpp/common/chat-parser.{h,cpp}",
|
|
34
|
+
"cpp/llama.cpp/common/regex-partial.{h,cpp}",
|
|
35
|
+
"cpp/llama.cpp/common/console.{h,cpp}",
|
|
36
|
+
"cpp/llama.cpp/common/json-partial.{h,cpp}",
|
|
32
37
|
"cpp/llama.cpp/common/ngram-cache.{h,cpp}",
|
|
33
38
|
"cpp/llama.cpp/common/json-schema-to-grammar.{h,cpp}",
|
|
34
39
|
"cpp/llama.cpp/common/speculative.{h,cpp}",
|
|
40
|
+
"cpp/llama.cpp/common/llguidance.{h,cpp}",
|
|
35
41
|
"cpp/llama.cpp/common/*.hpp",
|
|
36
42
|
"cpp/llama.cpp/common/minja/*.hpp"
|
|
37
43
|
|
|
@@ -52,16 +58,17 @@ Pod::Spec.new do |s|
|
|
|
52
58
|
"SWIFT_OPTIMIZATION_LEVEL" => "-O",
|
|
53
59
|
"ENABLE_BITCODE" => "NO",
|
|
54
60
|
"DEFINES_MODULE" => "YES",
|
|
55
|
-
"OTHER_LDFLAGS" => "$(inherited)",
|
|
61
|
+
"OTHER_LDFLAGS" => "$(inherited) -framework Accelerate -framework Foundation -framework Metal -framework MetalKit",
|
|
56
62
|
# These preprocessor macros ensure TurboModule registration works correctly
|
|
57
63
|
"GCC_PREPROCESSOR_DEFINITIONS" => ["$(inherited)", "RCT_NEW_ARCH_ENABLED=1",
|
|
58
64
|
"__STDC_FORMAT_MACROS=1", # For format macros in C++
|
|
59
65
|
"LLAMA_SHARED=1"] # For llama shared symbols
|
|
60
66
|
}
|
|
61
67
|
|
|
62
|
-
# Add user_target_xcconfig to propagate linker flags
|
|
68
|
+
# Add user_target_xcconfig to propagate linker flags and fix framework issues
|
|
63
69
|
s.user_target_xcconfig = {
|
|
64
|
-
"OTHER_LDFLAGS" => "$(inherited)"
|
|
70
|
+
"OTHER_LDFLAGS" => "$(inherited) -framework Accelerate -framework Foundation -framework Metal -framework MetalKit",
|
|
71
|
+
"FRAMEWORK_SEARCH_PATHS" => "$(inherited) $(PLATFORM_DIR)/Developer/Library/Frameworks"
|
|
65
72
|
}
|
|
66
73
|
|
|
67
74
|
# Install dependencies for Turbo Modules
|
package/android/CMakeLists.txt
CHANGED
|
@@ -57,7 +57,15 @@ add_library(
|
|
|
57
57
|
${CPP_DIR}/llama.cpp/common/common.cpp
|
|
58
58
|
${CPP_DIR}/llama.cpp/common/sampling.cpp
|
|
59
59
|
${CPP_DIR}/llama.cpp/common/chat.cpp
|
|
60
|
+
${CPP_DIR}/llama.cpp/common/chat-parser.cpp
|
|
61
|
+
${CPP_DIR}/llama.cpp/common/regex-partial.cpp
|
|
62
|
+
${CPP_DIR}/llama.cpp/common/arg.cpp
|
|
63
|
+
${CPP_DIR}/llama.cpp/common/console.cpp
|
|
64
|
+
${CPP_DIR}/llama.cpp/common/json-partial.cpp
|
|
65
|
+
${CPP_DIR}/llama.cpp/common/ngram-cache.cpp
|
|
60
66
|
${CPP_DIR}/llama.cpp/common/json-schema-to-grammar.cpp
|
|
67
|
+
${CPP_DIR}/llama.cpp/common/speculative.cpp
|
|
68
|
+
${CPP_DIR}/llama.cpp/common/llguidance.cpp
|
|
61
69
|
)
|
|
62
70
|
|
|
63
71
|
add_library(
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
#include "ggml.h"
|
|
5
5
|
#include "ggml-cpu.h"
|
|
6
6
|
#include "ggml-backend.h"
|
|
7
|
+
#include "ggml-opt.h"
|
|
7
8
|
|
|
8
9
|
#include <stddef.h>
|
|
9
10
|
#include <stdint.h>
|
|
@@ -344,7 +345,7 @@ extern "C" {
|
|
|
344
345
|
float yarn_beta_fast; // YaRN low correction dim
|
|
345
346
|
float yarn_beta_slow; // YaRN high correction dim
|
|
346
347
|
uint32_t yarn_orig_ctx; // YaRN original context size
|
|
347
|
-
float defrag_thold; // defragment the KV cache if holes/size > thold,
|
|
348
|
+
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
|
|
348
349
|
|
|
349
350
|
ggml_backend_sched_eval_callback cb_eval;
|
|
350
351
|
void * cb_eval_user_data;
|
|
@@ -360,10 +361,11 @@ extern "C" {
|
|
|
360
361
|
|
|
361
362
|
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
|
|
362
363
|
bool embeddings; // if true, extract embeddings (together with logits)
|
|
363
|
-
bool offload_kqv; //
|
|
364
|
-
bool flash_attn; //
|
|
365
|
-
bool no_perf; //
|
|
366
|
-
bool op_offload; //
|
|
364
|
+
bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
|
|
365
|
+
bool flash_attn; // use flash attention [EXPERIMENTAL]
|
|
366
|
+
bool no_perf; // disable performance timings (timings are measured when false)
|
|
367
|
+
bool op_offload; // offload host tensor operations to device
|
|
368
|
+
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
367
369
|
};
|
|
368
370
|
|
|
369
371
|
// model quantization parameters
|
|
@@ -445,6 +447,10 @@ extern "C" {
|
|
|
445
447
|
size_t n_paths,
|
|
446
448
|
struct llama_model_params params);
|
|
447
449
|
|
|
450
|
+
LLAMA_API void llama_model_save_to_file(
|
|
451
|
+
const struct llama_model * model,
|
|
452
|
+
const char * path_model);
|
|
453
|
+
|
|
448
454
|
DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
|
|
449
455
|
"use llama_model_free instead");
|
|
450
456
|
|
|
@@ -465,6 +471,7 @@ extern "C" {
|
|
|
465
471
|
LLAMA_API int64_t llama_time_us(void);
|
|
466
472
|
|
|
467
473
|
LLAMA_API size_t llama_max_devices(void);
|
|
474
|
+
LLAMA_API size_t llama_max_parallel_sequences(void);
|
|
468
475
|
|
|
469
476
|
LLAMA_API bool llama_supports_mmap (void);
|
|
470
477
|
LLAMA_API bool llama_supports_mlock (void);
|
|
@@ -602,71 +609,14 @@ extern "C" {
|
|
|
602
609
|
// KV cache
|
|
603
610
|
//
|
|
604
611
|
|
|
605
|
-
// TODO: start using struct llama_kv_cache
|
|
606
|
-
|
|
607
|
-
// Information associated with an individual cell in the KV cache view.
|
|
608
|
-
struct llama_kv_cache_view_cell {
|
|
609
|
-
// The position for this cell. Takes KV cache shifts into account.
|
|
610
|
-
// May be negative if the cell is not populated.
|
|
611
|
-
llama_pos pos;
|
|
612
|
-
};
|
|
613
|
-
|
|
614
|
-
// An updateable view of the KV cache.
|
|
615
|
-
struct llama_kv_cache_view {
|
|
616
|
-
// Number of KV cache cells. This will be the same as the context size.
|
|
617
|
-
int32_t n_cells;
|
|
618
|
-
|
|
619
|
-
// Maximum number of sequences that can exist in a cell. It's not an error
|
|
620
|
-
// if there are more sequences in a cell than this value, however they will
|
|
621
|
-
// not be visible in the view cells_sequences.
|
|
622
|
-
int32_t n_seq_max;
|
|
623
|
-
|
|
624
|
-
// Number of tokens in the cache. For example, if there are two populated
|
|
625
|
-
// cells, the first with 1 sequence id in it and the second with 2 sequence
|
|
626
|
-
// ids then you'll have 3 tokens.
|
|
627
|
-
int32_t token_count;
|
|
628
|
-
|
|
629
|
-
// Number of populated cache cells.
|
|
630
|
-
int32_t used_cells;
|
|
631
|
-
|
|
632
|
-
// Maximum contiguous empty slots in the cache.
|
|
633
|
-
int32_t max_contiguous;
|
|
634
|
-
|
|
635
|
-
// Index to the start of the max_contiguous slot range. Can be negative
|
|
636
|
-
// when cache is full.
|
|
637
|
-
int32_t max_contiguous_idx;
|
|
638
|
-
|
|
639
|
-
// Information for an individual cell.
|
|
640
|
-
struct llama_kv_cache_view_cell * cells;
|
|
641
|
-
|
|
642
|
-
// The sequences for each cell. There will be n_seq_max items per cell.
|
|
643
|
-
llama_seq_id * cells_sequences;
|
|
644
|
-
};
|
|
645
|
-
|
|
646
|
-
// Create an empty KV cache view. (use only for debugging purposes)
|
|
647
|
-
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
|
|
648
|
-
|
|
649
|
-
// Free a KV cache view. (use only for debugging purposes)
|
|
650
|
-
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
|
|
651
|
-
|
|
652
|
-
// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
|
|
653
|
-
// TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
|
|
654
|
-
LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
|
|
655
|
-
|
|
656
|
-
///
|
|
657
|
-
|
|
658
612
|
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
|
659
613
|
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
|
|
660
|
-
LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx)
|
|
661
|
-
|
|
662
|
-
DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
|
|
663
|
-
"use llama_kv_self_n_tokens instead");
|
|
614
|
+
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
|
|
615
|
+
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
|
664
616
|
|
|
665
617
|
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
|
|
666
|
-
LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx)
|
|
667
|
-
|
|
668
|
-
DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
|
|
669
|
-
"use llama_kv_self_used_cells instead");
|
|
618
|
+
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
|
|
619
|
+
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
|
670
620
|
|
|
671
621
|
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
|
672
622
|
LLAMA_API void llama_kv_self_clear(
|
|
@@ -725,10 +675,18 @@ extern "C" {
|
|
|
725
675
|
llama_pos p1,
|
|
726
676
|
int d);
|
|
727
677
|
|
|
678
|
+
// Returns the smallest position present in the KV cache for the specified sequence
|
|
679
|
+
// This is typically non-zero only for SWA caches
|
|
680
|
+
// Return -1 if the sequence is empty
|
|
681
|
+
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
682
|
+
struct llama_context * ctx,
|
|
683
|
+
llama_seq_id seq_id);
|
|
684
|
+
|
|
728
685
|
// Returns the largest position present in the KV cache for the specified sequence
|
|
686
|
+
// Return -1 if the sequence is empty
|
|
729
687
|
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
730
688
|
struct llama_context * ctx,
|
|
731
|
-
|
|
689
|
+
llama_seq_id seq_id);
|
|
732
690
|
|
|
733
691
|
// Defragment the KV cache
|
|
734
692
|
// This will be applied:
|
|
@@ -742,61 +700,6 @@ extern "C" {
|
|
|
742
700
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
743
701
|
LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
|
|
744
702
|
|
|
745
|
-
DEPRECATED(LLAMA_API void llama_kv_cache_clear(
|
|
746
|
-
struct llama_context * ctx),
|
|
747
|
-
"use llama_kv_self_clear instead");
|
|
748
|
-
|
|
749
|
-
DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
|
|
750
|
-
struct llama_context * ctx,
|
|
751
|
-
llama_seq_id seq_id,
|
|
752
|
-
llama_pos p0,
|
|
753
|
-
llama_pos p1),
|
|
754
|
-
"use llama_kv_self_seq_rm instead");
|
|
755
|
-
|
|
756
|
-
DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
|
|
757
|
-
struct llama_context * ctx,
|
|
758
|
-
llama_seq_id seq_id_src,
|
|
759
|
-
llama_seq_id seq_id_dst,
|
|
760
|
-
llama_pos p0,
|
|
761
|
-
llama_pos p1),
|
|
762
|
-
"use llama_kv_self_seq_cp instead");
|
|
763
|
-
|
|
764
|
-
DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
|
|
765
|
-
struct llama_context * ctx,
|
|
766
|
-
llama_seq_id seq_id),
|
|
767
|
-
"use llama_kv_self_seq_keep instead");
|
|
768
|
-
|
|
769
|
-
DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
|
|
770
|
-
struct llama_context * ctx,
|
|
771
|
-
llama_seq_id seq_id,
|
|
772
|
-
llama_pos p0,
|
|
773
|
-
llama_pos p1,
|
|
774
|
-
llama_pos delta),
|
|
775
|
-
"use llama_kv_self_seq_add instead");
|
|
776
|
-
|
|
777
|
-
DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
|
|
778
|
-
struct llama_context * ctx,
|
|
779
|
-
llama_seq_id seq_id,
|
|
780
|
-
llama_pos p0,
|
|
781
|
-
llama_pos p1,
|
|
782
|
-
int d),
|
|
783
|
-
"use llama_kv_self_seq_div instead");
|
|
784
|
-
|
|
785
|
-
DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
|
|
786
|
-
struct llama_context * ctx,
|
|
787
|
-
llama_seq_id seq_id),
|
|
788
|
-
"use llama_kv_self_seq_pos_max instead");
|
|
789
|
-
|
|
790
|
-
DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
|
|
791
|
-
"use llama_kv_self_defrag instead");
|
|
792
|
-
|
|
793
|
-
DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
|
|
794
|
-
"use llama_kv_self_can_shift instead");
|
|
795
|
-
|
|
796
|
-
DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
|
|
797
|
-
"use llama_kv_self_update instead");
|
|
798
|
-
|
|
799
|
-
|
|
800
703
|
//
|
|
801
704
|
// State / sessions
|
|
802
705
|
//
|
|
@@ -938,9 +841,12 @@ extern "C" {
|
|
|
938
841
|
// Requires KV cache.
|
|
939
842
|
// For encode-decoder contexts, processes the batch using the decoder.
|
|
940
843
|
// Positive return values does not mean a fatal error, but rather a warning.
|
|
941
|
-
//
|
|
942
|
-
//
|
|
943
|
-
//
|
|
844
|
+
// Upon non-zero return values, the KV cache state is restored to the state before this call
|
|
845
|
+
// 0 - success
|
|
846
|
+
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
847
|
+
// 2 - aborted
|
|
848
|
+
// -1 - invalid input batch
|
|
849
|
+
// < -1 - error
|
|
944
850
|
LLAMA_API int32_t llama_decode(
|
|
945
851
|
struct llama_context * ctx,
|
|
946
852
|
struct llama_batch batch);
|
|
@@ -1433,6 +1339,37 @@ extern "C" {
|
|
|
1433
1339
|
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
|
1434
1340
|
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
|
1435
1341
|
|
|
1342
|
+
//
|
|
1343
|
+
// training
|
|
1344
|
+
//
|
|
1345
|
+
|
|
1346
|
+
// function that returns whether or not a given tensor contains trainable parameters
|
|
1347
|
+
typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
|
|
1348
|
+
|
|
1349
|
+
// always returns true
|
|
1350
|
+
LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
|
|
1351
|
+
|
|
1352
|
+
struct llama_opt_params {
|
|
1353
|
+
uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
|
|
1354
|
+
|
|
1355
|
+
llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
|
|
1356
|
+
void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
|
|
1357
|
+
|
|
1358
|
+
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
|
1359
|
+
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
|
1360
|
+
};
|
|
1361
|
+
|
|
1362
|
+
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
|
|
1363
|
+
|
|
1364
|
+
LLAMA_API void llama_opt_epoch(
|
|
1365
|
+
struct llama_context * lctx,
|
|
1366
|
+
ggml_opt_dataset_t dataset,
|
|
1367
|
+
ggml_opt_result_t result_train,
|
|
1368
|
+
ggml_opt_result_t result_eval,
|
|
1369
|
+
int64_t idata_split,
|
|
1370
|
+
ggml_opt_epoch_callback callback_train,
|
|
1371
|
+
ggml_opt_epoch_callback callback_eval);
|
|
1372
|
+
|
|
1436
1373
|
#ifdef __cplusplus
|
|
1437
1374
|
}
|
|
1438
1375
|
#endif
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/PureCppImpl.cpp
CHANGED
|
@@ -157,7 +157,7 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
|
|
|
157
157
|
SystemUtils::setIfExists(runtime, options, "use_mmap", params.use_mmap);
|
|
158
158
|
SystemUtils::setIfExists(runtime, options, "use_mlock", params.use_mlock);
|
|
159
159
|
SystemUtils::setIfExists(runtime, options, "use_jinja", params.use_jinja);
|
|
160
|
-
|
|
160
|
+
|
|
161
161
|
// Extract threading parameters (preserve custom thread logic)
|
|
162
162
|
int n_threads = 0; // 0 = auto
|
|
163
163
|
if (options.hasProperty(runtime, "n_threads")) {
|
|
@@ -283,42 +283,24 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
|
|
|
283
283
|
// Set additional fields
|
|
284
284
|
rn_params.use_jinja = params.use_jinja;
|
|
285
285
|
rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
286
|
-
//
|
|
287
|
-
rn_params.chat_format = COMMON_CHAT_FORMAT_GENERIC;
|
|
286
|
+
// Don't force a specific chat format - let the template system auto-detect based on model and tools
|
|
287
|
+
// rn_params.chat_format = COMMON_CHAT_FORMAT_GENERIC;
|
|
288
288
|
// Now assign to the context
|
|
289
289
|
rn_ctx_->params = rn_params;
|
|
290
290
|
|
|
291
|
-
|
|
291
|
+
rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, params.chat_template);
|
|
292
292
|
try {
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
SystemUtils::setIfExists(runtime, options, "bos_token", bos_token_override);
|
|
298
|
-
SystemUtils::setIfExists(runtime, options, "eos_token", eos_token_override);
|
|
299
|
-
|
|
300
|
-
rn_ctx_->chat_templates = common_chat_templates_init(
|
|
301
|
-
rn_ctx_->model,
|
|
302
|
-
params.chat_template,
|
|
303
|
-
bos_token_override,
|
|
304
|
-
eos_token_override
|
|
305
|
-
);
|
|
306
|
-
|
|
307
|
-
if (!rn_ctx_->chat_templates) {
|
|
308
|
-
throw std::runtime_error("Failed to initialize chat templates");
|
|
309
|
-
}
|
|
310
|
-
} catch (const std::exception& e) {
|
|
311
|
-
// Log warning and fallback to chatml
|
|
312
|
-
fprintf(stderr, "Warning: Failed to initialize chat template: %s. Falling back to chatml.\n", e.what());
|
|
293
|
+
common_chat_format_example(rn_ctx_->chat_templates.get(), params.use_jinja);
|
|
294
|
+
} catch (const std::exception & e) {
|
|
295
|
+
// Fallback to chatml if the original template parsing fails
|
|
313
296
|
rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, "chatml");
|
|
314
|
-
if (!rn_ctx_->chat_templates) {
|
|
315
|
-
throw std::runtime_error("Failed to initialize fallback chatml template");
|
|
316
|
-
}
|
|
317
297
|
}
|
|
298
|
+
|
|
318
299
|
|
|
319
300
|
// Create the model object and return it
|
|
320
301
|
return createModelObject(runtime, rn_ctx_.get());
|
|
321
302
|
} catch (const std::exception& e) {
|
|
303
|
+
// We can keep this top-level error log as it's for initialization failure
|
|
322
304
|
fprintf(stderr, "initLlama error: %s\n", e.what());
|
|
323
305
|
throw jsi::JSError(runtime, e.what());
|
|
324
306
|
}
|
package/cpp/SystemUtils.h
CHANGED
|
@@ -44,8 +44,8 @@ public:
|
|
|
44
44
|
* Helper functions to easily set values from a JSI object if the property exists.
|
|
45
45
|
* Returns true if the property was found and the value was set.
|
|
46
46
|
*/
|
|
47
|
-
// Template for
|
|
48
|
-
template<typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value>::type>
|
|
47
|
+
// Template for numeric types (excluding bool so bool specialization is used)
|
|
48
|
+
template<typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value && !std::is_same<T, bool>::value>::type>
|
|
49
49
|
static bool setIfExists(jsi::Runtime& rt, const jsi::Object& options, const std::string& key, T& outValue) {
|
|
50
50
|
if (options.hasProperty(rt, key.c_str())) {
|
|
51
51
|
jsi::Value val = options.getProperty(rt, key.c_str());
|
package/cpp/build-info.cpp
CHANGED
package/cpp/llama.cpp/README.md
CHANGED
|
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
|
|
|
37
37
|
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
|
|
38
38
|
- AVX, AVX2, AVX512 and AMX support for x86 architectures
|
|
39
39
|
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
|
|
40
|
-
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads
|
|
40
|
+
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
|
|
41
41
|
- Vulkan and SYCL backend support
|
|
42
42
|
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
|
43
43
|
|
|
@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
237
237
|
| [BLAS](docs/build.md#blas-build) | All |
|
|
238
238
|
| [BLIS](docs/backend/BLIS.md) | All |
|
|
239
239
|
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
|
240
|
-
| [MUSA](docs/build.md#musa) | Moore Threads
|
|
240
|
+
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
|
|
241
241
|
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
|
242
242
|
| [HIP](docs/build.md#hip) | AMD GPU |
|
|
243
243
|
| [Vulkan](docs/build.md#vulkan) | GPU |
|
|
@@ -572,4 +572,12 @@ automatically. For example:
|
|
|
572
572
|
$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
|
|
573
573
|
```
|
|
574
574
|
|
|
575
|
-
##
|
|
575
|
+
## Dependencies
|
|
576
|
+
|
|
577
|
+
- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
|
|
578
|
+
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
|
|
579
|
+
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
|
|
580
|
+
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
|
|
581
|
+
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
|
|
582
|
+
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
|
|
583
|
+
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
|
|
@@ -117,6 +117,7 @@ setup_framework_structure() {
|
|
|
117
117
|
# Copy all required headers (common for all platforms)
|
|
118
118
|
cp include/llama.h ${header_path}
|
|
119
119
|
cp ggml/include/ggml.h ${header_path}
|
|
120
|
+
cp ggml/include/ggml-opt.h ${header_path}
|
|
120
121
|
cp ggml/include/ggml-alloc.h ${header_path}
|
|
121
122
|
cp ggml/include/ggml-backend.h ${header_path}
|
|
122
123
|
cp ggml/include/ggml-metal.h ${header_path}
|
|
@@ -60,12 +60,16 @@ add_library(${TARGET} STATIC
|
|
|
60
60
|
base64.hpp
|
|
61
61
|
chat.cpp
|
|
62
62
|
chat.h
|
|
63
|
+
chat-parser.cpp
|
|
64
|
+
chat-parser.h
|
|
63
65
|
common.cpp
|
|
64
66
|
common.h
|
|
65
67
|
console.cpp
|
|
66
68
|
console.h
|
|
67
69
|
json-schema-to-grammar.cpp
|
|
68
70
|
json.hpp
|
|
71
|
+
json-partial.h
|
|
72
|
+
json-partial.cpp
|
|
69
73
|
llguidance.cpp
|
|
70
74
|
log.cpp
|
|
71
75
|
log.h
|
|
@@ -73,6 +77,8 @@ add_library(${TARGET} STATIC
|
|
|
73
77
|
minja/minja.hpp
|
|
74
78
|
ngram-cache.cpp
|
|
75
79
|
ngram-cache.h
|
|
80
|
+
regex-partial.cpp
|
|
81
|
+
regex-partial.h
|
|
76
82
|
sampling.cpp
|
|
77
83
|
sampling.h
|
|
78
84
|
speculative.cpp
|
|
@@ -119,8 +125,8 @@ if (LLAMA_LLGUIDANCE)
|
|
|
119
125
|
|
|
120
126
|
ExternalProject_Add(llguidance_ext
|
|
121
127
|
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
|
122
|
-
# v0.7.
|
|
123
|
-
GIT_TAG
|
|
128
|
+
# v0.7.20 (+ fix to build on GCC 15):
|
|
129
|
+
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
|
|
124
130
|
PREFIX ${CMAKE_BINARY_DIR}/llguidance
|
|
125
131
|
SOURCE_DIR ${LLGUIDANCE_SRC}
|
|
126
132
|
BUILD_IN_SOURCE TRUE
|