@novastera-oss/llamarn 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/PureCppImpl.cpp +9 -27
- package/cpp/SystemUtils.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +60 -5
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -96,6 +96,8 @@ enum llm_type {
|
|
|
96
96
|
LLM_TYPE_235B_A22B,
|
|
97
97
|
};
|
|
98
98
|
|
|
99
|
+
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
|
|
100
|
+
|
|
99
101
|
struct llama_layer_posnet {
|
|
100
102
|
// resnet
|
|
101
103
|
struct ggml_tensor * norm1 = nullptr;
|
|
@@ -396,7 +398,10 @@ struct llama_model {
|
|
|
396
398
|
|
|
397
399
|
const struct ggml_tensor * get_tensor(const char * name) const;
|
|
398
400
|
|
|
399
|
-
|
|
401
|
+
float get_rope_freq_base (const llama_cparams & cparams, int il) const;
|
|
402
|
+
float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
|
|
403
|
+
|
|
404
|
+
ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
|
|
400
405
|
|
|
401
406
|
// note: can mutate `cparams`
|
|
402
407
|
// TODO: move this to new llm_arch_model_i interface
|
|
@@ -14,6 +14,12 @@
|
|
|
14
14
|
#include <thread>
|
|
15
15
|
#include <unordered_map>
|
|
16
16
|
|
|
17
|
+
// Quantization types. Changes to this struct must be replicated in quantize.cpp
|
|
18
|
+
struct tensor_quantization {
|
|
19
|
+
std::string name;
|
|
20
|
+
ggml_type quant = GGML_TYPE_COUNT;
|
|
21
|
+
};
|
|
22
|
+
|
|
17
23
|
static void zeros(std::ofstream & file, size_t n) {
|
|
18
24
|
char zero = 0;
|
|
19
25
|
for (size_t i = 0; i < n; ++i) {
|
|
@@ -48,12 +54,6 @@ struct quantize_state_impl {
|
|
|
48
54
|
{}
|
|
49
55
|
};
|
|
50
56
|
|
|
51
|
-
// changes to this struct must be replicated in quantize.cpp
|
|
52
|
-
struct tensor_quantization {
|
|
53
|
-
std::string name;
|
|
54
|
-
ggml_type quant = GGML_TYPE_COUNT;
|
|
55
|
-
};
|
|
56
|
-
|
|
57
57
|
static void llama_tensor_dequantize_impl(
|
|
58
58
|
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
|
59
59
|
const size_t nelements, const int nthread
|
|
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
519
519
|
nthread = std::thread::hardware_concurrency();
|
|
520
520
|
}
|
|
521
521
|
|
|
522
|
-
// mmap consistently increases speed Linux, and also increases speed on Windows with
|
|
522
|
+
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
|
523
523
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
|
524
524
|
#if defined(__linux__) || defined(_WIN32)
|
|
525
525
|
constexpr bool use_mmap = true;
|
|
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
529
529
|
|
|
530
530
|
llama_model_kv_override * kv_overrides = nullptr;
|
|
531
531
|
if (params->kv_overrides) {
|
|
532
|
-
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
|
532
|
+
auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
|
533
533
|
kv_overrides = v->data();
|
|
534
534
|
}
|
|
535
535
|
|
|
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
796
796
|
// unless the user specifies a type
|
|
797
797
|
if (params->tensor_types) {
|
|
798
798
|
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
|
|
799
|
+
const std::string tensor_name(tensor->name);
|
|
799
800
|
for (const auto & [tname, qtype] : tensor_types) {
|
|
800
|
-
if (std::regex pattern(tname); std::regex_search(
|
|
801
|
-
if
|
|
802
|
-
LLAMA_LOG_DEBUG("(overriding %s
|
|
801
|
+
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
|
|
802
|
+
if (qtype != new_type) {
|
|
803
|
+
LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
|
|
804
|
+
new_type = qtype;
|
|
805
|
+
break; // if two or more types are specified for the tensor, first match wins
|
|
803
806
|
}
|
|
804
|
-
new_type = qtype;
|
|
805
|
-
break;
|
|
806
807
|
}
|
|
807
808
|
}
|
|
808
809
|
}
|
|
809
810
|
}
|
|
811
|
+
|
|
810
812
|
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
|
|
811
813
|
new_type = params->token_embedding_type;
|
|
812
814
|
}
|
|
@@ -798,7 +798,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
|
|
|
798
798
|
}
|
|
799
799
|
|
|
800
800
|
// if we have enough values the operation was a success
|
|
801
|
-
if (filtered_tokens.size() >= ctx->min_keep) {
|
|
801
|
+
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
|
|
802
802
|
memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
|
803
803
|
cur_p->size = filtered_tokens.size();
|
|
804
804
|
min_p_applied = true;
|
|
@@ -909,7 +909,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
|
|
|
909
909
|
cum_sum += cur_p->data[idx].p;
|
|
910
910
|
|
|
911
911
|
// Stop once the running sum is greater than typical and we have kept at least min_keep tokens
|
|
912
|
-
if (cum_sum > ctx->p && i >= ctx->min_keep - 1) {
|
|
912
|
+
if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) {
|
|
913
913
|
last_idx = i + 1;
|
|
914
914
|
break;
|
|
915
915
|
}
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
#include "llama-vocab.h"
|
|
2
2
|
|
|
3
|
+
#include "ggml.h"
|
|
4
|
+
#include "gguf.h"
|
|
3
5
|
#include "llama-impl.h"
|
|
4
6
|
#include "llama-model-loader.h"
|
|
5
7
|
|
|
@@ -833,7 +835,7 @@ struct llm_tokenizer_ugm_session {
|
|
|
833
835
|
}
|
|
834
836
|
|
|
835
837
|
// initialize score_sum to the lowest representable value so it will always be lower than sums of token scores
|
|
836
|
-
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -
|
|
838
|
+
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
|
|
837
839
|
// at the beginning tokenization score is zero
|
|
838
840
|
tokenization_results[0] = { vocab.token_unk(), 0, 0 };
|
|
839
841
|
|
|
@@ -865,7 +867,7 @@ struct llm_tokenizer_ugm_session {
|
|
|
865
867
|
const double challenger_score = current_best.score_sum + token_score;
|
|
866
868
|
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
|
867
869
|
if (challenger_score > current_champ.score_sum) {
|
|
868
|
-
struct best_tokenization challenger = { token_id, input_offset,
|
|
870
|
+
struct best_tokenization challenger = { token_id, input_offset, challenger_score };
|
|
869
871
|
current_champ = challenger;
|
|
870
872
|
}
|
|
871
873
|
}
|
|
@@ -879,7 +881,7 @@ struct llm_tokenizer_ugm_session {
|
|
|
879
881
|
prefix_offset = input_offset + n_utf8_code_units;
|
|
880
882
|
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
|
881
883
|
if (challenger_score > current_champ.score_sum) {
|
|
882
|
-
struct best_tokenization challenger = { vocab.token_unk(), input_offset,
|
|
884
|
+
struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
|
|
883
885
|
current_champ = challenger;
|
|
884
886
|
}
|
|
885
887
|
}
|
|
@@ -1005,7 +1007,7 @@ private:
|
|
|
1005
1007
|
struct best_tokenization {
|
|
1006
1008
|
llama_token token_id;
|
|
1007
1009
|
size_t input_offset;
|
|
1008
|
-
|
|
1010
|
+
double score_sum;
|
|
1009
1011
|
};
|
|
1010
1012
|
|
|
1011
1013
|
struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
|
|
@@ -1234,6 +1236,9 @@ struct fragment_buffer_variant {
|
|
|
1234
1236
|
struct llama_vocab::impl {
|
|
1235
1237
|
uint32_t n_token_types = 0; // for BERT-style token types
|
|
1236
1238
|
|
|
1239
|
+
std::string tokenizer_model;
|
|
1240
|
+
std::string tokenizer_pre;
|
|
1241
|
+
|
|
1237
1242
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
|
1238
1243
|
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
1239
1244
|
|
|
@@ -1369,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1369
1374
|
|
|
1370
1375
|
// determine vocab type
|
|
1371
1376
|
{
|
|
1372
|
-
std::string tokenizer_model;
|
|
1373
|
-
std::string tokenizer_pre;
|
|
1374
|
-
|
|
1375
1377
|
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
|
1376
1378
|
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
|
1377
1379
|
|
|
@@ -1466,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1466
1468
|
|
|
1467
1469
|
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
|
1468
1470
|
if (precompiled_charsmap_keyidx != -1) {
|
|
1469
|
-
|
|
1471
|
+
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
|
1472
|
+
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
|
|
1473
|
+
|
|
1474
|
+
const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
|
|
1470
1475
|
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
|
1471
1476
|
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
|
1472
1477
|
#ifdef IS_BIG_ENDIAN
|
|
@@ -2789,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2789
2794
|
pimpl->load(ml, kv);
|
|
2790
2795
|
}
|
|
2791
2796
|
|
|
2797
|
+
std::string llama_vocab::get_tokenizer_model() const {
|
|
2798
|
+
return pimpl->tokenizer_model;
|
|
2799
|
+
}
|
|
2800
|
+
|
|
2801
|
+
std::string llama_vocab::get_tokenizer_pre() const {
|
|
2802
|
+
return pimpl->tokenizer_pre;
|
|
2803
|
+
}
|
|
2804
|
+
|
|
2792
2805
|
enum llama_vocab_type llama_vocab::get_type() const {
|
|
2793
2806
|
return pimpl->type;
|
|
2794
2807
|
}
|
|
@@ -3011,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
|
|
|
3011
3024
|
return it->second;
|
|
3012
3025
|
}
|
|
3013
3026
|
|
|
3027
|
+
std::vector<std::string> llama_vocab::get_bpe_merges() const {
|
|
3028
|
+
std::vector<std::string> result(pimpl->bpe_ranks.size());
|
|
3029
|
+
|
|
3030
|
+
for (const auto & pair : pimpl->bpe_ranks) {
|
|
3031
|
+
result[pair.second] = pair.first.first + " " + pair.first.second;
|
|
3032
|
+
}
|
|
3033
|
+
|
|
3034
|
+
return result;
|
|
3035
|
+
}
|
|
3036
|
+
|
|
3037
|
+
std::vector<char> llama_vocab::get_precompiled_charsmap() const {
|
|
3038
|
+
return pimpl->precompiled_charsmap;
|
|
3039
|
+
}
|
|
3040
|
+
|
|
3014
3041
|
int32_t llama_vocab::tokenize(
|
|
3015
3042
|
const char * text,
|
|
3016
3043
|
int32_t text_len,
|
|
@@ -21,6 +21,9 @@ struct llama_vocab {
|
|
|
21
21
|
|
|
22
22
|
void load(llama_model_loader & ml, const LLM_KV & kv);
|
|
23
23
|
|
|
24
|
+
std::string get_tokenizer_model() const;
|
|
25
|
+
std::string get_tokenizer_pre() const;
|
|
26
|
+
|
|
24
27
|
enum llama_vocab_type get_type() const;
|
|
25
28
|
enum llama_vocab_pre_type get_pre_type() const;
|
|
26
29
|
|
|
@@ -80,6 +83,9 @@ struct llama_vocab {
|
|
|
80
83
|
int max_token_len() const;
|
|
81
84
|
|
|
82
85
|
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
|
86
|
+
std::vector<std::string> get_bpe_merges() const;
|
|
87
|
+
|
|
88
|
+
std::vector<char> get_precompiled_charsmap() const;
|
|
83
89
|
|
|
84
90
|
int32_t tokenize(
|
|
85
91
|
const char * text,
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
#include "llama-mmap.h"
|
|
5
5
|
#include "llama-vocab.h"
|
|
6
6
|
#include "llama-model-loader.h"
|
|
7
|
+
#include "llama-model-saver.h"
|
|
7
8
|
#include "llama-model.h"
|
|
8
9
|
|
|
9
10
|
#include "ggml.h"
|
|
@@ -139,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
|
139
140
|
struct llama_model_params params) {
|
|
140
141
|
ggml_time_init();
|
|
141
142
|
|
|
143
|
+
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
|
144
|
+
LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
|
|
145
|
+
return nullptr;
|
|
146
|
+
}
|
|
147
|
+
|
|
142
148
|
unsigned cur_percentage = 0;
|
|
143
149
|
if (params.progress_callback == NULL) {
|
|
144
150
|
params.progress_callback_user_data = &cur_percentage;
|
|
@@ -253,6 +259,13 @@ struct llama_model * llama_model_load_from_splits(
|
|
|
253
259
|
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
|
254
260
|
}
|
|
255
261
|
|
|
262
|
+
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
|
263
|
+
llama_model_saver ms(*model);
|
|
264
|
+
ms.add_kv_from_model();
|
|
265
|
+
ms.add_tensors_from_model();
|
|
266
|
+
ms.save(path_model);
|
|
267
|
+
}
|
|
268
|
+
|
|
256
269
|
//
|
|
257
270
|
// chat templates
|
|
258
271
|
//
|
|
@@ -338,3 +351,4 @@ const char * llama_print_system_info(void) {
|
|
|
338
351
|
|
|
339
352
|
return s.c_str();
|
|
340
353
|
}
|
|
354
|
+
|
package/cpp/rn-completion.cpp
CHANGED
|
@@ -298,7 +298,8 @@ CompletionResult run_completion(
|
|
|
298
298
|
}
|
|
299
299
|
|
|
300
300
|
const int64_t t_end_generation = ggml_time_us();
|
|
301
|
-
|
|
301
|
+
// Note: keeping generation_time_ms for future timing measurements
|
|
302
|
+
// const double generation_time_ms = (t_end_generation - t_start_generation) / 1000.0;
|
|
302
303
|
|
|
303
304
|
// Set the result
|
|
304
305
|
result.content = state.generated_text;
|
|
@@ -349,8 +350,9 @@ CompletionResult run_chat_completion(
|
|
|
349
350
|
common_chat_templates_inputs template_inputs;
|
|
350
351
|
template_inputs.messages = chat_msgs;
|
|
351
352
|
template_inputs.add_generation_prompt = true;
|
|
352
|
-
template_inputs.use_jinja =
|
|
353
|
-
|
|
353
|
+
template_inputs.use_jinja = rn_ctx->params.use_jinja;
|
|
354
|
+
// Note: extract_reasoning field doesn't exist in current llama.cpp version
|
|
355
|
+
// template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
|
|
354
356
|
|
|
355
357
|
// Add grammar if present in options
|
|
356
358
|
if (!options.grammar.empty()) {
|
|
@@ -389,6 +391,31 @@ CompletionResult run_chat_completion(
|
|
|
389
391
|
result = run_completion(rn_ctx, cmpl_options, callback);
|
|
390
392
|
|
|
391
393
|
if (result.success) {
|
|
394
|
+
// Parse the generated content for tool calls and structured responses
|
|
395
|
+
common_chat_msg parsed_msg;
|
|
396
|
+
bool has_parsed_content = false;
|
|
397
|
+
|
|
398
|
+
// Only parse if we have tools available and the response isn't empty
|
|
399
|
+
if (!template_inputs.tools.empty() && !result.content.empty()) {
|
|
400
|
+
try {
|
|
401
|
+
// Construct the chat syntax for parsing using the format from template application
|
|
402
|
+
common_chat_syntax syntax;
|
|
403
|
+
syntax.format = chat_params.format; // Use format from template, not from params
|
|
404
|
+
syntax.reasoning_format = rn_ctx->params.reasoning_format;
|
|
405
|
+
syntax.reasoning_in_content = true;
|
|
406
|
+
syntax.thinking_forced_open = false;
|
|
407
|
+
syntax.parse_tool_calls = true;
|
|
408
|
+
|
|
409
|
+
// Parse the generated content for tool calls
|
|
410
|
+
parsed_msg = common_chat_parse(result.content, false, syntax);
|
|
411
|
+
has_parsed_content = true;
|
|
412
|
+
|
|
413
|
+
} catch (const std::exception& e) {
|
|
414
|
+
// If parsing fails, treat as regular content
|
|
415
|
+
has_parsed_content = false;
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
|
|
392
419
|
// Create OpenAI-compatible response
|
|
393
420
|
json response = {
|
|
394
421
|
{"id", gen_chatcmplid()},
|
|
@@ -401,11 +428,39 @@ CompletionResult run_chat_completion(
|
|
|
401
428
|
json choice = {
|
|
402
429
|
{"index", 0},
|
|
403
430
|
{"message", {
|
|
404
|
-
{"role", "assistant"}
|
|
405
|
-
{"content", result.content}
|
|
431
|
+
{"role", "assistant"}
|
|
406
432
|
}},
|
|
407
433
|
{"finish_reason", "stop"}
|
|
408
434
|
};
|
|
435
|
+
|
|
436
|
+
// Add parsed content and tool calls if available
|
|
437
|
+
if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
|
|
438
|
+
// Set content to the parsed content (may be null for tool-only responses)
|
|
439
|
+
if (!parsed_msg.content.empty()) {
|
|
440
|
+
choice["message"]["content"] = parsed_msg.content;
|
|
441
|
+
} else {
|
|
442
|
+
choice["message"]["content"] = nullptr;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Add tool calls to the message
|
|
446
|
+
json tool_calls = json::array();
|
|
447
|
+
for (const auto& tool_call : parsed_msg.tool_calls) {
|
|
448
|
+
json tc = {
|
|
449
|
+
{"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
|
|
450
|
+
{"type", "function"},
|
|
451
|
+
{"function", {
|
|
452
|
+
{"name", tool_call.name},
|
|
453
|
+
{"arguments", tool_call.arguments}
|
|
454
|
+
}}
|
|
455
|
+
};
|
|
456
|
+
tool_calls.push_back(tc);
|
|
457
|
+
}
|
|
458
|
+
choice["message"]["tool_calls"] = tool_calls;
|
|
459
|
+
choice["finish_reason"] = "tool_calls";
|
|
460
|
+
} else {
|
|
461
|
+
// Regular text response
|
|
462
|
+
choice["message"]["content"] = has_parsed_content ? parsed_msg.content : result.content;
|
|
463
|
+
}
|
|
409
464
|
|
|
410
465
|
choices.push_back(choice);
|
|
411
466
|
response["choices"] = choices;
|
package/ios/include/chat.h
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
#pragma once
|
|
4
4
|
|
|
5
5
|
#include "common.h"
|
|
6
|
+
#include <functional>
|
|
7
|
+
#include <chrono>
|
|
6
8
|
#include <string>
|
|
7
9
|
#include <vector>
|
|
8
10
|
|
|
@@ -12,11 +14,19 @@ struct common_chat_tool_call {
|
|
|
12
14
|
std::string name;
|
|
13
15
|
std::string arguments;
|
|
14
16
|
std::string id;
|
|
17
|
+
|
|
18
|
+
bool operator==(const common_chat_tool_call & other) const {
|
|
19
|
+
return name == other.name && arguments == other.arguments && id == other.id;
|
|
20
|
+
}
|
|
15
21
|
};
|
|
16
22
|
|
|
17
23
|
struct common_chat_msg_content_part {
|
|
18
24
|
std::string type;
|
|
19
25
|
std::string text;
|
|
26
|
+
|
|
27
|
+
bool operator==(const common_chat_msg_content_part & other) const {
|
|
28
|
+
return type == other.type && text == other.text;
|
|
29
|
+
}
|
|
20
30
|
};
|
|
21
31
|
|
|
22
32
|
struct common_chat_msg {
|
|
@@ -27,6 +37,51 @@ struct common_chat_msg {
|
|
|
27
37
|
std::string reasoning_content;
|
|
28
38
|
std::string tool_name;
|
|
29
39
|
std::string tool_call_id;
|
|
40
|
+
|
|
41
|
+
template <class T> T to_json_oaicompat() const;
|
|
42
|
+
|
|
43
|
+
bool empty() const {
|
|
44
|
+
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
|
|
45
|
+
}
|
|
46
|
+
void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
|
|
47
|
+
for (auto i = 0u; i < tool_calls.size(); i++) {
|
|
48
|
+
if (ids_cache.size() <= i) {
|
|
49
|
+
auto id = tool_calls[i].id;
|
|
50
|
+
if (id.empty()) {
|
|
51
|
+
id = gen_tool_call_id();
|
|
52
|
+
}
|
|
53
|
+
ids_cache.push_back(id);
|
|
54
|
+
}
|
|
55
|
+
tool_calls[i].id = ids_cache[i];
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
bool operator==(const common_chat_msg & other) const {
|
|
59
|
+
return role == other.role
|
|
60
|
+
&& content == other.content
|
|
61
|
+
&& content_parts == other.content_parts
|
|
62
|
+
&& tool_calls == other.tool_calls
|
|
63
|
+
&& reasoning_content == other.reasoning_content
|
|
64
|
+
&& tool_name == other.tool_name
|
|
65
|
+
&& tool_call_id == other.tool_call_id;
|
|
66
|
+
}
|
|
67
|
+
bool operator!=(const common_chat_msg & other) const {
|
|
68
|
+
return !(*this == other);
|
|
69
|
+
}
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
struct common_chat_msg_diff {
|
|
73
|
+
// std::string reasoning_content_delta;
|
|
74
|
+
std::string content_delta;
|
|
75
|
+
size_t tool_call_index = std::string::npos;
|
|
76
|
+
common_chat_tool_call tool_call_delta;
|
|
77
|
+
|
|
78
|
+
static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
|
|
79
|
+
|
|
80
|
+
bool operator==(const common_chat_msg_diff & other) const {
|
|
81
|
+
return content_delta == other.content_delta
|
|
82
|
+
&& tool_call_index == other.tool_call_index
|
|
83
|
+
&& tool_call_delta == other.tool_call_delta;
|
|
84
|
+
}
|
|
30
85
|
};
|
|
31
86
|
|
|
32
87
|
struct common_chat_tool {
|
|
@@ -48,14 +103,11 @@ enum common_chat_format {
|
|
|
48
103
|
COMMON_CHAT_FORMAT_LLAMA_3_X,
|
|
49
104
|
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
|
|
50
105
|
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
|
|
51
|
-
COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
|
|
52
106
|
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
|
|
53
107
|
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
|
|
54
108
|
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
|
55
109
|
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
|
56
|
-
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
|
|
57
110
|
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
|
58
|
-
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
|
|
59
111
|
|
|
60
112
|
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
|
61
113
|
};
|
|
@@ -70,7 +122,9 @@ struct common_chat_templates_inputs {
|
|
|
70
122
|
std::vector<common_chat_tool> tools;
|
|
71
123
|
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
|
72
124
|
bool parallel_tool_calls = false;
|
|
73
|
-
|
|
125
|
+
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
126
|
+
bool enable_thinking = true;
|
|
127
|
+
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
74
128
|
};
|
|
75
129
|
|
|
76
130
|
struct common_chat_params {
|
|
@@ -78,11 +132,21 @@ struct common_chat_params {
|
|
|
78
132
|
std::string prompt;
|
|
79
133
|
std::string grammar;
|
|
80
134
|
bool grammar_lazy = false;
|
|
135
|
+
bool thinking_forced_open = false;
|
|
81
136
|
std::vector<common_grammar_trigger> grammar_triggers;
|
|
82
137
|
std::vector<std::string> preserved_tokens;
|
|
83
138
|
std::vector<std::string> additional_stops;
|
|
84
139
|
};
|
|
85
140
|
|
|
141
|
+
struct common_chat_syntax {
|
|
142
|
+
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
143
|
+
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
144
|
+
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
|
|
145
|
+
bool reasoning_in_content = false;
|
|
146
|
+
bool thinking_forced_open = false;
|
|
147
|
+
bool parse_tool_calls = true;
|
|
148
|
+
};
|
|
149
|
+
|
|
86
150
|
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
|
87
151
|
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
|
|
88
152
|
|
|
@@ -119,8 +183,9 @@ std::string common_chat_format_example(
|
|
|
119
183
|
const struct common_chat_templates * tmpls,
|
|
120
184
|
bool use_jinja);
|
|
121
185
|
|
|
122
|
-
|
|
123
|
-
|
|
186
|
+
const char* common_chat_format_name(common_chat_format format);
|
|
187
|
+
const char* common_reasoning_format_name(common_reasoning_format format);
|
|
188
|
+
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
|
124
189
|
|
|
125
190
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
|
126
191
|
|
|
@@ -133,3 +198,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
|
|
|
133
198
|
// T can be std::string containing JSON or nlohmann::ordered_json
|
|
134
199
|
template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
|
|
135
200
|
template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
|
|
201
|
+
|
|
202
|
+
template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
|
|
@@ -13,10 +13,12 @@
|
|
|
13
13
|
#include <chrono>
|
|
14
14
|
#include <cstddef>
|
|
15
15
|
#include <cstdio>
|
|
16
|
+
#include <ctime>
|
|
16
17
|
#include <exception>
|
|
17
18
|
#include <iomanip>
|
|
18
19
|
#include <memory>
|
|
19
20
|
#include <sstream>
|
|
21
|
+
#include <stdexcept>
|
|
20
22
|
#include <string>
|
|
21
23
|
#include <vector>
|
|
22
24
|
|
|
@@ -393,8 +395,8 @@ class chat_template {
|
|
|
393
395
|
|
|
394
396
|
for (const auto & message_ : adjusted_messages) {
|
|
395
397
|
auto message = message_;
|
|
396
|
-
if (!message.contains("role") || !message.contains("content")) {
|
|
397
|
-
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
|
|
398
|
+
if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
|
|
399
|
+
throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
|
|
398
400
|
}
|
|
399
401
|
std::string role = message.at("role");
|
|
400
402
|
|
|
@@ -415,7 +417,6 @@ class chat_template {
|
|
|
415
417
|
}
|
|
416
418
|
}
|
|
417
419
|
if (polyfill_tool_calls) {
|
|
418
|
-
auto content = message.at("content");
|
|
419
420
|
auto tool_calls = json::array();
|
|
420
421
|
for (const auto & tool_call : message.at("tool_calls")) {
|
|
421
422
|
if (tool_call.at("type") != "function") {
|
|
@@ -434,8 +435,11 @@ class chat_template {
|
|
|
434
435
|
auto obj = json {
|
|
435
436
|
{"tool_calls", tool_calls},
|
|
436
437
|
};
|
|
437
|
-
if (
|
|
438
|
-
|
|
438
|
+
if (message.contains("content")) {
|
|
439
|
+
auto content = message.at("content");
|
|
440
|
+
if (!content.is_null() && !content.empty()) {
|
|
441
|
+
obj["content"] = content;
|
|
442
|
+
}
|
|
439
443
|
}
|
|
440
444
|
message["content"] = obj.dump(2);
|
|
441
445
|
message.erase("tool_calls");
|