@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0

package/cpp/llama.cpp/src/llama-model.h
CHANGED

@@ -73,6 +73,7 @@ enum llm_type {
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
+    LLM_TYPE_142B,
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
@@ -94,6 +95,8 @@ enum llm_type {
     LLM_TYPE_17B_128E, // llama4 Maverick
     LLM_TYPE_30B_A3B,
     LLM_TYPE_235B_A22B,
+    LLM_TYPE_E2B,
+    LLM_TYPE_E4B,
 };
 
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
@@ -315,6 +318,19 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_scale = nullptr;
     struct ggml_tensor * ffn_down_scale = nullptr;
 
+    // altup & laurel
+    struct ggml_tensor * per_layer_inp_gate = nullptr;
+    struct ggml_tensor * per_layer_proj = nullptr;
+    struct ggml_tensor * per_layer_post_norm = nullptr;
+    struct ggml_tensor * altup_correct_coef = nullptr;
+    struct ggml_tensor * altup_correct_scale = nullptr;
+    struct ggml_tensor * altup_predict_coef = nullptr;
+    struct ggml_tensor * altup_router = nullptr;
+    struct ggml_tensor * altup_router_norm = nullptr;
+    struct ggml_tensor * laurel_l = nullptr;
+    struct ggml_tensor * laurel_r = nullptr;
+    struct ggml_tensor * laurel_post_norm = nullptr;
+
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;
@@ -329,6 +345,9 @@ struct llama_model {
     llama_hparams hparams = {};
     llama_vocab vocab;
 
+    // for classifier models
+    std::vector<std::string> classifier_labels;
+
     struct ggml_tensor * tok_embd = nullptr;
     struct ggml_tensor * type_embd = nullptr;
     struct ggml_tensor * pos_embd = nullptr;
@@ -350,6 +369,13 @@ struct llama_model {
     struct ggml_tensor * conv1d = nullptr;
     struct ggml_tensor * conv1d_b = nullptr;
 
+    // gemma3n altup
+    struct ggml_tensor * tok_embd_per_layer = nullptr;
+    struct ggml_tensor * altup_proj = nullptr;
+    struct ggml_tensor * altup_unembd_proj = nullptr;
+    struct ggml_tensor * per_layer_model_proj = nullptr;
+    struct ggml_tensor * per_layer_proj_norm = nullptr;
+
     std::vector<llama_layer> layers;
 
     llama_model_params params;

package/cpp/llama.cpp/src/llama-quant.cpp
CHANGED

@@ -1,5 +1,4 @@
 #include "llama-quant.h"
-
 #include "llama-impl.h"
 #include "llama-model.h"
 #include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
+static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
+    if (prune.empty()) {
+        return orig_name;
+    }
+
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+        const int blk = std::stoi(match[1]);
+        std::string new_name = orig_name;
+
+        if (mapped.count(blk)) {
+            // Already mapped, do nothing
+        } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
+            mapped[blk] = "";
+        } else if (blk < prune.front()) {
+            mapped[blk] = std::to_string(blk);
+            next_id = blk + 1;
+        } else {
+            mapped[blk] = std::to_string(next_id);
+            ++next_id;
+        }
+
+        return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
+    }
+
+    return orig_name;
+}
+
+static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
+    if (mapped.empty()) {
+        return orig_name;
+    }
+
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+        const std::string blk(match[1]);
+        std::string new_name = orig_name;
+
+        for (const auto & p : mapped) {
+            if (p.second == blk) {
+                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
+                return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
+            }
+        }
+        GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
+    }
+
+    return orig_name;
+}
+
 struct quantize_state_impl {
     const llama_model & model;
     const llama_model_quantize_params * params;
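
Note: remap_layer implements the new layer-pruning support in quantization. Blocks listed in prune map to an empty name (their tensors are dropped), blocks below the first pruned index keep their ids, and every later surviving block is renumbered contiguously via next_id. A minimal standalone sketch of the renumbering behavior, with illustrative tensor names (simplified from the function above):

    #include <algorithm>
    #include <iostream>
    #include <map>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        const std::vector<int> prune = {1, 3};  // block ids to drop
        std::map<int, std::string> mapped;      // old block id -> new block id
        int next_id = 0;

        const std::regex pattern(R"(blk\.(\d+)\.)");
        for (std::string name : {"blk.0.attn_v.weight", "blk.1.attn_v.weight",
                                 "blk.2.attn_v.weight", "blk.3.attn_v.weight",
                                 "blk.4.attn_v.weight"}) {
            std::smatch match;
            std::regex_search(name, match, pattern);
            const int blk = std::stoi(match[1]);

            if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
                std::cout << name << " -> (pruned)\n";
                continue;
            }
            if (!mapped.count(blk)) {
                mapped[blk] = std::to_string(next_id++);
            }
            name.replace(match.position(1), match.length(1), mapped[blk]);
            std::cout << name << "\n";  // survivors print as blk.0, blk.1, blk.2
        }
        return 0;
    }
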
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                new_type = GGML_TYPE_Q6_K;
            }
        }
-    } else if (name == "token_embd.weight") {
+    } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     gguf_context_ptr ctx_out { gguf_init_empty() };
 
+    std::vector<int> prune_list = {};
+    if (params->prune_layers) {
+        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
+    }
+
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out.get(), ml.meta.get());
     gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -585,7 +639,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
             gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-            gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
+            // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
+            gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
             gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
@@ -596,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
     }
 
+    std::map<int, std::string> mapped;
+    int blk_id = 0;
+    int pruned_attention_w = 0;
+
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
     tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
+        const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
+        if (remapped_name.empty()) {
+            if (it.first.find("attn_v.weight") != std::string::npos ||
+                it.first.find("attn_qkv.weight") != std::string::npos ||
+                it.first.find("attn_kv_b.weight") != std::string::npos) {
+                pruned_attention_w++;
+            }
+            LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
+            continue;
+        } else if (remapped_name != it.first) {
+            ggml_set_name(it.second.tensor, remapped_name.c_str());
+            LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
+        }
         tensors.push_back(&it.second);
     }
+    if (!prune_list.empty()) {
+        gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
+    }
 
     // keep_split requires that the weights are sorted by split index
     if (params->keep_split) {
@@ -639,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
@@ -680,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         for (size_t i = 0; i < ctx_outs.size(); ++i) {
             gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
             gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
-            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(),
+            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
         }
     }
@@ -755,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
+        // these are very small (e.g. 4x4)
+        quantize &= name.find("altup") == std::string::npos;
+        quantize &= name.find("laurel") == std::string::npos;
+
+        // these are not too big so keep them as it is
+        quantize &= name.find("per_layer_model_proj") == std::string::npos;
+
         // do not quantize positional embeddings and token types (BERT)
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
@@ -831,7 +913,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         const float * imatrix = nullptr;
         if (imatrix_data) {
-            auto it = imatrix_data->find(tensor->name);
+            auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
             if (it == imatrix_data->end()) {
                 LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
             } else {
@@ -946,6 +1028,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.imatrix =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
         /*.tensor_type =*/ nullptr,
+        /*.prune_layers =*/ nullptr
     };
 
     return result;
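
Note: prune_layers rides along as an opaque pointer on llama_model_quantize_params and is dereferenced as a std::vector<int>, as the cast in llama_model_quantize_impl above shows. A hedged sketch of a caller requesting layer pruning during quantization (file names and the chosen ftype are illustrative):

    #include <vector>
    #include "llama.h"

    // Drop layers 5 and 6 while quantizing; the output GGUF gets its
    // block count rewritten to the number of surviving layers.
    static int quantize_pruned(const char * fname_in, const char * fname_out) {
        std::vector<int> prune = {5, 6};
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        qparams.prune_layers = &prune;
        return (int) llama_model_quantize(fname_in, fname_out, &qparams);
    }
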

package/cpp/llama.cpp/src/llama-vocab.cpp
CHANGED

@@ -9,16 +9,16 @@
 
 #include <algorithm>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
-#include <climits>
 #include <cstdarg>
 #include <cstring>
 #include <forward_list>
+#include <limits>
 #include <map>
 #include <queue>
 #include <set>
 #include <unordered_map>
-#include <cctype>
 
 //
 // helpers
@@ -1269,6 +1269,7 @@ struct llama_vocab::impl {
     bool add_space_prefix = false;
     bool add_bos = false;
    bool add_eos = false;
+    bool add_sep = false;
     bool ignore_merges = false;
     bool clean_spaces = false; // clean_up_tokenization_spaces
     bool remove_extra_whitespaces = false;
@@ -1421,6 +1422,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_sep_id = 102;
             special_pad_id = 0;
             special_mask_id = 103;
+
+            add_sep = true;
         } else if (tokenizer_model == "gpt2") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
@@ -1550,12 +1553,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "gigachat" ||
-                tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de"
+                tokenizer_pre == "jina-v2-de") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-code" ||
                 tokenizer_pre == "roberta-bpe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+                add_sep = true;
             } else if (
                 tokenizer_pre == "refact") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1665,6 +1671,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             clean_spaces = true;
             add_bos = true;
             add_eos = false;
+            add_sep = true;
         } else if (type == LLAMA_VOCAB_TYPE_UGM) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             add_bos = false;
@@ -1801,7 +1808,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }
     }
 
-    // Handle add_bos and add_eos
+    // Handle add_bos, add_eos and add_sep
     {
         bool temp = true;
 
@@ -1811,6 +1818,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
             add_eos = temp;
         }
+        if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
+            add_sep = temp;
+        }
     }
 
     // auto-detect special tokens by text
@@ -1987,6 +1997,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eom_id|>"
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
+                    || t.first == "<|end_of_text|>"
                 ) {
                     special_eog_ids.insert(t.second);
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2059,9 +2070,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     //NOTE: Per token attributes are missing from the GGUF file.
     //TODO: Extract attributes from GGUF file.
     {
-        auto _contains_any = [] (const std::string & str, const std::vector<std::
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
             for (const auto & substr : substrs) {
-                if (str.find(substr)
+                if (str.find(substr) != std::string::npos) {
                     return true;
                 }
             }
@@ -2098,7 +2109,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
             || _contains_any(general_arch, {"nomic-bert-moe"})
             ) {
-            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+            if (token_to_id.count("<mask>") == 0) {
+                LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
+            } else {
+                _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+            }
         } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
             for (auto id : cache_special_tokens) {
                 _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
@@ -2568,6 +2583,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
     // copy piece chars to output text buffer
     // skip up to 'lstrip' leading spaces before copying
     auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+            GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
+        }
+
        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
            token++;
            size--;
@@ -2764,26 +2783,26 @@ void llama_vocab::impl::print_info() const {
     LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
 
     // special tokens
-    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token
-    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token
-    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token
-    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token
-    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token
-    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token
-    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token
-    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token
-
-    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token
-
-    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token
-    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token
-    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token
-    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token
-    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token
-    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token
+    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
+    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
+    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
+    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
+    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
+    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
+    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
+    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
+
+    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
+
+    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
+    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
+    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
+    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
+    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
+    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
 
     for (const auto & id : special_eog_ids) {
-        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token
+        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
     }
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
@@ -2991,6 +3010,10 @@ bool llama_vocab::get_add_eos() const {
     return pimpl->add_eos;
 }
 
+bool llama_vocab::get_add_sep() const {
+    return pimpl->add_sep;
+}
+
 bool llama_vocab::get_ignore_merges() const {
     return pimpl->ignore_merges;
 }
@@ -3051,6 +3074,11 @@ int32_t llama_vocab::tokenize(
                   bool add_special,
                   bool parse_special) const {
     auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
+        return std::numeric_limits<int32_t>::min();
+    }
+
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
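
Note: this tokenize wrapper already signals a too-small output buffer by returning the negated token count; the new guard covers the pathological case where the count itself does not fit in an int32_t, returning INT32_MIN instead of a misleading negative value. A sketch of the usual two-pass calling pattern against the public llama.h API, updated for the new sentinel:

    #include <limits>
    #include <string>
    #include <vector>
    #include "llama.h"

    static bool tokenize_text(const llama_vocab * vocab, const std::string & text,
                              std::vector<llama_token> & out) {
        // first pass: a null buffer yields the negated required token count
        int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                   nullptr, 0, /*add_special*/ true, /*parse_special*/ true);
        if (n == std::numeric_limits<int32_t>::min()) {
            return false;  // new sentinel: count not representable in int32_t
        }
        out.resize(-n);
        // second pass fills the buffer and returns the actual count
        n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                           out.data(), (int32_t) out.size(), true, true);
        if (n < 0) {
            return false;
        }
        out.resize(n);
        return true;
    }
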
@@ -3182,6 +3210,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
     return vocab->get_add_eos();
 }
 
+bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
+    return vocab->get_add_sep();
+}
+
 llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
     return vocab->token_fim_pre();
 }
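
Note: add_sep completes the add_bos/add_eos family for encoder-style vocabularies (WPM/BERT, plus the jina-v1-en, jina-v2-code and roberta-bpe pre-tokenizers above), is loaded from the new LLM_KV_TOKENIZER_ADD_SEP metadata key, and is now queryable from the C API. A small sketch using the public accessors:

    #include <cstdio>
    #include "llama.h"

    // Inspect the tokenizer's special-token insertion flags for a loaded model.
    static void print_special_add_flags(const llama_model * model) {
        const llama_vocab * vocab = llama_model_get_vocab(model);
        printf("add_bos: %d\n", llama_vocab_get_add_bos(vocab));
        printf("add_eos: %d\n", llama_vocab_get_add_eos(vocab));
        printf("add_sep: %d\n", llama_vocab_get_add_sep(vocab));  // new in this version
    }
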

package/cpp/llama.cpp/src/llama-vocab.h
CHANGED

@@ -74,6 +74,7 @@ struct llama_vocab {
     bool get_add_space_prefix () const;
     bool get_add_bos () const;
     bool get_add_eos () const;
+    bool get_add_sep () const;
     bool get_ignore_merges () const;
     bool get_clean_spaces () const;
     bool get_remove_extra_whitespaces () const;

package/cpp/llama.cpp/src/llama.cpp
CHANGED

@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(
 
     // if using single GPU mode, remove all except the main GPU
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
-            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
-            llama_model_free(model);
-            return nullptr;
+        if (params.main_gpu < 0) {
+            model->devices.clear();
+        } else {
+            if (params.main_gpu >= (int)model->devices.size()) {
+                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
+                llama_model_free(model);
+                return nullptr;
+            }
+            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+            model->devices.clear();
+            model->devices.push_back(main_gpu);
         }
-        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
-        model->devices.clear();
-        model->devices.push_back(main_gpu);
     }
 
     for (auto * dev : model->devices) {
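
Note: under LLAMA_SPLIT_MODE_NONE a negative main_gpu is now a supported way to run with no GPU devices at all, while an out-of-range index fails the load instead of indexing past the device list. A sketch of the new semantics through the public API:

    #include "llama.h"

    // split_mode NONE + main_gpu after this change:
    //   main_gpu <  0               -> device list cleared (CPU-only run)
    //   0 <= main_gpu < num devices -> only that device is kept
    //   main_gpu >= num devices     -> load fails and returns NULL
    static llama_model * load_cpu_only(const char * path /* e.g. a .gguf file */) {
        llama_model_params params = llama_model_default_params();
        params.split_mode = LLAMA_SPLIT_MODE_NONE;
        params.main_gpu   = -1;  // previously an error, now means "no GPU"
        return llama_model_load_from_file(path, params);
    }
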

package/cpp/llama.cpp/src/unicode.cpp
CHANGED

@@ -204,12 +204,17 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #   pragma clang diagnostic push
 #   pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
 
 #if defined(__clang__)
 #   pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#   pragma GCC diagnostic pop
 #endif
 
     return conv.from_bytes(s);
package/cpp/rn-completion.cpp
CHANGED
@@ -1,4 +1,4 @@
-#include "rn-llama.hpp"
+#include "rn-llama.h"
 // Suppress unused function warnings from llama.cpp headers
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-function"
@@ -7,7 +7,7 @@
 #include "llama.h"
 #include "sampling.h"
 #pragma GCC diagnostic pop
-#include "rn-utils.hpp"
+#include "rn-utils.h"
 
 #include <string>
 #include <vector>

package/cpp/{rn-utils.hpp → rn-utils.h}
CHANGED

@@ -54,6 +54,7 @@ struct CompletionOptions {
     float top_p = 0.9f;
     float top_k = 40.0f;
     float min_p = 0.05f;
+    float presence_penalty = 0.0f; // for reducing repetitions (0-2 range)
     int n_keep = 0;
     int n_probs = 0; // for log probabilities
     bool post_sampling_probs = false;
@@ -77,6 +78,7 @@
             {"top_p", top_p},
             {"top_k", top_k},
             {"min_p", min_p},
+            {"presence_penalty", presence_penalty},
             {"n_predict", n_predict},
             {"n_keep", n_keep},
             {"n_probs", n_probs},
@@ -147,6 +149,7 @@
         data["top_p"] = top_p;
         data["max_tokens"] = n_predict;
         data["stream"] = stream;
+        data["presence_penalty"] = presence_penalty;
 
         if (seed >= 0) {
             data["seed"] = seed;
package/ios/include/chat.h
CHANGED
@@ -70,7 +70,7 @@ struct common_chat_msg {
 };
 
 struct common_chat_msg_diff {
-
+    std::string reasoning_content_delta;
     std::string content_delta;
     size_t tool_call_index = std::string::npos;
     common_chat_tool_call tool_call_delta;
package/ios/include/common.h
CHANGED
@@ -199,6 +199,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
 
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
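
Note: the draft model used for speculative decoding now gets its own K/V cache types instead of inheriting the target model's, likely exposed through the common/arg.cpp changes listed above. A minimal sketch (common_params embeds this struct as its speculative member):

    #include "common.h"

    // Quantize the draft model's KV cache independently of the target model.
    static common_params make_speculative_params() {
        common_params params;
        params.speculative.cache_type_k = GGML_TYPE_Q8_0;  // default GGML_TYPE_F16
        params.speculative.cache_type_v = GGML_TYPE_Q8_0;
        return params;
    }
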
@@ -215,7 +218,8 @@
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
 struct common_params {
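
Note: DEEPSEEK_LEGACY preserves the old behavior (thinking tags extracted only for complete messages, left inline in <think> tags while streaming), while DEEPSEEK now also extracts reasoning in streaming deltas, matching the reasoning_content_delta field added to common_chat_msg_diff above. A sketch, assuming common_params keeps its reasoning_format member as upstream does:

    #include "common.h"

    // Opt back into the legacy behavior if a client depends on inline
    // <think> tags during streaming.
    static void use_legacy_reasoning(common_params & params) {
        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
    }
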
@@ -354,7 +358,7 @@
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-
+    std::string cls_sep = "\t"; // separator of classification sequences
 
     // server params
     int32_t port = 8080; // server listens on this network port