@novastera-oss/llamarn 0.2.9 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/proguard-rules.pro +12 -0
- package/android/src/main/cpp/include/llama.h +15 -47
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/CMakePresets.json +11 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -0
- package/cpp/llama.cpp/README.md +8 -8
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +62 -1
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +22 -6
- package/cpp/llama.cpp/common/common.h +22 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1250 -43
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +21 -13
- package/cpp/llama.cpp/ggml/CMakeLists.txt +13 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +44 -38
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +126 -8
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +138 -18
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +11 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1206 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +36 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +86 -17
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +47 -60
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +29 -42
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +46 -59
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +38 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +23 -36
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +255 -99
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -695
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +104 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +27 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +48 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +572 -106
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +599 -105
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +800 -42
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +191 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +991 -307
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +59 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +18 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +84 -9
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +386 -67
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +307 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -47
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +15 -47
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
- package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +316 -3
- package/cpp/llama.cpp/src/llama-arch.h +23 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +103 -71
- package/cpp/llama.cpp/src/llama-batch.h +31 -18
- package/cpp/llama.cpp/src/llama-chat.cpp +58 -1
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +180 -106
- package/cpp/llama.cpp/src/llama-context.h +26 -16
- package/cpp/llama.cpp/src/llama-cparams.h +3 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +310 -211
- package/cpp/llama.cpp/src/llama-graph.h +184 -122
- package/cpp/llama.cpp/src/llama-hparams.cpp +47 -1
- package/cpp/llama.cpp/src/llama-hparams.h +13 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +143 -47
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +36 -11
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +3545 -719
- package/cpp/llama.cpp/src/llama-model.h +21 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +376 -10
- package/cpp/llama.cpp/src/llama-vocab.h +43 -0
- package/cpp/llama.cpp/src/unicode.cpp +207 -0
- package/cpp/llama.cpp/src/unicode.h +2 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +22 -4
- package/ios/include/llama.h +15 -47
- package/ios/libs/llama.xcframework/Info.plist +13 -13
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -4926
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +4 -4
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
package/android/build.gradle
CHANGED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# ProGuard rules for @novastera-oss/llamarn library
|
|
2
|
+
# These rules will be automatically included when apps use this library
|
|
3
|
+
|
|
4
|
+
# Keep all classes in our package (includes NativeRNLlamaCppSpec, RNLlamaCppPackage, etc.)
|
|
5
|
+
-keep class com.novastera.llamarn.** {
|
|
6
|
+
*;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
# Keep native methods (JNI)
|
|
10
|
+
-keepclassmembers class com.novastera.llamarn.** {
|
|
11
|
+
native <methods>;
|
|
12
|
+
}
|
|
@@ -71,52 +71,13 @@ extern "C" {
|
|
|
71
71
|
typedef int32_t llama_seq_id;
|
|
72
72
|
|
|
73
73
|
enum llama_vocab_type {
|
|
74
|
-
LLAMA_VOCAB_TYPE_NONE
|
|
75
|
-
LLAMA_VOCAB_TYPE_SPM
|
|
76
|
-
LLAMA_VOCAB_TYPE_BPE
|
|
77
|
-
LLAMA_VOCAB_TYPE_WPM
|
|
78
|
-
LLAMA_VOCAB_TYPE_UGM
|
|
79
|
-
LLAMA_VOCAB_TYPE_RWKV
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
// pre-tokenization types
|
|
83
|
-
enum llama_vocab_pre_type {
|
|
84
|
-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
|
85
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
|
86
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
|
87
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
|
88
|
-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
|
89
|
-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
|
90
|
-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
|
91
|
-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
|
92
|
-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
|
93
|
-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
|
94
|
-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
|
95
|
-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
|
96
|
-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
|
97
|
-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
|
98
|
-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
|
99
|
-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
|
100
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
|
101
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
|
102
|
-
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
|
103
|
-
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
|
104
|
-
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
|
105
|
-
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
|
106
|
-
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
|
107
|
-
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
|
108
|
-
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
|
109
|
-
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
|
110
|
-
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
|
111
|
-
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
|
112
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
|
113
|
-
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
|
114
|
-
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
|
115
|
-
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
|
116
|
-
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
|
117
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
|
118
|
-
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
|
119
|
-
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
|
74
|
+
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
|
75
|
+
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
|
76
|
+
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
|
77
|
+
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
|
78
|
+
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
|
79
|
+
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
|
80
|
+
LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
|
|
120
81
|
};
|
|
121
82
|
|
|
122
83
|
enum llama_rope_type {
|
|
@@ -374,6 +335,9 @@ extern "C" {
|
|
|
374
335
|
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
375
336
|
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
|
376
337
|
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
|
338
|
+
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
|
|
339
|
+
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
|
|
340
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
|
|
377
341
|
};
|
|
378
342
|
|
|
379
343
|
// model quantization parameters
|
|
@@ -764,7 +728,7 @@ extern "C" {
|
|
|
764
728
|
// - lazily on next llama_decode()
|
|
765
729
|
// p0 < 0 : [0, p1]
|
|
766
730
|
// p1 < 0 : [p0, inf)
|
|
767
|
-
DEPRECATED(void llama_kv_self_seq_div(
|
|
731
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
|
|
768
732
|
struct llama_context * ctx,
|
|
769
733
|
llama_seq_id seq_id,
|
|
770
734
|
llama_pos p0,
|
|
@@ -992,6 +956,7 @@ extern "C" {
|
|
|
992
956
|
// in the order they have appeared in the batch.
|
|
993
957
|
// Rows: number of tokens for which llama_batch.logits[i] != 0
|
|
994
958
|
// Cols: n_vocab
|
|
959
|
+
// TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
|
995
960
|
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
|
996
961
|
|
|
997
962
|
// Logits for the ith token. For positive indices, Equivalent to:
|
|
@@ -1006,6 +971,7 @@ extern "C" {
|
|
|
1006
971
|
// in the order they have appeared in the batch.
|
|
1007
972
|
// shape: [n_outputs*n_embd]
|
|
1008
973
|
// Otherwise, returns NULL.
|
|
974
|
+
// TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
|
1009
975
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
|
1010
976
|
|
|
1011
977
|
// Get the embeddings for the ith token. For positive indices, Equivalent to:
|
|
@@ -1044,6 +1010,7 @@ extern "C" {
|
|
|
1044
1010
|
LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
|
|
1045
1011
|
LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
|
|
1046
1012
|
LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
|
|
1013
|
+
LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
|
|
1047
1014
|
|
|
1048
1015
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
|
1049
1016
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
|
@@ -1429,6 +1396,7 @@ extern "C" {
|
|
|
1429
1396
|
|
|
1430
1397
|
int32_t n_p_eval;
|
|
1431
1398
|
int32_t n_eval;
|
|
1399
|
+
int32_t n_reused; // number of times a ggml compute graph had been reused
|
|
1432
1400
|
};
|
|
1433
1401
|
|
|
1434
1402
|
struct llama_perf_sampler_data {
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/build-info.cpp
CHANGED
|
@@ -120,7 +120,6 @@ endfunction()
|
|
|
120
120
|
|
|
121
121
|
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
|
122
122
|
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
|
123
|
-
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
|
124
123
|
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
|
125
124
|
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
|
126
125
|
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
|
@@ -55,6 +55,17 @@
|
|
|
55
55
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
|
56
56
|
}
|
|
57
57
|
},
|
|
58
|
+
{
|
|
59
|
+
"name": "x64-linux-gcc", "hidden": true,
|
|
60
|
+
"cacheVariables": {
|
|
61
|
+
"CMAKE_C_COMPILER": "gcc",
|
|
62
|
+
"CMAKE_CXX_COMPILER": "g++"
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
{ "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
|
|
66
|
+
{ "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
|
|
67
|
+
{ "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
|
|
68
|
+
{ "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
|
|
58
69
|
|
|
59
70
|
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
|
60
71
|
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
package/cpp/llama.cpp/CODEOWNERS
CHANGED
package/cpp/llama.cpp/README.md
CHANGED
|
@@ -6,9 +6,9 @@
|
|
|
6
6
|
[](https://github.com/ggml-org/llama.cpp/releases)
|
|
7
7
|
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
|
8
8
|
|
|
9
|
-
[
|
|
9
|
+
[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
LLM inference in C/C++
|
|
12
12
|
|
|
13
13
|
## Recent API changes
|
|
14
14
|
|
|
@@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|
|
17
17
|
|
|
18
18
|
## Hot topics
|
|
19
19
|
|
|
20
|
-
-
|
|
21
|
-
-
|
|
20
|
+
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
|
21
|
+
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
|
22
22
|
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
|
23
|
-
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
|
24
23
|
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
|
25
24
|
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
|
26
25
|
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
|
@@ -134,6 +133,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
134
133
|
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
|
|
135
134
|
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
|
136
135
|
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
|
136
|
+
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
|
137
137
|
|
|
138
138
|
#### Multimodal
|
|
139
139
|
|
|
@@ -269,6 +269,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
269
269
|
| [Vulkan](docs/build.md#vulkan) | GPU |
|
|
270
270
|
| [CANN](docs/build.md#cann) | Ascend NPU |
|
|
271
271
|
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
|
272
|
+
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
|
272
273
|
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
|
273
274
|
|
|
274
275
|
## Obtaining and quantizing models
|
|
@@ -434,7 +435,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
|
|
434
435
|
|
|
435
436
|
## [`llama-perplexity`](tools/perplexity)
|
|
436
437
|
|
|
437
|
-
#### A tool for measuring the perplexity [^1]
|
|
438
|
+
#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
|
|
438
439
|
|
|
439
440
|
- <details open>
|
|
440
441
|
<summary>Measure the perplexity over a text file</summary>
|
|
@@ -457,8 +458,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
|
|
457
458
|
|
|
458
459
|
</details>
|
|
459
460
|
|
|
460
|
-
[^1]: [
|
|
461
|
-
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
|
461
|
+
[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
|
462
462
|
|
|
463
463
|
## [`llama-bench`](tools/llama-bench)
|
|
464
464
|
|
|
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
|
|
|
86
86
|
endif()
|
|
87
87
|
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
|
|
88
88
|
include_directories(${CURL_INCLUDE_DIRS})
|
|
89
|
-
|
|
90
|
-
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
|
|
89
|
+
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
|
|
91
90
|
endif ()
|
|
92
91
|
|
|
93
92
|
if (LLAMA_LLGUIDANCE)
|
|
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
|
|
|
112
111
|
|
|
113
112
|
ExternalProject_Add(llguidance_ext
|
|
114
113
|
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
|
115
|
-
#
|
|
116
|
-
GIT_TAG
|
|
114
|
+
# v1.0.1:
|
|
115
|
+
GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
|
|
117
116
|
PREFIX ${CMAKE_BINARY_DIR}/llguidance
|
|
118
117
|
SOURCE_DIR ${LLGUIDANCE_SRC}
|
|
119
118
|
BUILD_IN_SOURCE TRUE
|
|
120
119
|
CONFIGURE_COMMAND ""
|
|
121
|
-
BUILD_COMMAND cargo build --release
|
|
120
|
+
BUILD_COMMAND cargo build --release --package llguidance
|
|
122
121
|
INSTALL_COMMAND ""
|
|
123
122
|
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
|
|
124
123
|
UPDATE_COMMAND ""
|
|
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1464
1464
|
params.swa_full = true;
|
|
1465
1465
|
}
|
|
1466
1466
|
).set_env("LLAMA_ARG_SWA_FULL"));
|
|
1467
|
+
add_opt(common_arg(
|
|
1468
|
+
{"--kv-unified", "-kvu"},
|
|
1469
|
+
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
|
1470
|
+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
|
|
1471
|
+
[](common_params & params) {
|
|
1472
|
+
params.kv_unified = true;
|
|
1473
|
+
}
|
|
1474
|
+
).set_env("LLAMA_ARG_KV_SPLIT"));
|
|
1467
1475
|
add_opt(common_arg(
|
|
1468
1476
|
{"--no-context-shift"},
|
|
1469
1477
|
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
@@ -1604,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1604
1612
|
[](common_params & params, const std::string & value) {
|
|
1605
1613
|
params.antiprompt.emplace_back(value);
|
|
1606
1614
|
}
|
|
1607
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1615
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
|
1608
1616
|
add_opt(common_arg(
|
|
1609
1617
|
{"-sp", "--special"},
|
|
1610
1618
|
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
|
@@ -2647,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2647
2655
|
params.i_chunk = value;
|
|
2648
2656
|
}
|
|
2649
2657
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2658
|
+
add_opt(common_arg(
|
|
2659
|
+
{"--show-statistics"},
|
|
2660
|
+
string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
|
|
2661
|
+
[](common_params & params) {
|
|
2662
|
+
params.show_statistics = true;
|
|
2663
|
+
}
|
|
2664
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2650
2665
|
add_opt(common_arg(
|
|
2651
2666
|
{"--parse-special"},
|
|
2652
2667
|
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
|
@@ -2734,6 +2749,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2734
2749
|
params.public_path = value;
|
|
2735
2750
|
}
|
|
2736
2751
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
|
2752
|
+
add_opt(common_arg(
|
|
2753
|
+
{"--api-prefix"}, "PREFIX",
|
|
2754
|
+
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
|
|
2755
|
+
[](common_params & params, const std::string & value) {
|
|
2756
|
+
params.api_prefix = value;
|
|
2757
|
+
}
|
|
2758
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
|
2737
2759
|
add_opt(common_arg(
|
|
2738
2760
|
{"--no-webui"},
|
|
2739
2761
|
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
|
|
@@ -2794,6 +2816,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2794
2816
|
params.ssl_file_cert = value;
|
|
2795
2817
|
}
|
|
2796
2818
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
|
2819
|
+
add_opt(common_arg(
|
|
2820
|
+
{"--chat-template-kwargs"}, "STRING",
|
|
2821
|
+
string_format("sets additional params for the json template parser"),
|
|
2822
|
+
[](common_params & params, const std::string & value) {
|
|
2823
|
+
auto parsed = json::parse(value);
|
|
2824
|
+
for (const auto & item : parsed.items()) {
|
|
2825
|
+
params.default_template_kwargs[item.key()] = item.value().dump();
|
|
2826
|
+
}
|
|
2827
|
+
}
|
|
2828
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
|
2797
2829
|
add_opt(common_arg(
|
|
2798
2830
|
{"-to", "--timeout"}, "N",
|
|
2799
2831
|
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
|
@@ -3406,5 +3438,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3406
3438
|
}
|
|
3407
3439
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
3408
3440
|
|
|
3441
|
+
// diffusion parameters
|
|
3442
|
+
add_opt(common_arg(
|
|
3443
|
+
{ "--diffusion-steps" }, "N",
|
|
3444
|
+
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
|
|
3445
|
+
[](common_params & params, int value) { params.diffusion.steps = value; }
|
|
3446
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3447
|
+
add_opt(common_arg(
|
|
3448
|
+
{ "--diffusion-eps" }, "F",
|
|
3449
|
+
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
|
|
3450
|
+
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
|
|
3451
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3452
|
+
add_opt(common_arg(
|
|
3453
|
+
{ "--diffusion-algorithm" }, "N",
|
|
3454
|
+
string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
|
|
3455
|
+
params.diffusion.algorithm),
|
|
3456
|
+
[](common_params & params, int value) { params.diffusion.algorithm = value; }
|
|
3457
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3458
|
+
add_opt(common_arg(
|
|
3459
|
+
{ "--diffusion-alg-temp" }, "F",
|
|
3460
|
+
string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
|
|
3461
|
+
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
|
|
3462
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3463
|
+
add_opt(common_arg(
|
|
3464
|
+
{ "--diffusion-visual" },
|
|
3465
|
+
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
|
|
3466
|
+
params.diffusion.visual_mode ? "true" : "false"),
|
|
3467
|
+
[](common_params & params) { params.diffusion.visual_mode = true; }
|
|
3468
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3469
|
+
|
|
3409
3470
|
return ctx_arg;
|
|
3410
3471
|
}
|
|
@@ -17,6 +17,8 @@
|
|
|
17
17
|
#include <string>
|
|
18
18
|
#include <vector>
|
|
19
19
|
|
|
20
|
+
using json = nlohmann::ordered_json;
|
|
21
|
+
|
|
20
22
|
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
|
|
21
23
|
auto time = std::chrono::system_clock::to_time_t(now);
|
|
22
24
|
auto local_time = *std::localtime(&time);
|
|
@@ -140,6 +142,7 @@ struct templates_params {
|
|
|
140
142
|
bool add_generation_prompt = true;
|
|
141
143
|
bool enable_thinking = true;
|
|
142
144
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
145
|
+
json extra_context;
|
|
143
146
|
};
|
|
144
147
|
|
|
145
148
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
|
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
|
|
|
720
723
|
|
|
721
724
|
static std::string apply(
|
|
722
725
|
const common_chat_template & tmpl,
|
|
723
|
-
const
|
|
724
|
-
const
|
|
725
|
-
|
|
726
|
-
const
|
|
726
|
+
const struct templates_params & inputs,
|
|
727
|
+
const std::optional<json> & messages_override = std::nullopt,
|
|
728
|
+
const std::optional<json> & tools_override = std::nullopt,
|
|
729
|
+
const std::optional<json> & additional_context = std::nullopt)
|
|
727
730
|
{
|
|
728
731
|
minja::chat_template_inputs tmpl_inputs;
|
|
729
|
-
tmpl_inputs.messages = messages;
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
732
|
+
tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
|
|
733
|
+
if (tools_override) {
|
|
734
|
+
tmpl_inputs.tools = *tools_override;
|
|
735
|
+
} else {
|
|
736
|
+
tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
|
|
737
|
+
}
|
|
738
|
+
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
|
|
739
|
+
tmpl_inputs.extra_context = inputs.extra_context;
|
|
740
|
+
if (additional_context) {
|
|
741
|
+
tmpl_inputs.extra_context.merge_patch(*additional_context);
|
|
742
|
+
}
|
|
733
743
|
// TODO: add flag to control date/time, if only for testing purposes.
|
|
734
744
|
// tmpl_inputs.now = std::chrono::system_clock::now();
|
|
735
745
|
|
|
@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
|
|
|
828
838
|
inputs.messages,
|
|
829
839
|
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
|
|
830
840
|
|
|
831
|
-
data.prompt = apply(tmpl,
|
|
841
|
+
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
|
|
832
842
|
data.format = COMMON_CHAT_FORMAT_GENERIC;
|
|
833
843
|
return data;
|
|
834
844
|
}
|
|
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
|
|
|
904
914
|
data.preserved_tokens = {
|
|
905
915
|
"[TOOL_CALLS]",
|
|
906
916
|
};
|
|
907
|
-
data.prompt = apply(tmpl, inputs
|
|
917
|
+
data.prompt = apply(tmpl, inputs);
|
|
908
918
|
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
|
909
919
|
return data;
|
|
910
920
|
}
|
|
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
|
|
|
934
944
|
adjusted_messages.push_back(msg);
|
|
935
945
|
}
|
|
936
946
|
}
|
|
937
|
-
data.prompt = apply(tmpl,
|
|
947
|
+
data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
|
938
948
|
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
|
|
939
949
|
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
|
|
940
950
|
if (!inputs.enable_thinking) {
|
|
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
|
|
|
1122
1132
|
} else {
|
|
1123
1133
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1124
1134
|
}
|
|
1125
|
-
data.prompt = apply(tmpl, inputs
|
|
1135
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
|
|
1126
1136
|
{"date_string", format_time(inputs.now, "%d %b %Y")},
|
|
1127
1137
|
{"tools_in_user_message", false},
|
|
1128
1138
|
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
|
|
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
|
|
|
1187
1197
|
|
|
1188
1198
|
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1189
1199
|
common_chat_params data;
|
|
1190
|
-
auto prompt = apply(tmpl, inputs
|
|
1200
|
+
auto prompt = apply(tmpl, inputs);
|
|
1191
1201
|
|
|
1192
1202
|
// Hacks to fix the official (broken) prompt.
|
|
1193
1203
|
// It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
|
|
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
|
|
|
1282
1292
|
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1283
1293
|
LOG_DBG("%s\n", __func__);
|
|
1284
1294
|
common_chat_params data;
|
|
1285
|
-
data.prompt = apply(tmpl, inputs
|
|
1295
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
|
|
1286
1296
|
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
|
|
1287
1297
|
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
|
1288
1298
|
});
|
|
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
|
|
1338
1348
|
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
|
|
1339
1349
|
// If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
|
|
1340
1350
|
common_chat_params data;
|
|
1341
|
-
data.prompt = apply(tmpl, inputs
|
|
1351
|
+
data.prompt = apply(tmpl, inputs);
|
|
1342
1352
|
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
|
|
1343
1353
|
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
|
1344
1354
|
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
|
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
|
|
|
1465
1475
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1466
1476
|
}
|
|
1467
1477
|
|
|
1468
|
-
data.prompt = apply(tmpl, inputs
|
|
1478
|
+
data.prompt = apply(tmpl, inputs);
|
|
1469
1479
|
// TODO: if (has_raw_python)
|
|
1470
1480
|
return data;
|
|
1471
1481
|
}
|
|
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
|
|
|
1498
1508
|
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1499
1509
|
common_chat_params data;
|
|
1500
1510
|
|
|
1501
|
-
json
|
|
1511
|
+
json extra_context = json {
|
|
1502
1512
|
{"enable_thinking", inputs.enable_thinking},
|
|
1503
1513
|
};
|
|
1514
|
+
extra_context.update(inputs.extra_context);
|
|
1504
1515
|
|
|
1505
|
-
data.prompt = apply(tmpl, inputs
|
|
1516
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
|
|
1506
1517
|
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
|
|
1507
1518
|
if (string_ends_with(data.prompt, "<think>\n")) {
|
|
1508
|
-
if (!
|
|
1519
|
+
if (!extra_context["enable_thinking"]) {
|
|
1509
1520
|
data.prompt += "</think>";
|
|
1510
1521
|
} else {
|
|
1511
1522
|
data.thinking_forced_open = true;
|
|
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
|
|
1691
1702
|
|
|
1692
1703
|
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1693
1704
|
common_chat_params data;
|
|
1694
|
-
data.prompt = apply(tmpl, inputs
|
|
1705
|
+
data.prompt = apply(tmpl, inputs);
|
|
1695
1706
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1696
1707
|
data.grammar_lazy = false;
|
|
1697
1708
|
if (!inputs.json_schema.is_null()) {
|
|
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
1722
1733
|
params.enable_thinking = inputs.enable_thinking;
|
|
1723
1734
|
params.grammar = inputs.grammar;
|
|
1724
1735
|
params.now = inputs.now;
|
|
1736
|
+
|
|
1737
|
+
params.extra_context = json::object();
|
|
1738
|
+
for (auto el : inputs.chat_template_kwargs) {
|
|
1739
|
+
params.extra_context[el.first] = json::parse(el.second);
|
|
1740
|
+
}
|
|
1741
|
+
|
|
1725
1742
|
if (!inputs.json_schema.empty()) {
|
|
1726
1743
|
params.json_schema = json::parse(inputs.json_schema);
|
|
1727
1744
|
}
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
#include <chrono>
|
|
8
8
|
#include <string>
|
|
9
9
|
#include <vector>
|
|
10
|
+
#include <map>
|
|
10
11
|
|
|
11
12
|
struct common_chat_templates;
|
|
12
13
|
|
|
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
|
|
|
125
126
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
126
127
|
bool enable_thinking = true;
|
|
127
128
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
129
|
+
std::map<std::string, std::string> chat_template_kwargs;
|
|
128
130
|
};
|
|
129
131
|
|
|
130
132
|
struct common_chat_params {
|
|
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
|
|
448
448
|
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
|
|
449
449
|
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
|
450
450
|
}
|
|
451
|
+
|
|
452
|
+
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
|
|
453
|
+
bool has_suffix = string_ends_with(str, suffix);
|
|
454
|
+
if (has_suffix) {
|
|
455
|
+
str = str.substr(0, str.size() - suffix.size());
|
|
456
|
+
}
|
|
457
|
+
return has_suffix;
|
|
458
|
+
}
|
|
459
|
+
|
|
451
460
|
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
|
|
452
461
|
if (!str.empty() && !stop.empty()) {
|
|
453
462
|
const char text_last_char = str.back();
|
|
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
1005
1014
|
params.sampling.ignore_eos = false;
|
|
1006
1015
|
}
|
|
1007
1016
|
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
}
|
|
1017
|
+
// initialize once
|
|
1018
|
+
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
|
1019
|
+
if (llama_vocab_is_eog(vocab, i)) {
|
|
1020
|
+
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
|
|
1021
|
+
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
|
|
1014
1022
|
}
|
|
1015
1023
|
}
|
|
1016
1024
|
|
|
1025
|
+
if (params.sampling.ignore_eos) {
|
|
1026
|
+
// add EOG biases to the active set of logit biases
|
|
1027
|
+
params.sampling.logit_bias.insert(
|
|
1028
|
+
params.sampling.logit_bias.end(),
|
|
1029
|
+
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1017
1032
|
if (params.sampling.penalty_last_n == -1) {
|
|
1018
1033
|
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
|
1019
1034
|
params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
|
@@ -1157,6 +1172,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
|
|
1157
1172
|
cparams.no_perf = params.no_perf;
|
|
1158
1173
|
cparams.op_offload = !params.no_op_offload;
|
|
1159
1174
|
cparams.swa_full = params.swa_full;
|
|
1175
|
+
cparams.kv_unified = params.kv_unified;
|
|
1160
1176
|
|
|
1161
1177
|
cparams.type_k = params.cache_type_k;
|
|
1162
1178
|
cparams.type_v = params.cache_type_v;
|