@novastera-oss/llamarn 0.2.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +17 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.h +4 -0
- package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +0 -40
- package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
- package/cpp/llama.cpp/src/llama-arch.h +18 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
- package/cpp/llama.cpp/src/llama-batch.h +8 -1
- package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
- package/cpp/llama.cpp/src/llama-graph.h +47 -60
- package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
- package/cpp/llama.cpp/src/llama-hparams.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
- package/cpp/llama.cpp/src/llama-model.h +18 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
- package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
- package/cpp/llama.cpp/src/llama-vocab.h +41 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +4 -0
- package/ios/include/llama.h +0 -40
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/build-info.cpp
CHANGED
|
@@ -120,7 +120,6 @@ endfunction()
|
|
|
120
120
|
|
|
121
121
|
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
|
122
122
|
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
|
123
|
-
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
|
124
123
|
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
|
125
124
|
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
|
126
125
|
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
package/cpp/llama.cpp/README.md
CHANGED
|
@@ -6,9 +6,9 @@
|
|
|
6
6
|
[](https://github.com/ggml-org/llama.cpp/releases)
|
|
7
7
|
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
|
8
8
|
|
|
9
|
-
[
|
|
9
|
+
[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
LLM inference in C/C++
|
|
12
12
|
|
|
13
13
|
## Recent API changes
|
|
14
14
|
|
|
@@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|
|
17
17
|
|
|
18
18
|
## Hot topics
|
|
19
19
|
|
|
20
|
-
-
|
|
21
|
-
-
|
|
20
|
+
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
|
21
|
+
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
|
22
22
|
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
|
23
|
-
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
|
24
23
|
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
|
25
24
|
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
|
26
25
|
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
|
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
|
|
|
86
86
|
endif()
|
|
87
87
|
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
|
|
88
88
|
include_directories(${CURL_INCLUDE_DIRS})
|
|
89
|
-
|
|
90
|
-
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
|
|
89
|
+
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
|
|
91
90
|
endif ()
|
|
92
91
|
|
|
93
92
|
if (LLAMA_LLGUIDANCE)
|
|
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
|
|
|
112
111
|
|
|
113
112
|
ExternalProject_Add(llguidance_ext
|
|
114
113
|
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
|
115
|
-
#
|
|
116
|
-
GIT_TAG
|
|
114
|
+
# v1.0.1:
|
|
115
|
+
GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
|
|
117
116
|
PREFIX ${CMAKE_BINARY_DIR}/llguidance
|
|
118
117
|
SOURCE_DIR ${LLGUIDANCE_SRC}
|
|
119
118
|
BUILD_IN_SOURCE TRUE
|
|
120
119
|
CONFIGURE_COMMAND ""
|
|
121
|
-
BUILD_COMMAND cargo build --release
|
|
120
|
+
BUILD_COMMAND cargo build --release --package llguidance
|
|
122
121
|
INSTALL_COMMAND ""
|
|
123
122
|
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
|
|
124
123
|
UPDATE_COMMAND ""
|
|
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2734
2734
|
params.public_path = value;
|
|
2735
2735
|
}
|
|
2736
2736
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
|
2737
|
+
add_opt(common_arg(
|
|
2738
|
+
{"--api-prefix"}, "PREFIX",
|
|
2739
|
+
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
|
|
2740
|
+
[](common_params & params, const std::string & value) {
|
|
2741
|
+
params.api_prefix = value;
|
|
2742
|
+
}
|
|
2743
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
|
2737
2744
|
add_opt(common_arg(
|
|
2738
2745
|
{"--no-webui"},
|
|
2739
2746
|
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
|
|
@@ -2794,6 +2801,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2794
2801
|
params.ssl_file_cert = value;
|
|
2795
2802
|
}
|
|
2796
2803
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
|
2804
|
+
add_opt(common_arg(
|
|
2805
|
+
{"--chat-template-kwargs"}, "STRING",
|
|
2806
|
+
string_format("sets additional params for the json template parser"),
|
|
2807
|
+
[](common_params & params, const std::string & value) {
|
|
2808
|
+
auto parsed = json::parse(value);
|
|
2809
|
+
for (const auto & item : parsed.items()) {
|
|
2810
|
+
params.default_template_kwargs[item.key()] = item.value().dump();
|
|
2811
|
+
}
|
|
2812
|
+
}
|
|
2813
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
|
2797
2814
|
add_opt(common_arg(
|
|
2798
2815
|
{"-to", "--timeout"}, "N",
|
|
2799
2816
|
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
|
@@ -17,6 +17,8 @@
|
|
|
17
17
|
#include <string>
|
|
18
18
|
#include <vector>
|
|
19
19
|
|
|
20
|
+
using json = nlohmann::ordered_json;
|
|
21
|
+
|
|
20
22
|
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
|
|
21
23
|
auto time = std::chrono::system_clock::to_time_t(now);
|
|
22
24
|
auto local_time = *std::localtime(&time);
|
|
@@ -140,6 +142,7 @@ struct templates_params {
|
|
|
140
142
|
bool add_generation_prompt = true;
|
|
141
143
|
bool enable_thinking = true;
|
|
142
144
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
145
|
+
json extra_context;
|
|
143
146
|
};
|
|
144
147
|
|
|
145
148
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
|
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
|
|
|
720
723
|
|
|
721
724
|
static std::string apply(
|
|
722
725
|
const common_chat_template & tmpl,
|
|
723
|
-
const
|
|
724
|
-
const
|
|
725
|
-
|
|
726
|
-
const
|
|
726
|
+
const struct templates_params & inputs,
|
|
727
|
+
const std::optional<json> & messages_override = std::nullopt,
|
|
728
|
+
const std::optional<json> & tools_override = std::nullopt,
|
|
729
|
+
const std::optional<json> & additional_context = std::nullopt)
|
|
727
730
|
{
|
|
728
731
|
minja::chat_template_inputs tmpl_inputs;
|
|
729
|
-
tmpl_inputs.messages = messages;
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
732
|
+
tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
|
|
733
|
+
if (tools_override) {
|
|
734
|
+
tmpl_inputs.tools = *tools_override;
|
|
735
|
+
} else {
|
|
736
|
+
tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
|
|
737
|
+
}
|
|
738
|
+
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
|
|
739
|
+
tmpl_inputs.extra_context = inputs.extra_context;
|
|
740
|
+
if (additional_context) {
|
|
741
|
+
tmpl_inputs.extra_context.merge_patch(*additional_context);
|
|
742
|
+
}
|
|
733
743
|
// TODO: add flag to control date/time, if only for testing purposes.
|
|
734
744
|
// tmpl_inputs.now = std::chrono::system_clock::now();
|
|
735
745
|
|
|
@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
|
|
|
828
838
|
inputs.messages,
|
|
829
839
|
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
|
|
830
840
|
|
|
831
|
-
data.prompt = apply(tmpl,
|
|
841
|
+
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
|
|
832
842
|
data.format = COMMON_CHAT_FORMAT_GENERIC;
|
|
833
843
|
return data;
|
|
834
844
|
}
|
|
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
|
|
|
904
914
|
data.preserved_tokens = {
|
|
905
915
|
"[TOOL_CALLS]",
|
|
906
916
|
};
|
|
907
|
-
data.prompt = apply(tmpl, inputs
|
|
917
|
+
data.prompt = apply(tmpl, inputs);
|
|
908
918
|
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
|
909
919
|
return data;
|
|
910
920
|
}
|
|
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
|
|
|
934
944
|
adjusted_messages.push_back(msg);
|
|
935
945
|
}
|
|
936
946
|
}
|
|
937
|
-
data.prompt = apply(tmpl,
|
|
947
|
+
data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
|
938
948
|
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
|
|
939
949
|
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
|
|
940
950
|
if (!inputs.enable_thinking) {
|
|
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
|
|
|
1122
1132
|
} else {
|
|
1123
1133
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1124
1134
|
}
|
|
1125
|
-
data.prompt = apply(tmpl, inputs
|
|
1135
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
|
|
1126
1136
|
{"date_string", format_time(inputs.now, "%d %b %Y")},
|
|
1127
1137
|
{"tools_in_user_message", false},
|
|
1128
1138
|
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
|
|
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
|
|
|
1187
1197
|
|
|
1188
1198
|
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1189
1199
|
common_chat_params data;
|
|
1190
|
-
auto prompt = apply(tmpl, inputs
|
|
1200
|
+
auto prompt = apply(tmpl, inputs);
|
|
1191
1201
|
|
|
1192
1202
|
// Hacks to fix the official (broken) prompt.
|
|
1193
1203
|
// It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
|
|
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
|
|
|
1282
1292
|
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1283
1293
|
LOG_DBG("%s\n", __func__);
|
|
1284
1294
|
common_chat_params data;
|
|
1285
|
-
data.prompt = apply(tmpl, inputs
|
|
1295
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
|
|
1286
1296
|
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
|
|
1287
1297
|
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
|
1288
1298
|
});
|
|
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
|
|
1338
1348
|
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
|
|
1339
1349
|
// If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
|
|
1340
1350
|
common_chat_params data;
|
|
1341
|
-
data.prompt = apply(tmpl, inputs
|
|
1351
|
+
data.prompt = apply(tmpl, inputs);
|
|
1342
1352
|
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
|
|
1343
1353
|
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
|
1344
1354
|
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
|
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
|
|
|
1465
1475
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1466
1476
|
}
|
|
1467
1477
|
|
|
1468
|
-
data.prompt = apply(tmpl, inputs
|
|
1478
|
+
data.prompt = apply(tmpl, inputs);
|
|
1469
1479
|
// TODO: if (has_raw_python)
|
|
1470
1480
|
return data;
|
|
1471
1481
|
}
|
|
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
|
|
|
1498
1508
|
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1499
1509
|
common_chat_params data;
|
|
1500
1510
|
|
|
1501
|
-
json
|
|
1511
|
+
json extra_context = json {
|
|
1502
1512
|
{"enable_thinking", inputs.enable_thinking},
|
|
1503
1513
|
};
|
|
1514
|
+
extra_context.update(inputs.extra_context);
|
|
1504
1515
|
|
|
1505
|
-
data.prompt = apply(tmpl, inputs
|
|
1516
|
+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
|
|
1506
1517
|
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
|
|
1507
1518
|
if (string_ends_with(data.prompt, "<think>\n")) {
|
|
1508
|
-
if (!
|
|
1519
|
+
if (!extra_context["enable_thinking"]) {
|
|
1509
1520
|
data.prompt += "</think>";
|
|
1510
1521
|
} else {
|
|
1511
1522
|
data.thinking_forced_open = true;
|
|
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
|
|
1691
1702
|
|
|
1692
1703
|
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1693
1704
|
common_chat_params data;
|
|
1694
|
-
data.prompt = apply(tmpl, inputs
|
|
1705
|
+
data.prompt = apply(tmpl, inputs);
|
|
1695
1706
|
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
|
1696
1707
|
data.grammar_lazy = false;
|
|
1697
1708
|
if (!inputs.json_schema.is_null()) {
|
|
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
1722
1733
|
params.enable_thinking = inputs.enable_thinking;
|
|
1723
1734
|
params.grammar = inputs.grammar;
|
|
1724
1735
|
params.now = inputs.now;
|
|
1736
|
+
|
|
1737
|
+
params.extra_context = json::object();
|
|
1738
|
+
for (auto el : inputs.chat_template_kwargs) {
|
|
1739
|
+
params.extra_context[el.first] = json::parse(el.second);
|
|
1740
|
+
}
|
|
1741
|
+
|
|
1725
1742
|
if (!inputs.json_schema.empty()) {
|
|
1726
1743
|
params.json_schema = json::parse(inputs.json_schema);
|
|
1727
1744
|
}
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
#include <chrono>
|
|
8
8
|
#include <string>
|
|
9
9
|
#include <vector>
|
|
10
|
+
#include <map>
|
|
10
11
|
|
|
11
12
|
struct common_chat_templates;
|
|
12
13
|
|
|
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
|
|
|
125
126
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
126
127
|
bool enable_thinking = true;
|
|
127
128
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
129
|
+
std::map<std::string, std::string> chat_template_kwargs;
|
|
128
130
|
};
|
|
129
131
|
|
|
130
132
|
struct common_chat_params {
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
#include <string>
|
|
9
9
|
#include <string_view>
|
|
10
10
|
#include <vector>
|
|
11
|
+
#include <map>
|
|
11
12
|
#include <sstream>
|
|
12
13
|
|
|
13
14
|
#ifdef _WIN32
|
|
@@ -369,6 +370,7 @@ struct common_params {
|
|
|
369
370
|
|
|
370
371
|
std::string hostname = "127.0.0.1";
|
|
371
372
|
std::string public_path = ""; // NOLINT
|
|
373
|
+
std::string api_prefix = ""; // NOLINT
|
|
372
374
|
std::string chat_template = ""; // NOLINT
|
|
373
375
|
bool use_jinja = false; // NOLINT
|
|
374
376
|
bool enable_chat_template = true;
|
|
@@ -381,6 +383,8 @@ struct common_params {
|
|
|
381
383
|
std::string ssl_file_key = ""; // NOLINT
|
|
382
384
|
std::string ssl_file_cert = ""; // NOLINT
|
|
383
385
|
|
|
386
|
+
std::map<std::string, std::string> default_template_kwargs;
|
|
387
|
+
|
|
384
388
|
// "advanced" endpoints are disabled by default for better security
|
|
385
389
|
bool webui = true;
|
|
386
390
|
bool endpoint_slots = false;
|