@novastera-oss/llamarn 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +140 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +48 -67
- package/cpp/LlamaCppModel.h +8 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +33 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
- package/cpp/llama.cpp/common/arg.cpp +38 -12
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
- package/cpp/llama.cpp/common/chat-parser.h +4 -1
- package/cpp/llama.cpp/common/chat.cpp +16 -13
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +52 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +140 -38
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
- package/cpp/llama.cpp/src/llama-batch.h +47 -17
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +488 -313
- package/cpp/llama.cpp/src/llama-context.h +38 -17
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
- package/cpp/llama.cpp/src/llama-graph.h +109 -52
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
- package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +89 -4
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +735 -143
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +65 -10
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +140 -38
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
|
|
|
89
89
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
|
90
90
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
|
|
91
91
|
|
|
92
|
+
if (NOT DEFINED LLAMA_BUILD_NUMBER)
|
|
93
|
+
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
|
94
|
+
endif()
|
|
95
|
+
if (NOT DEFINED LLAMA_BUILD_COMMIT)
|
|
96
|
+
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
|
97
|
+
endif()
|
|
98
|
+
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
|
99
|
+
|
|
92
100
|
# override ggml options
|
|
93
101
|
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
|
94
102
|
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
|
@@ -155,10 +163,17 @@ if (LLAMA_USE_SYSTEM_GGML)
|
|
|
155
163
|
endif()
|
|
156
164
|
|
|
157
165
|
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
|
|
166
|
+
set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
|
|
167
|
+
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
|
|
158
168
|
add_subdirectory(ggml)
|
|
159
169
|
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
|
160
170
|
endif()
|
|
161
171
|
|
|
172
|
+
if (MINGW)
|
|
173
|
+
# Target Windows 8 for PrefetchVirtualMemory
|
|
174
|
+
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
|
175
|
+
endif()
|
|
176
|
+
|
|
162
177
|
#
|
|
163
178
|
# build the library
|
|
164
179
|
#
|
|
@@ -199,10 +214,6 @@ endif()
|
|
|
199
214
|
include(GNUInstallDirs)
|
|
200
215
|
include(CMakePackageConfigHelpers)
|
|
201
216
|
|
|
202
|
-
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
|
203
|
-
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
|
204
|
-
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
|
205
|
-
|
|
206
217
|
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
|
207
218
|
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
|
208
219
|
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
package/cpp/llama.cpp/Makefile
CHANGED
|
@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
|
|
|
367
367
|
endif
|
|
368
368
|
|
|
369
369
|
ifndef GGML_NO_CPU_AARCH64
|
|
370
|
-
MK_CPPFLAGS += -
|
|
370
|
+
MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
|
|
371
371
|
endif
|
|
372
372
|
|
|
373
373
|
# warnings
|
|
@@ -970,7 +970,7 @@ OBJ_GGML = \
|
|
|
970
970
|
$(DIR_GGML)/src/ggml-threading.o \
|
|
971
971
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
|
|
972
972
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
|
|
973
|
-
$(DIR_GGML)/src/ggml-cpu/
|
|
973
|
+
$(DIR_GGML)/src/ggml-cpu/repack.o \
|
|
974
974
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
|
|
975
975
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
|
|
976
976
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
|
package/cpp/llama.cpp/README.md
CHANGED
|
@@ -3,9 +3,10 @@
|
|
|
3
3
|

|
|
4
4
|
|
|
5
5
|
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
[](https://github.com/ggml-org/llama.cpp/releases)
|
|
6
7
|
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
|
7
8
|
|
|
8
|
-
[Roadmap](https://github.com/users/ggerganov/projects/7) / [
|
|
9
|
+
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
|
9
10
|
|
|
10
11
|
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
|
11
12
|
|
|
@@ -17,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|
|
17
18
|
## Hot topics
|
|
18
19
|
|
|
19
20
|
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
|
20
|
-
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
|
21
21
|
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
|
|
22
22
|
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
|
23
23
|
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
|
@@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|
|
28
28
|
|
|
29
29
|
----
|
|
30
30
|
|
|
31
|
+
## Quick start
|
|
32
|
+
|
|
33
|
+
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
|
|
34
|
+
|
|
35
|
+
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
|
|
36
|
+
- Run with Docker - see our [Docker documentation](docs/docker.md)
|
|
37
|
+
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
|
|
38
|
+
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
|
|
39
|
+
|
|
40
|
+
Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
|
|
41
|
+
|
|
42
|
+
Example command:
|
|
43
|
+
|
|
44
|
+
```sh
|
|
45
|
+
# Use a local model file
|
|
46
|
+
llama-cli -m my_model.gguf
|
|
47
|
+
|
|
48
|
+
# Or download and run a model directly from Hugging Face
|
|
49
|
+
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
|
|
50
|
+
|
|
51
|
+
# Launch OpenAI-compatible API server
|
|
52
|
+
llama-server -hf ggml-org/gemma-3-1b-it-GGUF
|
|
53
|
+
```
|
|
54
|
+
|
|
31
55
|
## Description
|
|
32
56
|
|
|
33
57
|
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
|
@@ -130,6 +154,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
130
154
|
<details>
|
|
131
155
|
<summary>Bindings</summary>
|
|
132
156
|
|
|
157
|
+
- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
|
|
133
158
|
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
|
134
159
|
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
|
135
160
|
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
|
@@ -229,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
229
254
|
|
|
230
255
|
</details>
|
|
231
256
|
|
|
257
|
+
|
|
232
258
|
## Supported backends
|
|
233
259
|
|
|
234
260
|
| Backend | Target devices |
|
|
@@ -245,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
245
271
|
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
|
246
272
|
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
|
247
273
|
|
|
248
|
-
## Building the project
|
|
249
|
-
|
|
250
|
-
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
|
|
251
|
-
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
|
|
252
|
-
|
|
253
|
-
- Clone this repository and build locally, see [how to build](docs/build.md)
|
|
254
|
-
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
|
|
255
|
-
- Use a Docker image, see [documentation for Docker](docs/docker.md)
|
|
256
|
-
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
|
|
257
|
-
|
|
258
274
|
## Obtaining and quantizing models
|
|
259
275
|
|
|
260
276
|
The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
|
|
@@ -262,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
|
|
|
262
278
|
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
|
|
263
279
|
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
|
|
264
280
|
|
|
265
|
-
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
|
|
281
|
+
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
|
|
282
|
+
|
|
283
|
+
```sh
|
|
284
|
+
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
|
|
285
|
+
```
|
|
266
286
|
|
|
267
287
|
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
|
|
268
288
|
|
|
@@ -7,8 +7,8 @@ llama_add_compile_flags()
|
|
|
7
7
|
# Build info header
|
|
8
8
|
#
|
|
9
9
|
|
|
10
|
-
if(EXISTS "${
|
|
11
|
-
set(GIT_DIR "${
|
|
10
|
+
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
|
11
|
+
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
|
|
12
12
|
|
|
13
13
|
# Is git submodule
|
|
14
14
|
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
|
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
|
|
18
18
|
if (SLASH_POS EQUAL 0)
|
|
19
19
|
set(GIT_DIR "${REAL_GIT_DIR}")
|
|
20
20
|
else()
|
|
21
|
-
set(GIT_DIR "${
|
|
21
|
+
set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
|
|
22
22
|
endif()
|
|
23
23
|
endif()
|
|
24
24
|
|
|
25
25
|
if(EXISTS "${GIT_DIR}/index")
|
|
26
|
-
|
|
26
|
+
# For build-info.cpp below
|
|
27
|
+
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
|
|
27
28
|
else()
|
|
28
29
|
message(WARNING "Git index not found in git repository.")
|
|
29
|
-
set(GIT_INDEX "")
|
|
30
30
|
endif()
|
|
31
31
|
else()
|
|
32
32
|
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
|
33
|
-
set(GIT_INDEX "")
|
|
34
33
|
endif()
|
|
35
34
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
|
41
|
-
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
|
42
|
-
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
|
|
43
|
-
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
|
|
44
|
-
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
|
45
|
-
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
|
46
|
-
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
|
47
|
-
VERBATIM
|
|
48
|
-
)
|
|
35
|
+
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
|
|
36
|
+
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
|
|
37
|
+
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
|
38
|
+
|
|
49
39
|
set(TARGET build_info)
|
|
50
|
-
add_library(${TARGET} OBJECT
|
|
40
|
+
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
|
|
51
41
|
if (BUILD_SHARED_LIBS)
|
|
52
42
|
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
53
43
|
endif()
|
|
@@ -58,23 +48,20 @@ add_library(${TARGET} STATIC
|
|
|
58
48
|
arg.cpp
|
|
59
49
|
arg.h
|
|
60
50
|
base64.hpp
|
|
61
|
-
chat.cpp
|
|
62
|
-
chat.h
|
|
63
51
|
chat-parser.cpp
|
|
64
52
|
chat-parser.h
|
|
53
|
+
chat.cpp
|
|
54
|
+
chat.h
|
|
65
55
|
common.cpp
|
|
66
56
|
common.h
|
|
67
57
|
console.cpp
|
|
68
58
|
console.h
|
|
69
|
-
json-schema-to-grammar.cpp
|
|
70
|
-
json.hpp
|
|
71
|
-
json-partial.h
|
|
72
59
|
json-partial.cpp
|
|
60
|
+
json-partial.h
|
|
61
|
+
json-schema-to-grammar.cpp
|
|
73
62
|
llguidance.cpp
|
|
74
63
|
log.cpp
|
|
75
64
|
log.h
|
|
76
|
-
minja/chat-template.hpp
|
|
77
|
-
minja/minja.hpp
|
|
78
65
|
ngram-cache.cpp
|
|
79
66
|
ngram-cache.h
|
|
80
67
|
regex-partial.cpp
|
|
@@ -147,7 +134,7 @@ if (LLAMA_LLGUIDANCE)
|
|
|
147
134
|
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
|
|
148
135
|
endif ()
|
|
149
136
|
|
|
150
|
-
target_include_directories(${TARGET} PUBLIC .)
|
|
137
|
+
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
|
151
138
|
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
|
152
139
|
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
|
153
140
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
#include "gguf.h" // for reading GGUF splits
|
|
2
1
|
#include "arg.h"
|
|
3
2
|
|
|
3
|
+
#include "chat.h"
|
|
4
4
|
#include "common.h"
|
|
5
|
+
#include "gguf.h" // for reading GGUF splits
|
|
6
|
+
#include "json-schema-to-grammar.h"
|
|
5
7
|
#include "log.h"
|
|
6
8
|
#include "sampling.h"
|
|
7
|
-
#include "chat.h"
|
|
8
9
|
|
|
9
10
|
// fix problem with std::min and std::max
|
|
10
11
|
#if defined(_WIN32)
|
|
@@ -15,6 +16,9 @@
|
|
|
15
16
|
#include <windows.h>
|
|
16
17
|
#endif
|
|
17
18
|
|
|
19
|
+
#define JSON_ASSERT GGML_ASSERT
|
|
20
|
+
#include <nlohmann/json.hpp>
|
|
21
|
+
|
|
18
22
|
#include <algorithm>
|
|
19
23
|
#include <climits>
|
|
20
24
|
#include <cstdarg>
|
|
@@ -34,8 +38,6 @@
|
|
|
34
38
|
#include <future>
|
|
35
39
|
#endif
|
|
36
40
|
|
|
37
|
-
#include "json-schema-to-grammar.h"
|
|
38
|
-
|
|
39
41
|
using json = nlohmann::ordered_json;
|
|
40
42
|
|
|
41
43
|
std::initializer_list<enum llama_example> mmproj_examples = {
|
|
@@ -986,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
986
988
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
987
989
|
}
|
|
988
990
|
|
|
989
|
-
if (params.reranking && params.embedding) {
|
|
990
|
-
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
|
991
|
-
}
|
|
992
|
-
|
|
993
991
|
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
|
994
992
|
throw std::runtime_error(string_format(
|
|
995
993
|
"error: the supplied chat template is not supported: %s%s\n",
|
|
@@ -1346,9 +1344,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1346
1344
|
));
|
|
1347
1345
|
add_opt(common_arg(
|
|
1348
1346
|
{"--prio"}, "N",
|
|
1349
|
-
string_format("set process/thread priority :
|
|
1347
|
+
string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
|
|
1350
1348
|
[](common_params & params, int prio) {
|
|
1351
|
-
if (prio <
|
|
1349
|
+
if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
|
|
1352
1350
|
throw std::invalid_argument("invalid value");
|
|
1353
1351
|
}
|
|
1354
1352
|
params.cpuparams.priority = (enum ggml_sched_priority) prio;
|
|
@@ -2745,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2745
2743
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
|
2746
2744
|
add_opt(common_arg(
|
|
2747
2745
|
{"--reranking", "--rerank"},
|
|
2748
|
-
string_format("enable reranking endpoint on server (default: %s)",
|
|
2746
|
+
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
|
|
2749
2747
|
[](common_params & params) {
|
|
2750
|
-
params.
|
|
2748
|
+
params.embedding = true;
|
|
2749
|
+
params.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
2751
2750
|
}
|
|
2752
2751
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
|
2753
2752
|
add_opt(common_arg(
|
|
@@ -2867,6 +2866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2867
2866
|
"(default: deepseek)",
|
|
2868
2867
|
[](common_params & params, const std::string & value) {
|
|
2869
2868
|
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
|
2869
|
+
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
|
|
2870
2870
|
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
|
2871
2871
|
else { throw std::invalid_argument("invalid value"); }
|
|
2872
2872
|
}
|
|
@@ -3210,6 +3210,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3210
3210
|
params.speculative.model.path = value;
|
|
3211
3211
|
}
|
|
3212
3212
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
|
3213
|
+
add_opt(common_arg(
|
|
3214
|
+
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
|
|
3215
|
+
string_format(
|
|
3216
|
+
"KV cache data type for K for the draft model\n"
|
|
3217
|
+
"allowed values: %s\n"
|
|
3218
|
+
"(default: %s)",
|
|
3219
|
+
get_all_kv_cache_types().c_str(),
|
|
3220
|
+
ggml_type_name(params.speculative.cache_type_k)
|
|
3221
|
+
),
|
|
3222
|
+
[](common_params & params, const std::string & value) {
|
|
3223
|
+
params.speculative.cache_type_k = kv_cache_type_from_str(value);
|
|
3224
|
+
}
|
|
3225
|
+
).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
|
|
3226
|
+
add_opt(common_arg(
|
|
3227
|
+
{"-ctvd", "--cache-type-v-draft"}, "TYPE",
|
|
3228
|
+
string_format(
|
|
3229
|
+
"KV cache data type for V for the draft model\n"
|
|
3230
|
+
"allowed values: %s\n"
|
|
3231
|
+
"(default: %s)",
|
|
3232
|
+
get_all_kv_cache_types().c_str(),
|
|
3233
|
+
ggml_type_name(params.speculative.cache_type_v)
|
|
3234
|
+
),
|
|
3235
|
+
[](common_params & params, const std::string & value) {
|
|
3236
|
+
params.speculative.cache_type_v = kv_cache_type_from_str(value);
|
|
3237
|
+
}
|
|
3238
|
+
).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
|
|
3213
3239
|
|
|
3214
3240
|
add_opt(common_arg(
|
|
3215
3241
|
{"-mv", "--model-vocoder"}, "FNAME",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
int LLAMA_BUILD_NUMBER = @
|
|
2
|
-
char const *LLAMA_COMMIT = "@
|
|
1
|
+
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
|
|
2
|
+
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
|
3
3
|
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
|
4
4
|
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
|
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
|
|
|
49
49
|
|
|
50
50
|
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
|
51
51
|
result_.tool_calls.emplace_back(tool_call);
|
|
52
|
+
|
|
52
53
|
return true;
|
|
53
54
|
}
|
|
54
55
|
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
|
@@ -154,9 +155,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
|
|
|
154
155
|
if (!rest.empty()) {
|
|
155
156
|
handle_reasoning(rest, /* closed */ !is_partial());
|
|
156
157
|
}
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
158
|
+
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
|
|
159
|
+
// if (!syntax_.thinking_forced_open) {
|
|
160
|
+
// throw common_chat_msg_partial_exception(end_think);
|
|
161
|
+
// }
|
|
160
162
|
return true;
|
|
161
163
|
}
|
|
162
164
|
}
|
|
@@ -377,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
|
|
|
377
379
|
/* .is_partial = */ found_healing_marker,
|
|
378
380
|
};
|
|
379
381
|
}
|
|
382
|
+
|
|
383
|
+
void common_chat_msg_parser::clear_tools() {
|
|
384
|
+
result_.tool_calls.clear();
|
|
385
|
+
}
|
|
@@ -2,9 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
#include "chat.h"
|
|
4
4
|
#include "json-partial.h"
|
|
5
|
-
#include "json.hpp"
|
|
6
5
|
#include "regex-partial.h"
|
|
7
6
|
|
|
7
|
+
#include <nlohmann/json.hpp>
|
|
8
|
+
|
|
8
9
|
#include <optional>
|
|
9
10
|
#include <string>
|
|
10
11
|
#include <vector>
|
|
@@ -114,4 +115,6 @@ class common_chat_msg_parser {
|
|
|
114
115
|
const std::vector<std::vector<std::string>> & args_paths = {},
|
|
115
116
|
const std::vector<std::vector<std::string>> & content_paths = {}
|
|
116
117
|
);
|
|
118
|
+
|
|
119
|
+
void clear_tools();
|
|
117
120
|
};
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
#include "chat.h"
|
|
2
2
|
#include "chat-parser.h"
|
|
3
3
|
#include "common.h"
|
|
4
|
+
#include "json-partial.h"
|
|
4
5
|
#include "json-schema-to-grammar.h"
|
|
5
6
|
#include "log.h"
|
|
6
|
-
#include "json-partial.h"
|
|
7
|
-
#include "minja/chat-template.hpp"
|
|
8
|
-
#include "minja/minja.hpp"
|
|
9
7
|
#include "regex-partial.h"
|
|
10
8
|
|
|
9
|
+
#include <minja/chat-template.hpp>
|
|
10
|
+
#include <minja/minja.hpp>
|
|
11
|
+
|
|
11
12
|
#include <cstdio>
|
|
12
13
|
#include <exception>
|
|
13
14
|
#include <iostream>
|
|
@@ -16,7 +17,6 @@
|
|
|
16
17
|
#include <string>
|
|
17
18
|
#include <vector>
|
|
18
19
|
|
|
19
|
-
|
|
20
20
|
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
|
|
21
21
|
auto time = std::chrono::system_clock::to_time_t(now);
|
|
22
22
|
auto local_time = *std::localtime(&time);
|
|
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
|
|
|
82
82
|
|
|
83
83
|
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
|
|
84
84
|
std::vector<common_chat_msg_diff> diffs;
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
85
|
+
if (previous_msg.reasoning_content != new_msg.reasoning_content) {
|
|
86
|
+
auto & diff = diffs.emplace_back();
|
|
87
|
+
diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
|
|
88
|
+
}
|
|
89
89
|
if (previous_msg.content != new_msg.content) {
|
|
90
90
|
auto & diff = diffs.emplace_back();
|
|
91
91
|
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
|
|
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
|
|
|
385
385
|
|
|
386
386
|
template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
|
387
387
|
json delta = json::object();
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
388
|
+
if (!diff.reasoning_content_delta.empty()) {
|
|
389
|
+
delta["reasoning_content"] = diff.reasoning_content_delta;
|
|
390
|
+
}
|
|
391
391
|
if (!diff.content_delta.empty()) {
|
|
392
392
|
delta["content"] = diff.content_delta;
|
|
393
393
|
}
|
|
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
|
|
|
598
598
|
switch (format) {
|
|
599
599
|
case COMMON_REASONING_FORMAT_NONE: return "none";
|
|
600
600
|
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
|
|
601
|
+
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
|
|
601
602
|
default:
|
|
602
603
|
throw std::runtime_error("Unknown reasoning format");
|
|
603
604
|
}
|
|
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
|
|
|
1837
1838
|
if (res < 0) {
|
|
1838
1839
|
// if the custom "tmpl" is not supported, we throw an error
|
|
1839
1840
|
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
|
1840
|
-
throw std::runtime_error("this custom template is not supported");
|
|
1841
|
+
throw std::runtime_error("this custom template is not supported, try using --jinja");
|
|
1841
1842
|
}
|
|
1842
1843
|
|
|
1843
1844
|
// if it turns out that our buffer is too small, we resize it
|
|
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
|
|
1920
1921
|
} catch (const common_chat_msg_partial_exception & ex) {
|
|
1921
1922
|
LOG_DBG("Partial parse: %s\n", ex.what());
|
|
1922
1923
|
if (!is_partial) {
|
|
1923
|
-
|
|
1924
|
+
builder.clear_tools();
|
|
1925
|
+
builder.move_to(0);
|
|
1926
|
+
common_chat_parse_content_only(builder);
|
|
1924
1927
|
}
|
|
1925
1928
|
}
|
|
1926
1929
|
auto msg = builder.result();
|
|
@@ -70,7 +70,7 @@ struct common_chat_msg {
|
|
|
70
70
|
};
|
|
71
71
|
|
|
72
72
|
struct common_chat_msg_diff {
|
|
73
|
-
|
|
73
|
+
std::string reasoning_content_delta;
|
|
74
74
|
std::string content_delta;
|
|
75
75
|
size_t tool_call_index = std::string::npos;
|
|
76
76
|
common_chat_tool_call tool_call_delta;
|