@novastera-oss/llamarn 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +134 -36
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -2
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +30 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +50 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +134 -36
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
- package/cpp/llama.cpp/src/llama-batch.h +36 -11
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +313 -213
- package/cpp/llama.cpp/src/llama-context.h +16 -12
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
- package/cpp/llama.cpp/src/llama-graph.h +90 -34
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
- package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +64 -23
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +726 -141
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/llama.h +134 -36
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
|
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
988
988
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
989
989
|
}
|
|
990
990
|
|
|
991
|
-
if (params.reranking && params.embedding) {
|
|
992
|
-
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
|
993
|
-
}
|
|
994
|
-
|
|
995
991
|
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
|
996
992
|
throw std::runtime_error(string_format(
|
|
997
993
|
"error: the supplied chat template is not supported: %s%s\n",
|
|
@@ -2747,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2747
2743
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
|
2748
2744
|
add_opt(common_arg(
|
|
2749
2745
|
{"--reranking", "--rerank"},
|
|
2750
|
-
string_format("enable reranking endpoint on server (default: %s)",
|
|
2746
|
+
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
|
|
2751
2747
|
[](common_params & params) {
|
|
2752
|
-
params.
|
|
2748
|
+
params.embedding = true;
|
|
2749
|
+
params.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
2753
2750
|
}
|
|
2754
2751
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
|
2755
2752
|
add_opt(common_arg(
|
|
@@ -2869,6 +2866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2869
2866
|
"(default: deepseek)",
|
|
2870
2867
|
[](common_params & params, const std::string & value) {
|
|
2871
2868
|
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
|
2869
|
+
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
|
|
2872
2870
|
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
|
2873
2871
|
else { throw std::invalid_argument("invalid value"); }
|
|
2874
2872
|
}
|
|
@@ -3212,6 +3210,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3212
3210
|
params.speculative.model.path = value;
|
|
3213
3211
|
}
|
|
3214
3212
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
|
3213
|
+
add_opt(common_arg(
|
|
3214
|
+
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
|
|
3215
|
+
string_format(
|
|
3216
|
+
"KV cache data type for K for the draft model\n"
|
|
3217
|
+
"allowed values: %s\n"
|
|
3218
|
+
"(default: %s)",
|
|
3219
|
+
get_all_kv_cache_types().c_str(),
|
|
3220
|
+
ggml_type_name(params.speculative.cache_type_k)
|
|
3221
|
+
),
|
|
3222
|
+
[](common_params & params, const std::string & value) {
|
|
3223
|
+
params.speculative.cache_type_k = kv_cache_type_from_str(value);
|
|
3224
|
+
}
|
|
3225
|
+
).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
|
|
3226
|
+
add_opt(common_arg(
|
|
3227
|
+
{"-ctvd", "--cache-type-v-draft"}, "TYPE",
|
|
3228
|
+
string_format(
|
|
3229
|
+
"KV cache data type for V for the draft model\n"
|
|
3230
|
+
"allowed values: %s\n"
|
|
3231
|
+
"(default: %s)",
|
|
3232
|
+
get_all_kv_cache_types().c_str(),
|
|
3233
|
+
ggml_type_name(params.speculative.cache_type_v)
|
|
3234
|
+
),
|
|
3235
|
+
[](common_params & params, const std::string & value) {
|
|
3236
|
+
params.speculative.cache_type_v = kv_cache_type_from_str(value);
|
|
3237
|
+
}
|
|
3238
|
+
).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
|
|
3215
3239
|
|
|
3216
3240
|
add_opt(common_arg(
|
|
3217
3241
|
{"-mv", "--model-vocoder"}, "FNAME",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
int LLAMA_BUILD_NUMBER = @
|
|
2
|
-
char const *LLAMA_COMMIT = "@
|
|
1
|
+
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
|
|
2
|
+
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
|
3
3
|
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
|
4
4
|
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
|
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
|
|
|
49
49
|
|
|
50
50
|
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
|
51
51
|
result_.tool_calls.emplace_back(tool_call);
|
|
52
|
+
|
|
52
53
|
return true;
|
|
53
54
|
}
|
|
54
55
|
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
|
@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
|
|
|
378
379
|
/* .is_partial = */ found_healing_marker,
|
|
379
380
|
};
|
|
380
381
|
}
|
|
382
|
+
|
|
383
|
+
void common_chat_msg_parser::clear_tools() {
|
|
384
|
+
result_.tool_calls.clear();
|
|
385
|
+
}
|
|
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
|
|
|
82
82
|
|
|
83
83
|
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
|
|
84
84
|
std::vector<common_chat_msg_diff> diffs;
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
85
|
+
if (previous_msg.reasoning_content != new_msg.reasoning_content) {
|
|
86
|
+
auto & diff = diffs.emplace_back();
|
|
87
|
+
diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
|
|
88
|
+
}
|
|
89
89
|
if (previous_msg.content != new_msg.content) {
|
|
90
90
|
auto & diff = diffs.emplace_back();
|
|
91
91
|
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
|
|
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
|
|
|
385
385
|
|
|
386
386
|
template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
|
387
387
|
json delta = json::object();
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
388
|
+
if (!diff.reasoning_content_delta.empty()) {
|
|
389
|
+
delta["reasoning_content"] = diff.reasoning_content_delta;
|
|
390
|
+
}
|
|
391
391
|
if (!diff.content_delta.empty()) {
|
|
392
392
|
delta["content"] = diff.content_delta;
|
|
393
393
|
}
|
|
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
|
|
|
598
598
|
switch (format) {
|
|
599
599
|
case COMMON_REASONING_FORMAT_NONE: return "none";
|
|
600
600
|
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
|
|
601
|
+
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
|
|
601
602
|
default:
|
|
602
603
|
throw std::runtime_error("Unknown reasoning format");
|
|
603
604
|
}
|
|
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
|
|
|
1837
1838
|
if (res < 0) {
|
|
1838
1839
|
// if the custom "tmpl" is not supported, we throw an error
|
|
1839
1840
|
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
|
1840
|
-
throw std::runtime_error("this custom template is not supported");
|
|
1841
|
+
throw std::runtime_error("this custom template is not supported, try using --jinja");
|
|
1841
1842
|
}
|
|
1842
1843
|
|
|
1843
1844
|
// if it turns out that our buffer is too small, we resize it
|
|
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
|
|
1920
1921
|
} catch (const common_chat_msg_partial_exception & ex) {
|
|
1921
1922
|
LOG_DBG("Partial parse: %s\n", ex.what());
|
|
1922
1923
|
if (!is_partial) {
|
|
1923
|
-
|
|
1924
|
+
builder.clear_tools();
|
|
1925
|
+
builder.move_to(0);
|
|
1926
|
+
common_chat_parse_content_only(builder);
|
|
1924
1927
|
}
|
|
1925
1928
|
}
|
|
1926
1929
|
auto msg = builder.result();
|
|
@@ -70,7 +70,7 @@ struct common_chat_msg {
|
|
|
70
70
|
};
|
|
71
71
|
|
|
72
72
|
struct common_chat_msg_diff {
|
|
73
|
-
|
|
73
|
+
std::string reasoning_content_delta;
|
|
74
74
|
std::string content_delta;
|
|
75
75
|
size_t tool_call_index = std::string::npos;
|
|
76
76
|
common_chat_tool_call tool_call_delta;
|
|
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
|
|
|
466
466
|
|
|
467
467
|
std::string regex_escape(const std::string & s) {
|
|
468
468
|
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
|
469
|
-
return std::regex_replace(s, special_chars, "
|
|
469
|
+
return std::regex_replace(s, special_chars, "\\$&");
|
|
470
470
|
}
|
|
471
471
|
|
|
472
472
|
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
|
|
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
|
|
|
706
706
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
|
707
707
|
# pragma clang diagnostic push
|
|
708
708
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
709
|
+
#elif defined(__GNUC__)
|
|
710
|
+
# pragma GCC diagnostic push
|
|
711
|
+
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
709
712
|
#endif
|
|
713
|
+
|
|
710
714
|
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
|
711
715
|
|
|
712
716
|
#if defined(__clang__)
|
|
713
717
|
# pragma clang diagnostic pop
|
|
718
|
+
#elif defined(__GNUC__)
|
|
719
|
+
# pragma GCC diagnostic pop
|
|
714
720
|
#endif
|
|
715
721
|
|
|
716
722
|
filename_utf32 = converter.from_bytes(filename);
|
|
@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
|
|
|
767
773
|
return true;
|
|
768
774
|
}
|
|
769
775
|
|
|
776
|
+
#include <iostream>
|
|
777
|
+
|
|
778
|
+
|
|
770
779
|
// returns true if successful, false otherwise
|
|
771
780
|
bool fs_create_directory_with_parents(const std::string & path) {
|
|
772
781
|
#ifdef _WIN32
|
|
@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|
|
784
793
|
// process path from front to back, procedurally creating directories
|
|
785
794
|
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
|
786
795
|
const std::wstring subpath = wpath.substr(0, pos_slash);
|
|
787
|
-
const wchar_t * test = subpath.c_str();
|
|
788
796
|
|
|
789
|
-
|
|
797
|
+
pos_slash += 1;
|
|
798
|
+
|
|
799
|
+
// skip the drive letter, in some systems it can return an access denied error
|
|
800
|
+
if (subpath.length() == 2 && subpath[1] == ':') {
|
|
801
|
+
continue;
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
const bool success = CreateDirectoryW(subpath.c_str(), NULL);
|
|
805
|
+
|
|
790
806
|
if (!success) {
|
|
791
807
|
const DWORD error = GetLastError();
|
|
792
808
|
|
|
@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|
|
800
816
|
return false;
|
|
801
817
|
}
|
|
802
818
|
}
|
|
803
|
-
|
|
804
|
-
pos_slash += 1;
|
|
805
819
|
}
|
|
806
820
|
|
|
807
821
|
return true;
|
|
@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
897
911
|
|
|
898
912
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
899
913
|
|
|
900
|
-
if (params.reranking) {
|
|
901
|
-
bool ok = true;
|
|
902
|
-
|
|
903
|
-
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
|
904
|
-
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
|
905
|
-
ok = false;
|
|
906
|
-
}
|
|
907
|
-
|
|
908
|
-
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
909
|
-
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
|
910
|
-
|
|
911
|
-
if (!has_eos && !has_sep) {
|
|
912
|
-
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
|
913
|
-
ok = false;
|
|
914
|
-
} else if (!has_eos) {
|
|
915
|
-
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
|
916
|
-
} else if (!has_sep) {
|
|
917
|
-
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
|
918
|
-
ok = false;
|
|
919
|
-
}
|
|
920
|
-
|
|
921
|
-
if (!ok) {
|
|
922
|
-
llama_model_free(model);
|
|
923
|
-
|
|
924
|
-
return iparams;
|
|
925
|
-
}
|
|
926
|
-
}
|
|
927
|
-
|
|
928
914
|
auto cparams = common_context_params_to_llama(params);
|
|
929
915
|
|
|
930
916
|
llama_context * lctx = llama_init_from_model(model, cparams);
|
|
@@ -934,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
934
920
|
return iparams;
|
|
935
921
|
}
|
|
936
922
|
|
|
937
|
-
if (params.ctx_shift && !
|
|
923
|
+
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
|
|
938
924
|
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
|
|
939
925
|
params.ctx_shift = false;
|
|
940
926
|
}
|
|
@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
966
952
|
}
|
|
967
953
|
}
|
|
968
954
|
|
|
955
|
+
if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
|
|
956
|
+
bool ok = true;
|
|
957
|
+
|
|
958
|
+
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
|
959
|
+
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
|
960
|
+
ok = false;
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
964
|
+
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
|
965
|
+
|
|
966
|
+
if (!has_eos && !has_sep) {
|
|
967
|
+
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
|
968
|
+
ok = false;
|
|
969
|
+
} else if (!has_eos) {
|
|
970
|
+
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
|
971
|
+
} else if (!has_sep) {
|
|
972
|
+
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
|
973
|
+
ok = false;
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
if (!ok) {
|
|
977
|
+
llama_free(lctx);
|
|
978
|
+
llama_model_free(model);
|
|
979
|
+
|
|
980
|
+
return iparams;
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
|
|
969
984
|
// load and optionally apply lora adapters
|
|
970
985
|
for (auto & la : params.lora_adapters) {
|
|
971
986
|
llama_adapter_lora_ptr lora;
|
|
@@ -1041,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
1041
1056
|
if (llama_model_has_decoder(model)) {
|
|
1042
1057
|
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
|
1043
1058
|
}
|
|
1044
|
-
|
|
1059
|
+
llama_memory_clear(llama_get_memory(lctx), true);
|
|
1045
1060
|
llama_synchronize(lctx);
|
|
1046
1061
|
llama_perf_context_reset(lctx);
|
|
1047
1062
|
llama_set_warmup(lctx, false);
|
|
@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
|
|
1143
1158
|
cparams.op_offload = !params.no_op_offload;
|
|
1144
1159
|
cparams.swa_full = params.swa_full;
|
|
1145
1160
|
|
|
1146
|
-
if (params.reranking) {
|
|
1147
|
-
cparams.embeddings = true;
|
|
1148
|
-
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
1149
|
-
}
|
|
1150
|
-
|
|
1151
1161
|
cparams.type_k = params.cache_type_k;
|
|
1152
1162
|
cparams.type_v = params.cache_type_v;
|
|
1153
1163
|
|
|
@@ -199,6 +199,9 @@ struct common_params_speculative {
|
|
|
199
199
|
float p_split = 0.1f; // speculative decoding split probability
|
|
200
200
|
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
|
201
201
|
|
|
202
|
+
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
|
203
|
+
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
|
204
|
+
|
|
202
205
|
struct cpu_params cpuparams;
|
|
203
206
|
struct cpu_params cpuparams_batch;
|
|
204
207
|
|
|
@@ -215,7 +218,8 @@ struct common_params_vocoder {
|
|
|
215
218
|
|
|
216
219
|
enum common_reasoning_format {
|
|
217
220
|
COMMON_REASONING_FORMAT_NONE,
|
|
218
|
-
|
|
221
|
+
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
|
222
|
+
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
|
219
223
|
};
|
|
220
224
|
|
|
221
225
|
struct common_params {
|
|
@@ -354,7 +358,6 @@ struct common_params {
|
|
|
354
358
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
355
359
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
356
360
|
std::string embd_sep = "\n"; // separator of embeddings
|
|
357
|
-
bool reranking = false; // enable reranking support on server
|
|
358
361
|
|
|
359
362
|
// server params
|
|
360
363
|
int32_t port = 8080; // server listens on this network port
|
|
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
|
|
|
144
144
|
auto & smpl = spec->smpl;
|
|
145
145
|
auto & prompt = spec->prompt;
|
|
146
146
|
|
|
147
|
+
auto * mem = llama_get_memory(ctx);
|
|
148
|
+
|
|
147
149
|
int reuse_i = 0;
|
|
148
150
|
int reuse_n = 0;
|
|
149
151
|
|
|
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
|
|
|
173
175
|
result.reserve(params.n_draft);
|
|
174
176
|
|
|
175
177
|
if (reuse_n == 0) {
|
|
176
|
-
|
|
178
|
+
llama_memory_clear(mem, false);
|
|
177
179
|
|
|
178
180
|
prompt.clear();
|
|
179
181
|
} else {
|
|
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
|
|
|
192
194
|
}
|
|
193
195
|
|
|
194
196
|
if (reuse_i > 0) {
|
|
195
|
-
|
|
196
|
-
|
|
197
|
+
llama_memory_seq_rm (mem, 0, 0, reuse_i);
|
|
198
|
+
llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
|
|
197
199
|
|
|
198
200
|
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
|
|
199
201
|
}
|
|
200
202
|
|
|
201
203
|
if (reuse_n < (int) prompt.size()) {
|
|
202
|
-
|
|
204
|
+
llama_memory_seq_rm (mem, 0, reuse_n, -1);
|
|
203
205
|
|
|
204
206
|
prompt.erase(prompt.begin() + reuse_n, prompt.end());
|
|
205
207
|
}
|