@novastera-oss/llamarn 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +140 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +48 -67
- package/cpp/LlamaCppModel.h +8 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +33 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
- package/cpp/llama.cpp/common/arg.cpp +38 -12
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
- package/cpp/llama.cpp/common/chat-parser.h +4 -1
- package/cpp/llama.cpp/common/chat.cpp +16 -13
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +52 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +140 -38
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
- package/cpp/llama.cpp/src/llama-batch.h +47 -17
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +488 -313
- package/cpp/llama.cpp/src/llama-context.h +38 -17
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
- package/cpp/llama.cpp/src/llama-graph.h +109 -52
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
- package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +89 -4
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +735 -143
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +65 -10
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +140 -38
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
|
|
|
203
203
|
|
|
204
204
|
DWORD p = NORMAL_PRIORITY_CLASS;
|
|
205
205
|
switch (prio) {
|
|
206
|
+
case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
|
|
206
207
|
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
|
207
208
|
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
|
208
209
|
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
|
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
|
|
|
228
229
|
|
|
229
230
|
int p = 0;
|
|
230
231
|
switch (prio) {
|
|
232
|
+
case GGML_SCHED_PRIO_LOW: p = 5; break;
|
|
231
233
|
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
|
232
234
|
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
|
233
235
|
case GGML_SCHED_PRIO_HIGH: p = -10; break;
|
|
@@ -464,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
|
|
|
464
466
|
|
|
465
467
|
std::string regex_escape(const std::string & s) {
|
|
466
468
|
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
|
467
|
-
return std::regex_replace(s, special_chars, "\\$0");
|
|
469
|
+
return std::regex_replace(s, special_chars, "\\$&");
|
|
468
470
|
}
|
|
469
471
|
|
|
470
472
|
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
|
|
@@ -704,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
|
|
|
704
706
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
|
705
707
|
# pragma clang diagnostic push
|
|
706
708
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
709
|
+
#elif defined(__GNUC__)
|
|
710
|
+
# pragma GCC diagnostic push
|
|
711
|
+
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
707
712
|
#endif
|
|
713
|
+
|
|
708
714
|
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
|
709
715
|
|
|
710
716
|
#if defined(__clang__)
|
|
711
717
|
# pragma clang diagnostic pop
|
|
718
|
+
#elif defined(__GNUC__)
|
|
719
|
+
# pragma GCC diagnostic pop
|
|
712
720
|
#endif
|
|
713
721
|
|
|
714
722
|
filename_utf32 = converter.from_bytes(filename);
|
|
@@ -765,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
|
|
|
765
773
|
return true;
|
|
766
774
|
}
|
|
767
775
|
|
|
776
|
+
#include <iostream>
|
|
777
|
+
|
|
778
|
+
|
|
768
779
|
// returns true if successful, false otherwise
|
|
769
780
|
bool fs_create_directory_with_parents(const std::string & path) {
|
|
770
781
|
#ifdef _WIN32
|
|
@@ -782,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|
|
782
793
|
// process path from front to back, procedurally creating directories
|
|
783
794
|
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
|
784
795
|
const std::wstring subpath = wpath.substr(0, pos_slash);
|
|
785
|
-
const wchar_t * test = subpath.c_str();
|
|
786
796
|
|
|
787
|
-
|
|
797
|
+
pos_slash += 1;
|
|
798
|
+
|
|
799
|
+
// skip the drive letter, in some systems it can return an access denied error
|
|
800
|
+
if (subpath.length() == 2 && subpath[1] == ':') {
|
|
801
|
+
continue;
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
const bool success = CreateDirectoryW(subpath.c_str(), NULL);
|
|
805
|
+
|
|
788
806
|
if (!success) {
|
|
789
807
|
const DWORD error = GetLastError();
|
|
790
808
|
|
|
@@ -798,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|
|
798
816
|
return false;
|
|
799
817
|
}
|
|
800
818
|
}
|
|
801
|
-
|
|
802
|
-
pos_slash += 1;
|
|
803
819
|
}
|
|
804
820
|
|
|
805
821
|
return true;
|
|
@@ -895,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
895
911
|
|
|
896
912
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
897
913
|
|
|
898
|
-
if (params.reranking) {
|
|
899
|
-
bool ok = true;
|
|
900
|
-
|
|
901
|
-
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
|
902
|
-
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
|
903
|
-
ok = false;
|
|
904
|
-
}
|
|
905
|
-
|
|
906
|
-
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
907
|
-
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
|
908
|
-
|
|
909
|
-
if (!has_eos && !has_sep) {
|
|
910
|
-
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
|
911
|
-
ok = false;
|
|
912
|
-
} else if (!has_eos) {
|
|
913
|
-
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
|
914
|
-
} else if (!has_sep) {
|
|
915
|
-
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
|
916
|
-
ok = false;
|
|
917
|
-
}
|
|
918
|
-
|
|
919
|
-
if (!ok) {
|
|
920
|
-
llama_model_free(model);
|
|
921
|
-
|
|
922
|
-
return iparams;
|
|
923
|
-
}
|
|
924
|
-
}
|
|
925
|
-
|
|
926
914
|
auto cparams = common_context_params_to_llama(params);
|
|
927
915
|
|
|
928
916
|
llama_context * lctx = llama_init_from_model(model, cparams);
|
|
@@ -932,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
932
920
|
return iparams;
|
|
933
921
|
}
|
|
934
922
|
|
|
935
|
-
if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
|
|
923
|
+
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
|
|
936
924
|
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
|
|
937
925
|
params.ctx_shift = false;
|
|
938
926
|
}
|
|
@@ -964,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
964
952
|
}
|
|
965
953
|
}
|
|
966
954
|
|
|
955
|
+
if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
|
|
956
|
+
bool ok = true;
|
|
957
|
+
|
|
958
|
+
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
|
959
|
+
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
|
960
|
+
ok = false;
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
|
964
|
+
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
|
965
|
+
|
|
966
|
+
if (!has_eos && !has_sep) {
|
|
967
|
+
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
|
968
|
+
ok = false;
|
|
969
|
+
} else if (!has_eos) {
|
|
970
|
+
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
|
971
|
+
} else if (!has_sep) {
|
|
972
|
+
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
|
973
|
+
ok = false;
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
if (!ok) {
|
|
977
|
+
llama_free(lctx);
|
|
978
|
+
llama_model_free(model);
|
|
979
|
+
|
|
980
|
+
return iparams;
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
|
|
967
984
|
// load and optionally apply lora adapters
|
|
968
985
|
for (auto & la : params.lora_adapters) {
|
|
969
986
|
llama_adapter_lora_ptr lora;
|
|
@@ -1039,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
1039
1056
|
if (llama_model_has_decoder(model)) {
|
|
1040
1057
|
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
|
1041
1058
|
}
|
|
1042
|
-
|
|
1059
|
+
llama_memory_clear(llama_get_memory(lctx), true);
|
|
1043
1060
|
llama_synchronize(lctx);
|
|
1044
1061
|
llama_perf_context_reset(lctx);
|
|
1045
1062
|
llama_set_warmup(lctx, false);
|
|
@@ -1141,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
|
|
1141
1158
|
cparams.op_offload = !params.no_op_offload;
|
|
1142
1159
|
cparams.swa_full = params.swa_full;
|
|
1143
1160
|
|
|
1144
|
-
if (params.reranking) {
|
|
1145
|
-
cparams.embeddings = true;
|
|
1146
|
-
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
|
1147
|
-
}
|
|
1148
|
-
|
|
1149
1161
|
cparams.type_k = params.cache_type_k;
|
|
1150
1162
|
cparams.type_v = params.cache_type_v;
|
|
1151
1163
|
|
|
@@ -199,6 +199,9 @@ struct common_params_speculative {
|
|
|
199
199
|
float p_split = 0.1f; // speculative decoding split probability
|
|
200
200
|
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
|
201
201
|
|
|
202
|
+
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
|
203
|
+
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
|
204
|
+
|
|
202
205
|
struct cpu_params cpuparams;
|
|
203
206
|
struct cpu_params cpuparams_batch;
|
|
204
207
|
|
|
@@ -215,7 +218,8 @@ struct common_params_vocoder {
|
|
|
215
218
|
|
|
216
219
|
enum common_reasoning_format {
|
|
217
220
|
COMMON_REASONING_FORMAT_NONE,
|
|
218
|
-
|
|
221
|
+
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
|
222
|
+
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
|
219
223
|
};
|
|
220
224
|
|
|
221
225
|
struct common_params {
|
|
@@ -354,7 +358,6 @@ struct common_params {
|
|
|
354
358
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
355
359
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
356
360
|
std::string embd_sep = "\n"; // separator of embeddings
|
|
357
|
-
bool reranking = false; // enable reranking support on server
|
|
358
361
|
|
|
359
362
|
// server params
|
|
360
363
|
int32_t port = 8080; // server listens on this network port
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
|
-
#include
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
#include
|
|
3
|
+
#include <nlohmann/json_fwd.hpp>
|
|
4
|
+
|
|
5
|
+
#include <functional>
|
|
6
|
+
#include <string>
|
|
7
7
|
|
|
8
8
|
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
|
|
9
9
|
bool force_gbnf = false);
|
|
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
|
|
|
144
144
|
auto & smpl = spec->smpl;
|
|
145
145
|
auto & prompt = spec->prompt;
|
|
146
146
|
|
|
147
|
+
auto * mem = llama_get_memory(ctx);
|
|
148
|
+
|
|
147
149
|
int reuse_i = 0;
|
|
148
150
|
int reuse_n = 0;
|
|
149
151
|
|
|
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
|
|
|
173
175
|
result.reserve(params.n_draft);
|
|
174
176
|
|
|
175
177
|
if (reuse_n == 0) {
|
|
176
|
-
|
|
178
|
+
llama_memory_clear(mem, false);
|
|
177
179
|
|
|
178
180
|
prompt.clear();
|
|
179
181
|
} else {
|
|
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
|
|
|
192
194
|
}
|
|
193
195
|
|
|
194
196
|
if (reuse_i > 0) {
|
|
195
|
-
|
|
196
|
-
|
|
197
|
+
llama_memory_seq_rm (mem, 0, 0, reuse_i);
|
|
198
|
+
llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
|
|
197
199
|
|
|
198
200
|
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
|
|
199
201
|
}
|
|
200
202
|
|
|
201
203
|
if (reuse_n < (int) prompt.size()) {
|
|
202
|
-
|
|
204
|
+
llama_memory_seq_rm (mem, 0, reuse_n, -1);
|
|
203
205
|
|
|
204
206
|
prompt.erase(prompt.begin() + reuse_n, prompt.end());
|
|
205
207
|
}
|