@novastera-oss/llamarn 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +134 -36
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -2
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +30 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +50 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +134 -36
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
- package/cpp/llama.cpp/src/llama-batch.h +36 -11
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +313 -213
- package/cpp/llama.cpp/src/llama-context.h +16 -12
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
- package/cpp/llama.cpp/src/llama-graph.h +90 -34
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
- package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +64 -23
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +726 -141
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/llama.h +134 -36
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
|
@@ -73,6 +73,7 @@ enum llm_type {
|
|
|
73
73
|
LLM_TYPE_40B,
|
|
74
74
|
LLM_TYPE_65B,
|
|
75
75
|
LLM_TYPE_70B,
|
|
76
|
+
LLM_TYPE_142B,
|
|
76
77
|
LLM_TYPE_236B,
|
|
77
78
|
LLM_TYPE_290B,
|
|
78
79
|
LLM_TYPE_314B,
|
|
@@ -329,6 +330,9 @@ struct llama_model {
|
|
|
329
330
|
llama_hparams hparams = {};
|
|
330
331
|
llama_vocab vocab;
|
|
331
332
|
|
|
333
|
+
// for classifier models
|
|
334
|
+
std::vector<std::string> classifier_labels;
|
|
335
|
+
|
|
332
336
|
struct ggml_tensor * tok_embd = nullptr;
|
|
333
337
|
struct ggml_tensor * type_embd = nullptr;
|
|
334
338
|
struct ggml_tensor * pos_embd = nullptr;
|
|
@@ -585,7 +585,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
585
585
|
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
|
586
586
|
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
|
|
587
587
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
|
588
|
-
|
|
588
|
+
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
|
|
589
|
+
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
|
|
589
590
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
|
590
591
|
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
|
|
591
592
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
|
@@ -9,16 +9,16 @@
|
|
|
9
9
|
|
|
10
10
|
#include <algorithm>
|
|
11
11
|
#include <cassert>
|
|
12
|
+
#include <cctype>
|
|
12
13
|
#include <cfloat>
|
|
13
|
-
#include <climits>
|
|
14
14
|
#include <cstdarg>
|
|
15
15
|
#include <cstring>
|
|
16
16
|
#include <forward_list>
|
|
17
|
+
#include <limits>
|
|
17
18
|
#include <map>
|
|
18
19
|
#include <queue>
|
|
19
20
|
#include <set>
|
|
20
21
|
#include <unordered_map>
|
|
21
|
-
#include <cctype>
|
|
22
22
|
|
|
23
23
|
//
|
|
24
24
|
// helpers
|
|
@@ -1987,6 +1987,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1987
1987
|
|| t.first == "<|eom_id|>"
|
|
1988
1988
|
|| t.first == "<EOT>"
|
|
1989
1989
|
|| t.first == "_<EOT>"
|
|
1990
|
+
|| t.first == "<|end_of_text|>"
|
|
1990
1991
|
) {
|
|
1991
1992
|
special_eog_ids.insert(t.second);
|
|
1992
1993
|
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
|
@@ -2059,9 +2060,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2059
2060
|
//NOTE: Per token attributes are missing from the GGUF file.
|
|
2060
2061
|
//TODO: Extract attributes from GGUF file.
|
|
2061
2062
|
{
|
|
2062
|
-
auto _contains_any = [] (const std::string & str, const std::vector<std::
|
|
2063
|
+
auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
|
|
2063
2064
|
for (const auto & substr : substrs) {
|
|
2064
|
-
if (str.find(substr)
|
|
2065
|
+
if (str.find(substr) != std::string::npos) {
|
|
2065
2066
|
return true;
|
|
2066
2067
|
}
|
|
2067
2068
|
}
|
|
@@ -2098,7 +2099,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2098
2099
|
|| _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
|
|
2099
2100
|
|| _contains_any(general_arch, {"nomic-bert-moe"})
|
|
2100
2101
|
) {
|
|
2101
|
-
|
|
2102
|
+
if (token_to_id.count("<mask>") == 0) {
|
|
2103
|
+
LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
|
|
2104
|
+
} else {
|
|
2105
|
+
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
|
2106
|
+
}
|
|
2102
2107
|
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
|
2103
2108
|
for (auto id : cache_special_tokens) {
|
|
2104
2109
|
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
|
@@ -2568,6 +2573,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
|
|
|
2568
2573
|
// copy piece chars to output text buffer
|
|
2569
2574
|
// skip up to 'lstrip' leading spaces before copying
|
|
2570
2575
|
auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
|
|
2576
|
+
if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
|
|
2577
|
+
GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
|
|
2578
|
+
}
|
|
2579
|
+
|
|
2571
2580
|
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
|
|
2572
2581
|
token++;
|
|
2573
2582
|
size--;
|
|
@@ -2764,26 +2773,26 @@ void llama_vocab::impl::print_info() const {
|
|
|
2764
2773
|
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
|
|
2765
2774
|
|
|
2766
2775
|
// special tokens
|
|
2767
|
-
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token
|
|
2768
|
-
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token
|
|
2769
|
-
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token
|
|
2770
|
-
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token
|
|
2771
|
-
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token
|
|
2772
|
-
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token
|
|
2773
|
-
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token
|
|
2774
|
-
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token
|
|
2775
|
-
|
|
2776
|
-
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token
|
|
2777
|
-
|
|
2778
|
-
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token
|
|
2779
|
-
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token
|
|
2780
|
-
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token
|
|
2781
|
-
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token
|
|
2782
|
-
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token
|
|
2783
|
-
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token
|
|
2776
|
+
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
|
|
2777
|
+
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
|
|
2778
|
+
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
|
|
2779
|
+
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
|
|
2780
|
+
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
|
|
2781
|
+
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
|
|
2782
|
+
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
|
|
2783
|
+
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
|
|
2784
|
+
|
|
2785
|
+
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
|
|
2786
|
+
|
|
2787
|
+
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
|
|
2788
|
+
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
|
|
2789
|
+
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
|
|
2790
|
+
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
|
|
2791
|
+
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
|
|
2792
|
+
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
|
|
2784
2793
|
|
|
2785
2794
|
for (const auto & id : special_eog_ids) {
|
|
2786
|
-
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token
|
|
2795
|
+
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
|
|
2787
2796
|
}
|
|
2788
2797
|
|
|
2789
2798
|
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
|
|
@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
|
198
198
|
|
|
199
199
|
// if using single GPU mode, remove all except the main GPU
|
|
200
200
|
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
|
|
201
|
-
if (params.main_gpu < 0
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
201
|
+
if (params.main_gpu < 0) {
|
|
202
|
+
model->devices.clear();
|
|
203
|
+
} else {
|
|
204
|
+
if (params.main_gpu >= (int)model->devices.size()) {
|
|
205
|
+
LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
|
|
206
|
+
llama_model_free(model);
|
|
207
|
+
return nullptr;
|
|
208
|
+
}
|
|
209
|
+
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
|
|
210
|
+
model->devices.clear();
|
|
211
|
+
model->devices.push_back(main_gpu);
|
|
205
212
|
}
|
|
206
|
-
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
|
|
207
|
-
model->devices.clear();
|
|
208
|
-
model->devices.push_back(main_gpu);
|
|
209
213
|
}
|
|
210
214
|
|
|
211
215
|
for (auto * dev : model->devices) {
|
|
@@ -204,12 +204,17 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
|
|
|
204
204
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
|
205
205
|
# pragma clang diagnostic push
|
|
206
206
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
207
|
+
#elif defined(__GNUC__)
|
|
208
|
+
# pragma GCC diagnostic push
|
|
209
|
+
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
207
210
|
#endif
|
|
208
211
|
|
|
209
212
|
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
|
210
213
|
|
|
211
214
|
#if defined(__clang__)
|
|
212
215
|
# pragma clang diagnostic pop
|
|
216
|
+
#elif defined(__GNUC__)
|
|
217
|
+
# pragma GCC diagnostic pop
|
|
213
218
|
#endif
|
|
214
219
|
|
|
215
220
|
return conv.from_bytes(s);
|
package/cpp/rn-completion.cpp
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#include "rn-llama.
|
|
1
|
+
#include "rn-llama.h"
|
|
2
2
|
// Suppress unused function warnings from llama.cpp headers
|
|
3
3
|
#pragma GCC diagnostic push
|
|
4
4
|
#pragma GCC diagnostic ignored "-Wunused-function"
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
#include "llama.h"
|
|
8
8
|
#include "sampling.h"
|
|
9
9
|
#pragma GCC diagnostic pop
|
|
10
|
-
#include "rn-utils.
|
|
10
|
+
#include "rn-utils.h"
|
|
11
11
|
|
|
12
12
|
#include <string>
|
|
13
13
|
#include <vector>
|
package/ios/include/chat.h
CHANGED
|
@@ -70,7 +70,7 @@ struct common_chat_msg {
|
|
|
70
70
|
};
|
|
71
71
|
|
|
72
72
|
struct common_chat_msg_diff {
|
|
73
|
-
|
|
73
|
+
std::string reasoning_content_delta;
|
|
74
74
|
std::string content_delta;
|
|
75
75
|
size_t tool_call_index = std::string::npos;
|
|
76
76
|
common_chat_tool_call tool_call_delta;
|
package/ios/include/common.h
CHANGED
|
@@ -199,6 +199,9 @@ struct common_params_speculative {
|
|
|
199
199
|
float p_split = 0.1f; // speculative decoding split probability
|
|
200
200
|
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
|
201
201
|
|
|
202
|
+
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
|
203
|
+
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
|
204
|
+
|
|
202
205
|
struct cpu_params cpuparams;
|
|
203
206
|
struct cpu_params cpuparams_batch;
|
|
204
207
|
|
|
@@ -215,7 +218,8 @@ struct common_params_vocoder {
|
|
|
215
218
|
|
|
216
219
|
enum common_reasoning_format {
|
|
217
220
|
COMMON_REASONING_FORMAT_NONE,
|
|
218
|
-
|
|
221
|
+
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
|
222
|
+
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
|
219
223
|
};
|
|
220
224
|
|
|
221
225
|
struct common_params {
|
|
@@ -354,7 +358,6 @@ struct common_params {
|
|
|
354
358
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
355
359
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
356
360
|
std::string embd_sep = "\n"; // separator of embeddings
|
|
357
|
-
bool reranking = false; // enable reranking support on server
|
|
358
361
|
|
|
359
362
|
// server params
|
|
360
363
|
int32_t port = 8080; // server listens on this network port
|
package/ios/include/llama.h
CHANGED
|
@@ -61,7 +61,10 @@ extern "C" {
|
|
|
61
61
|
struct llama_model;
|
|
62
62
|
struct llama_context;
|
|
63
63
|
struct llama_sampler;
|
|
64
|
-
|
|
64
|
+
|
|
65
|
+
typedef struct llama_memory_i * llama_memory_t;
|
|
66
|
+
|
|
67
|
+
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
|
|
65
68
|
|
|
66
69
|
typedef int32_t llama_pos;
|
|
67
70
|
typedef int32_t llama_token;
|
|
@@ -240,18 +243,21 @@ extern "C" {
|
|
|
240
243
|
|
|
241
244
|
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
|
242
245
|
|
|
243
|
-
// Input data for llama_decode
|
|
246
|
+
// Input data for llama_encode/llama_decode
|
|
244
247
|
// A llama_batch object can contain input about one or many sequences
|
|
245
248
|
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
|
246
249
|
//
|
|
247
250
|
// - token : the token ids of the input (used when embd is NULL)
|
|
248
251
|
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
|
249
252
|
// - pos : the positions of the respective token in the sequence
|
|
250
|
-
// (if set to NULL, the token position will be tracked automatically by llama_decode)
|
|
253
|
+
// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
|
|
251
254
|
// - seq_id : the sequence to which the respective token belongs
|
|
252
255
|
// (if set to NULL, the sequence ID will be assumed to be 0)
|
|
253
256
|
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
|
254
|
-
// (if set to NULL
|
|
257
|
+
// (if set to NULL:
|
|
258
|
+
// - if embeddings: all tokens are output
|
|
259
|
+
// - if not: only the last token is output
|
|
260
|
+
// )
|
|
255
261
|
//
|
|
256
262
|
typedef struct llama_batch {
|
|
257
263
|
int32_t n_tokens;
|
|
@@ -259,8 +265,8 @@ extern "C" {
|
|
|
259
265
|
llama_token * token;
|
|
260
266
|
float * embd;
|
|
261
267
|
llama_pos * pos;
|
|
262
|
-
int32_t * n_seq_id;
|
|
263
|
-
llama_seq_id ** seq_id;
|
|
268
|
+
int32_t * n_seq_id;
|
|
269
|
+
llama_seq_id ** seq_id;
|
|
264
270
|
int8_t * logits; // TODO: rename this to "output"
|
|
265
271
|
} llama_batch;
|
|
266
272
|
|
|
@@ -493,9 +499,11 @@ extern "C" {
|
|
|
493
499
|
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
|
|
494
500
|
|
|
495
501
|
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
|
|
496
|
-
LLAMA_API
|
|
502
|
+
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
|
|
497
503
|
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
|
|
498
504
|
|
|
505
|
+
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
|
|
506
|
+
|
|
499
507
|
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
|
500
508
|
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
|
501
509
|
|
|
@@ -509,6 +517,13 @@ extern "C" {
|
|
|
509
517
|
// Get the model's RoPE frequency scaling factor
|
|
510
518
|
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
|
511
519
|
|
|
520
|
+
// Returns the number of classifier outputs (only valid for classifier models)
|
|
521
|
+
// Undefined behavior for non-classifier models
|
|
522
|
+
LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
|
|
523
|
+
|
|
524
|
+
// Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
|
|
525
|
+
LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
|
|
526
|
+
|
|
512
527
|
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
|
|
513
528
|
|
|
514
529
|
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
|
|
@@ -609,7 +624,81 @@ extern "C" {
|
|
|
609
624
|
int32_t il_end);
|
|
610
625
|
|
|
611
626
|
//
|
|
612
|
-
//
|
|
627
|
+
// Memory
|
|
628
|
+
//
|
|
629
|
+
|
|
630
|
+
// Clear the memory contents
|
|
631
|
+
// If data == true, the data buffers will also be cleared together with the metadata
|
|
632
|
+
LLAMA_API void llama_memory_clear(
|
|
633
|
+
llama_memory_t mem,
|
|
634
|
+
bool data);
|
|
635
|
+
|
|
636
|
+
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
637
|
+
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
|
638
|
+
// seq_id < 0 : match any sequence
|
|
639
|
+
// p0 < 0 : [0, p1]
|
|
640
|
+
// p1 < 0 : [p0, inf)
|
|
641
|
+
LLAMA_API bool llama_memory_seq_rm(
|
|
642
|
+
llama_memory_t mem,
|
|
643
|
+
llama_seq_id seq_id,
|
|
644
|
+
llama_pos p0,
|
|
645
|
+
llama_pos p1);
|
|
646
|
+
|
|
647
|
+
// Copy all tokens that belong to the specified sequence to another sequence
|
|
648
|
+
// p0 < 0 : [0, p1]
|
|
649
|
+
// p1 < 0 : [p0, inf)
|
|
650
|
+
LLAMA_API void llama_memory_seq_cp(
|
|
651
|
+
llama_memory_t mem,
|
|
652
|
+
llama_seq_id seq_id_src,
|
|
653
|
+
llama_seq_id seq_id_dst,
|
|
654
|
+
llama_pos p0,
|
|
655
|
+
llama_pos p1);
|
|
656
|
+
|
|
657
|
+
// Removes all tokens that do not belong to the specified sequence
|
|
658
|
+
LLAMA_API void llama_memory_seq_keep(
|
|
659
|
+
llama_memory_t mem,
|
|
660
|
+
llama_seq_id seq_id);
|
|
661
|
+
|
|
662
|
+
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
663
|
+
// p0 < 0 : [0, p1]
|
|
664
|
+
// p1 < 0 : [p0, inf)
|
|
665
|
+
LLAMA_API void llama_memory_seq_add(
|
|
666
|
+
llama_memory_t mem,
|
|
667
|
+
llama_seq_id seq_id,
|
|
668
|
+
llama_pos p0,
|
|
669
|
+
llama_pos p1,
|
|
670
|
+
llama_pos delta);
|
|
671
|
+
|
|
672
|
+
// Integer division of the positions by factor of `d > 1`
|
|
673
|
+
// p0 < 0 : [0, p1]
|
|
674
|
+
// p1 < 0 : [p0, inf)
|
|
675
|
+
LLAMA_API void llama_memory_seq_div(
|
|
676
|
+
llama_memory_t mem,
|
|
677
|
+
llama_seq_id seq_id,
|
|
678
|
+
llama_pos p0,
|
|
679
|
+
llama_pos p1,
|
|
680
|
+
int d);
|
|
681
|
+
|
|
682
|
+
// Returns the smallest position present in the memory for the specified sequence
|
|
683
|
+
// This is typically non-zero only for SWA caches
|
|
684
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
|
685
|
+
// Return -1 if the sequence is empty
|
|
686
|
+
LLAMA_API llama_pos llama_memory_seq_pos_min(
|
|
687
|
+
llama_memory_t mem,
|
|
688
|
+
llama_seq_id seq_id);
|
|
689
|
+
|
|
690
|
+
// Returns the largest position present in the memory for the specified sequence
|
|
691
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
|
692
|
+
// Return -1 if the sequence is empty
|
|
693
|
+
LLAMA_API llama_pos llama_memory_seq_pos_max(
|
|
694
|
+
llama_memory_t mem,
|
|
695
|
+
llama_seq_id seq_id);
|
|
696
|
+
|
|
697
|
+
// Check if the memory supports shifting
|
|
698
|
+
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
|
|
699
|
+
|
|
700
|
+
//
|
|
701
|
+
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
|
|
613
702
|
//
|
|
614
703
|
|
|
615
704
|
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
|
@@ -622,86 +711,95 @@ extern "C" {
|
|
|
622
711
|
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
|
623
712
|
|
|
624
713
|
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
|
625
|
-
LLAMA_API void llama_kv_self_clear(
|
|
626
|
-
|
|
714
|
+
DEPRECATED(LLAMA_API void llama_kv_self_clear(
|
|
715
|
+
struct llama_context * ctx),
|
|
716
|
+
"Use llama_memory_clear() instead");
|
|
627
717
|
|
|
628
718
|
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
629
719
|
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
|
630
720
|
// seq_id < 0 : match any sequence
|
|
631
721
|
// p0 < 0 : [0, p1]
|
|
632
722
|
// p1 < 0 : [p0, inf)
|
|
633
|
-
LLAMA_API bool llama_kv_self_seq_rm(
|
|
723
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
|
|
634
724
|
struct llama_context * ctx,
|
|
635
725
|
llama_seq_id seq_id,
|
|
636
726
|
llama_pos p0,
|
|
637
|
-
llama_pos p1)
|
|
727
|
+
llama_pos p1),
|
|
728
|
+
"Use llama_memory_seq_rm() instead");
|
|
638
729
|
|
|
639
730
|
// Copy all tokens that belong to the specified sequence to another sequence
|
|
640
731
|
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
|
641
732
|
// p0 < 0 : [0, p1]
|
|
642
733
|
// p1 < 0 : [p0, inf)
|
|
643
|
-
LLAMA_API void llama_kv_self_seq_cp(
|
|
734
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
|
|
644
735
|
struct llama_context * ctx,
|
|
645
736
|
llama_seq_id seq_id_src,
|
|
646
737
|
llama_seq_id seq_id_dst,
|
|
647
738
|
llama_pos p0,
|
|
648
|
-
llama_pos p1)
|
|
739
|
+
llama_pos p1),
|
|
740
|
+
"Use llama_memory_seq_cp() instead");
|
|
649
741
|
|
|
650
742
|
// Removes all tokens that do not belong to the specified sequence
|
|
651
|
-
LLAMA_API void llama_kv_self_seq_keep(
|
|
743
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
|
|
652
744
|
struct llama_context * ctx,
|
|
653
|
-
llama_seq_id seq_id)
|
|
745
|
+
llama_seq_id seq_id),
|
|
746
|
+
"Use llama_memory_seq_keep() instead");
|
|
654
747
|
|
|
655
748
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
656
749
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
657
750
|
// - lazily on next llama_decode()
|
|
658
751
|
// p0 < 0 : [0, p1]
|
|
659
752
|
// p1 < 0 : [p0, inf)
|
|
660
|
-
LLAMA_API void llama_kv_self_seq_add(
|
|
753
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
|
|
661
754
|
struct llama_context * ctx,
|
|
662
755
|
llama_seq_id seq_id,
|
|
663
756
|
llama_pos p0,
|
|
664
757
|
llama_pos p1,
|
|
665
|
-
llama_pos delta)
|
|
758
|
+
llama_pos delta),
|
|
759
|
+
"Use llama_memory_seq_add() instead");
|
|
666
760
|
|
|
667
761
|
// Integer division of the positions by factor of `d > 1`
|
|
668
762
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
669
763
|
// - lazily on next llama_decode()
|
|
670
764
|
// p0 < 0 : [0, p1]
|
|
671
765
|
// p1 < 0 : [p0, inf)
|
|
672
|
-
|
|
766
|
+
DEPRECATED(void llama_kv_self_seq_div(
|
|
673
767
|
struct llama_context * ctx,
|
|
674
768
|
llama_seq_id seq_id,
|
|
675
769
|
llama_pos p0,
|
|
676
770
|
llama_pos p1,
|
|
677
|
-
int d)
|
|
771
|
+
int d),
|
|
772
|
+
"Use llama_memory_seq_div() instead");
|
|
678
773
|
|
|
679
774
|
// Returns the smallest position present in the KV cache for the specified sequence
|
|
680
775
|
// This is typically non-zero only for SWA caches
|
|
681
776
|
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
682
777
|
// Return -1 if the sequence is empty
|
|
683
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
778
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
684
779
|
struct llama_context * ctx,
|
|
685
|
-
llama_seq_id seq_id)
|
|
780
|
+
llama_seq_id seq_id),
|
|
781
|
+
"Use llama_memory_seq_pos_min() instead");
|
|
686
782
|
|
|
687
783
|
// Returns the largest position present in the KV cache for the specified sequence
|
|
688
784
|
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
689
785
|
// Return -1 if the sequence is empty
|
|
690
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
786
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
691
787
|
struct llama_context * ctx,
|
|
692
|
-
llama_seq_id seq_id)
|
|
788
|
+
llama_seq_id seq_id),
|
|
789
|
+
"Use llama_memory_seq_pos_max() instead");
|
|
693
790
|
|
|
694
791
|
// Defragment the KV cache
|
|
695
792
|
// This will be applied:
|
|
696
793
|
// - lazily on next llama_decode()
|
|
697
|
-
LLAMA_API
|
|
794
|
+
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
|
|
698
795
|
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
|
699
796
|
|
|
700
797
|
// Check if the context supports KV cache shifting
|
|
701
|
-
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx)
|
|
798
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
|
|
799
|
+
"use llama_memory_can_shift() instead");
|
|
702
800
|
|
|
703
801
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
704
|
-
LLAMA_API
|
|
802
|
+
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
|
|
705
803
|
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
|
706
804
|
|
|
707
805
|
//
|
|
@@ -709,7 +807,7 @@ extern "C" {
|
|
|
709
807
|
//
|
|
710
808
|
|
|
711
809
|
// Returns the *actual* size in bytes of the state
|
|
712
|
-
// (logits, embedding and
|
|
810
|
+
// (logits, embedding and memory)
|
|
713
811
|
// Only use when saving the state, not when restoring it, otherwise the size may be too small.
|
|
714
812
|
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
|
|
715
813
|
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
|
|
@@ -765,12 +863,12 @@ extern "C" {
|
|
|
765
863
|
size_t n_token_count),
|
|
766
864
|
"use llama_state_save_file instead");
|
|
767
865
|
|
|
768
|
-
// Get the exact size needed to copy the
|
|
866
|
+
// Get the exact size needed to copy the state of a single sequence
|
|
769
867
|
LLAMA_API size_t llama_state_seq_get_size(
|
|
770
868
|
struct llama_context * ctx,
|
|
771
869
|
llama_seq_id seq_id);
|
|
772
870
|
|
|
773
|
-
// Copy the
|
|
871
|
+
// Copy the state of a single sequence into the specified buffer
|
|
774
872
|
LLAMA_API size_t llama_state_seq_get_data(
|
|
775
873
|
struct llama_context * ctx,
|
|
776
874
|
uint8_t * dst,
|
|
@@ -836,16 +934,16 @@ extern "C" {
|
|
|
836
934
|
// For encode-decoder contexts, processes the batch using the encoder.
|
|
837
935
|
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
|
|
838
936
|
// 0 - success
|
|
839
|
-
// < 0 - error. the
|
|
937
|
+
// < 0 - error. the memory state is restored to the state before this call
|
|
840
938
|
LLAMA_API int32_t llama_encode(
|
|
841
939
|
struct llama_context * ctx,
|
|
842
940
|
struct llama_batch batch);
|
|
843
941
|
|
|
844
942
|
// Process a batch of tokens.
|
|
845
|
-
// Requires
|
|
943
|
+
// Requires the context to have a memory.
|
|
846
944
|
// For encode-decoder contexts, processes the batch using the decoder.
|
|
847
945
|
// Positive return values does not mean a fatal error, but rather a warning.
|
|
848
|
-
// Upon non-zero return values, the
|
|
946
|
+
// Upon non-zero return values, the memory state is restored to the state before this call
|
|
849
947
|
// 0 - success
|
|
850
948
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
851
949
|
// 2 - aborted
|
|
@@ -866,8 +964,8 @@ extern "C" {
|
|
|
866
964
|
// Get the number of threads used for prompt and batch processing (multiple token).
|
|
867
965
|
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
|
868
966
|
|
|
869
|
-
// Set whether the
|
|
870
|
-
//
|
|
967
|
+
// Set whether the context outputs embeddings or not
|
|
968
|
+
// TODO: rename to avoid confusion with llama_get_embeddings()
|
|
871
969
|
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
|
872
970
|
|
|
873
971
|
// Set whether to use causal attention or not
|
|
@@ -916,7 +1014,7 @@ extern "C" {
|
|
|
916
1014
|
|
|
917
1015
|
// Get the embeddings for a sequence id
|
|
918
1016
|
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
|
919
|
-
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[
|
|
1017
|
+
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
|
|
920
1018
|
// otherwise: float[n_embd] (1-dimensional)
|
|
921
1019
|
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
|
922
1020
|
|