@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
package/cpp/llama.cpp/src/llama-memory-recurrent.h:

```diff
@@ -11,8 +11,8 @@
 // llama_memory_recurrent
 //
 
-// TODO: extract the cache state used for graph computation into llama_memory_recurrent_state_i
-//       see the implementation of llama_kv_cache_unified_state_i for an example how to do it
+// TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
+//       see the implementation of llama_kv_cache_unified_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
 
```
```diff
@@ -34,14 +34,14 @@ public:
     // llama_memory_i
     //
 
-    llama_memory_state_ptr init_batch(
-            const llama_batch & batch,
+    llama_memory_context_ptr init_batch(
+            llama_batch_allocr & balloc,
             uint32_t n_ubatch,
             bool embd_all) override;
 
-    llama_memory_state_ptr init_full() override;
+    llama_memory_context_ptr init_full() override;
 
-    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
+    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
 
     void clear(bool data) override;
 
```
```diff
@@ -125,37 +125,34 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };
 
-class llama_memory_recurrent_state : public llama_memory_state_i {
+class llama_memory_recurrent_context : public llama_memory_context_i {
 public:
     // used for errors
-    llama_memory_recurrent_state(llama_memory_status status);
+    llama_memory_recurrent_context(llama_memory_status status);
 
-    // used to create a full-cache state
-    llama_memory_recurrent_state(
+    // used to create a full-cache or update context
+    llama_memory_recurrent_context(
             llama_memory_recurrent * mem);
 
-    // used to create a state from a batch
-    llama_memory_recurrent_state(
+    // used to create a batch processing context from a batch
+    llama_memory_recurrent_context(
             llama_memory_recurrent * mem,
-            llama_sbatch sbatch,
             std::vector<llama_ubatch> ubatches);
 
-    virtual ~llama_memory_recurrent_state();
+    virtual ~llama_memory_recurrent_context();
 
     //
-    // llama_memory_state_i
+    // llama_memory_context_i
     //
 
     bool next() override;
     bool apply() override;
 
-    std::vector<int64_t> & out_ids() override;
-
     llama_memory_status get_status() const override;
     const llama_ubatch & get_ubatch() const override;
 
     //
-    // llama_memory_recurrent_state specific API
+    // llama_memory_recurrent_context specific API
     //
 
     uint32_t get_n_rs() const;
```
```diff
@@ -173,8 +170,6 @@ private:
 
     llama_memory_recurrent * mem;
 
-    llama_sbatch sbatch;
-
     size_t i_next = 0;
 
     std::vector<llama_ubatch> ubatches;
```
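The `llama_sbatch` member and constructor parameter are gone from the recurrent context: batch splitting now happens up front in `llama_batch_allocr`, and the context only carries the resulting ubatches. Below is a minimal sketch of the three constructor roles declared above — illustration only, not code from the package; the function and variable names are made up here, and the status value is just one of the enum values shown later in this diff:

```cpp
// Sketch only: the three ways a llama_memory_recurrent_context is created,
// per the constructors declared in the header above.
#include "llama-memory-recurrent.h"

#include <utility>
#include <vector>

void make_contexts(llama_memory_recurrent * mem, std::vector<llama_ubatch> ubatches) {
    // 1. error reporting: carries only a failure status, no memory attached
    llama_memory_recurrent_context err_ctx(LLAMA_MEMORY_STATUS_FAILED_COMPUTE);

    // 2. full-cache / update context, e.g. for sizing worst-case compute buffers
    llama_memory_recurrent_context full_ctx(mem);

    // 3. batch processing: takes ownership of the pre-split ubatches
    //    (the llama_sbatch parameter was removed in this release)
    llama_memory_recurrent_context batch_ctx(mem, std::move(ubatches));
}
```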
package/cpp/llama.cpp/src/llama-memory.h:

```diff
@@ -3,10 +3,11 @@
 #include "llama.h"
 
 #include <memory>
-#include <vector>
 
 struct llama_ubatch;
 
+class llama_batch_allocr;
+
 class llama_io_write_i;
 class llama_io_read_i;
 
```
```diff
@@ -26,23 +27,21 @@ enum llama_memory_status {
     LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
 };
 
-// helper function for combining the status of two memory states
+// helper function for combining the status of two memory contexts
 // useful for implementing hybrid memory types (e.g. iSWA)
 llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
 
-// the interface for managing the memory state during batch processing
+// the interface for managing the memory context during batch processing
 // this interface is implemented per memory type. see:
-//   - llama_kv_cache_unified_state
-//   - llama_kv_cache_unified_iswa_state
+//   - llama_kv_cache_unified_context
+//   - llama_kv_cache_unified_iswa_context
 //   ...
 //
-// the only method that can mutate the memory and the memory state is llama_memory_i::apply()
-//
-// TODO: rename to llama_memory_context_i ?
-struct llama_memory_state_i {
-    virtual ~llama_memory_state_i() = default;
+// the only method that should mutate the memory and the memory context is llama_memory_i::apply()
+struct llama_memory_context_i {
+    virtual ~llama_memory_context_i() = default;
 
-    // consume the current ubatch from the state and proceed to the next one
+    // consume the current ubatch from the context and proceed to the next one
     // return false if we are done
     virtual bool next() = 0;
 
```
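Since `llama_memory_status_combine` is the designated way for hybrid memory types to fold two sub-statuses into one, here is a hedged sketch of its intended use — the function and parameter names are assumptions for illustration, not package code:

```cpp
// Sketch only: how a hybrid memory type might combine the statuses of its two
// sub-contexts using the helper declared above.
#include "llama-memory.h"

llama_memory_status hybrid_status(const llama_memory_context_i & attn_ctx,
                                  const llama_memory_context_i & recr_ctx) {
    // a failure in either sub-context dominates the combined result
    return llama_memory_status_combine(attn_ctx.get_status(), recr_ctx.get_status());
}
```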
```diff
@@ -50,17 +49,14 @@ struct llama_memory_state_i {
     // return false on failure
     virtual bool apply() = 0;
 
-    // TODO: this might get reworked in the future when refactoring llama_batch
-    virtual std::vector<int64_t> & out_ids() = 0;
-
     // get the current ubatch
     virtual const llama_ubatch & get_ubatch() const = 0;
 
-    // get the status of the memory state - used for error handling and checking if any updates would be applied
+    // get the status of the memory context - used for error handling and checking if any updates would be applied
     virtual llama_memory_status get_status() const = 0;
 };
 
-using llama_memory_state_ptr = std::unique_ptr<llama_memory_state_i>;
+using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
 
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
```
```diff
@@ -68,19 +64,19 @@ struct llama_memory_i {
     virtual ~llama_memory_i() = default;
 
     // split the input batch into a set of ubatches and verify that they can fit into the cache
-    // return a state object containing the ubatches and memory state required to process them
-    // check the llama_memory_state_i::get_status() for the result
-    virtual llama_memory_state_ptr init_batch(
-            const llama_batch & batch,
+    // return a context object containing the ubatches and memory state required to process them
+    // check the llama_memory_context_i::get_status() for the result
+    virtual llama_memory_context_ptr init_batch(
+            llama_batch_allocr & balloc,
             uint32_t n_ubatch,
             bool embd_all) = 0;
 
     // simulate full cache, used for allocating worst-case compute buffers
-    virtual llama_memory_state_ptr init_full() = 0;
+    virtual llama_memory_context_ptr init_full() = 0;
 
     // prepare for any pending memory updates, such as shifts, defrags, etc.
     // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
-    virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0;
+    virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
 
     // getters
     virtual bool get_can_shift() const = 0;
```
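Taken together, the renamed interface implies a specific consumption pattern: initialize a context from a batch, then alternate `apply()` and `next()` over the ubatches. A minimal sketch of that loop, assuming llama.cpp's internal headers are available; `process_batch` and the graph-compute step are hypothetical, not package code:

```cpp
// Sketch only: the driver loop implied by llama_memory_i / llama_memory_context_i.
#include "llama-memory.h"

#include <cstdint>

static bool process_batch(llama_memory_i & mem, llama_batch_allocr & balloc, uint32_t n_ubatch) {
    // split the batch into ubatches and verify that they fit into the memory
    llama_memory_context_ptr mctx = mem.init_batch(balloc, n_ubatch, /*embd_all=*/false);
    if (mctx->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
        return false;
    }
    do {
        // apply() is the only call that mutates the memory
        if (!mctx->apply()) {
            return false;
        }
        const llama_ubatch & ubatch = mctx->get_ubatch();
        (void) ubatch; // ... build and compute the graph for this ubatch ...
    } while (mctx->next()); // consume the current ubatch; false when done
    return true;
}
```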
package/cpp/llama.cpp/src/llama-model-saver.cpp:

```diff
@@ -228,6 +228,7 @@ void llama_model_saver::add_kv_from_model() {
     // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
     add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
     add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_SEP, vocab.get_add_sep());
     add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
     add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
     add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
```
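The new ADD_SEP entry mirrors the existing add_bos/add_eos tokenizer flags. A sketch of writing such a flag with ggml's public GGUF API — the key string below is an assumption inferred from the `tokenizer.ggml.add_bos_token` naming pattern, not quoted from the package:

```cpp
// Sketch only: writing an add-SEP tokenizer flag into a GGUF metadata file.
#include "gguf.h"

int main(void) {
    struct gguf_context * ctx = gguf_init_empty();

    // assumed key name corresponding to LLM_KV_TOKENIZER_ADD_SEP
    gguf_set_val_bool(ctx, "tokenizer.ggml.add_sep_token", true);

    gguf_write_to_file(ctx, "vocab-meta.gguf", /*only_meta=*/true);
    gguf_free(ctx);
    return 0;
}
```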