@fugood/llama.node 0.3.13 → 0.3.15
This diff lists the changes between these two publicly released package versions as they appear in their public registry; it is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0

package/src/llama.cpp/src/CMakeLists.txt
@@ -15,18 +15,21 @@ add_library(llama
             llama-chat.cpp
             llama-context.cpp
             llama-grammar.cpp
+            llama-graph.cpp
             llama-hparams.cpp
             llama-impl.cpp
+            llama-io.cpp
             llama-kv-cache.cpp
+            llama-memory.cpp
             llama-mmap.cpp
             llama-model-loader.cpp
             llama-model.cpp
             llama-quant.cpp
             llama-sampling.cpp
             llama-vocab.cpp
-            unicode.h
-            unicode.cpp
             unicode-data.cpp
+            unicode.cpp
+            unicode.h
             )
 
 target_include_directories(llama PUBLIC . ../include ../common)

package/src/llama.cpp/src/llama-adapter.cpp
@@ -4,14 +4,13 @@
 #include "llama-mmap.h"
 #include "llama-model.h"
 
-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 
 // vec
 
-
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     return tensors[il];
 }
 
-
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-
+            ggml_init_params params = {
                 /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     return true;
 }
 
-
+bool llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
         // disable the current control vector (but leave allocated for later)
         layer_start = -1;
         layer_end = -1;
-        return
+        return true;
     }
 
     if (n_embd != (int) hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return
+        return false;
     }
 
     if (tensors.empty()) {
         if (!init(model)) {
-            return
+            return false;
         }
     }
 
@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
         }
     }
 
-    return
+    return true;
 }
 
 // lora
 
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
-
+    gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
         /* .ctx = */ &ctx_init,
     };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
-
+            ggml_init_params params = {
                 /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,
@@ -264,7 +263,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
-
+        ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +280,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
 
         // save tensor to adapter
-
-
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +307,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     {
         llama_file gguf_file(path_lora, "rb");
         std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
@@ -327,8 +326,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-
-
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +341,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
     return nullptr;
 }
 
-void llama_adapter_lora_free(
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
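
The hunks above change llama_adapter_cvec::apply from an int32_t status code (visible in the old hunk context "int32_t llama_adapter_cvec::apply(") to a plain bool. As a rough illustration of what that shift looks like from calling code, here is a minimal, self-contained sketch; apply_control_vector is a hypothetical stand-in, not the library's actual entry point:

#include <cstdio>
#include <vector>

// Hypothetical stand-in with the new bool-style result; the real method lives on
// llama_adapter_cvec and also takes the model and a layer range.
static bool apply_control_vector(const std::vector<float> & data, int n_embd) {
    if (data.empty()) {
        // disabling the control vector still counts as success
        return true;
    }
    if ((int) data.size() % n_embd != 0) {
        std::fprintf(stderr, "control vector n_embd does not match model\n");
        return false;
    }
    return true;
}

int main() {
    std::vector<float> cvec(4096 * 2, 0.0f);
    // Callers that previously compared an int32_t status against 0 now just test the bool.
    if (!apply_control_vector(cvec, 4096)) {
        std::fprintf(stderr, "failed to apply control vector\n");
        return 1;
    }
    std::printf("control vector applied\n");
    return 0;
}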

package/src/llama.cpp/src/llama-adapter.h
@@ -15,11 +15,11 @@
 //
 
 struct llama_adapter_cvec {
-
+    ggml_tensor * tensor_for(int il) const;
 
-
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;
 
-
+    bool apply(
             const llama_model & model,
             const float * data,
             size_t len,
@@ -36,7 +36,7 @@ private:
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
-    std::vector<
+    std::vector<ggml_tensor *> tensors; // per layer
 };
 
 //
@@ -44,8 +44,8 @@ private:
 //
 
 struct llama_adapter_lora_weight {
-
-
+    ggml_tensor * a = nullptr;
+    ggml_tensor * b = nullptr;
 
     // get actual scale based on rank and alpha
     float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
     }
 
     llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };
 
 struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string,
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;
 
-    llama_adapter_lora_weight * get_weight(
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
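
The last added line introduces llama_adapter_loras as an alias for std::unordered_map<llama_adapter_lora *, float>, i.e. the set of active LoRA adapters paired with a per-adapter scale. A minimal sketch of that shape, using a stand-in struct because llama_adapter_lora itself is internal to the library:

#include <cstdio>
#include <string>
#include <unordered_map>

// Stand-in for the library-internal llama_adapter_lora; only the pointer identity matters here.
struct lora_adapter_stub {
    std::string path; // illustrative field: where the adapter came from
};

// Mirrors: using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
using lora_scales = std::unordered_map<lora_adapter_stub *, float>;

int main() {
    lora_adapter_stub style{"style.gguf"};
    lora_adapter_stub domain{"domain.gguf"};

    // Each active adapter is keyed by its pointer and carries the scale to apply it at.
    lora_scales loras;
    loras[&style]  = 1.0f;
    loras[&domain] = 0.5f;

    for (const auto & [adapter, scale] : loras) {
        std::printf("%s -> scale %.2f\n", adapter->path.c_str(), scale);
    }
    return 0;
}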

package/src/llama.cpp/src/llama-arch.cpp
@@ -36,6 +36,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM3, "minicpm3" },
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_GEMMA2, "gemma2" },
+    { LLM_ARCH_GEMMA3, "gemma3" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
@@ -58,6 +59,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
     { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
+    { LLM_ARCH_RWKV7, "rwkv7" },
+    { LLM_ARCH_ARWKV7, "arwkv7" },
     { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -109,22 +112,26 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
 
-    { LLM_KV_ATTENTION_HEAD_COUNT,
-    { LLM_KV_ATTENTION_HEAD_COUNT_KV,
-    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
-    { LLM_KV_ATTENTION_CLAMP_KQV,
-    { LLM_KV_ATTENTION_KEY_LENGTH,
-    { LLM_KV_ATTENTION_VALUE_LENGTH,
-    { LLM_KV_ATTENTION_LAYERNORM_EPS,
-    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
-    { LLM_KV_ATTENTION_GROUPNORM_EPS,
-    { LLM_KV_ATTENTION_GROUPNORM_GROUPS,
-    { LLM_KV_ATTENTION_CAUSAL,
-    { LLM_KV_ATTENTION_Q_LORA_RANK,
-    { LLM_KV_ATTENTION_KV_LORA_RANK,
-    {
-    {
-    {
+    { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+    { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+    { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+    { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+    { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+    { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+    { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+    { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+    { LLM_KV_ATTENTION_DECAY_LORA_RANK, "%s.attention.decay_lora_rank" },
+    { LLM_KV_ATTENTION_ICLR_LORA_RANK, "%s.attention.iclr_lora_rank" },
+    { LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, "%s.attention.value_residual_mix_lora_rank" },
+    { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -766,6 +773,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1217,6 +1244,74 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_RWKV7,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" },
+            { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" },
+            { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" },
+            { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" },
+            { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" },
+            { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" },
+            { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" },
+            { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" },
+            { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" },
+            { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" },
+            { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" },
+            { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
+            { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
+            { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
+        },
+    },
+    {
+        LLM_ARCH_ARWKV7,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" },
+            { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" },
+            { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" },
+            { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" },
+            { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" },
+            { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" },
+            { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" },
+            { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" },
+            { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" },
+            { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" },
+            { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" },
+            { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_GRANITE,
         {
@@ -1376,6 +1471,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_A2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_V1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_V2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_G1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_G2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_DECAY_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_DECAY_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -1394,6 +1495,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_K_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_K_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_R_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LERP_W, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
@@ -1401,6 +1505,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_W0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_A0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_V0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
     {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
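
For orientation, the "%d" in these per-layer tensor name patterns is substituted with the layer index when tensors are looked up in the GGUF file. A small illustrative sketch of that expansion (the helper below is hypothetical, not llama.cpp's actual lookup code):

#include <cstdio>
#include <string>

// Hypothetical helper: expand a "%d" tensor-name pattern for one layer.
static std::string tensor_name_for_layer(const char * pattern, int il) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), pattern, il);
    return std::string(buf);
}

int main() {
    // Patterns taken from the new LLM_ARCH_RWKV7 entries above.
    std::printf("%s\n", tensor_name_for_layer("blk.%d.time_mix_w0", 0).c_str());   // blk.0.time_mix_w0
    std::printf("%s\n", tensor_name_for_layer("blk.%d.time_mix_r_k", 11).c_str()); // blk.11.time_mix_r_k
    return 0;
}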

package/src/llama.cpp/src/llama-arch.h
@@ -40,6 +40,7 @@ enum llm_arch {
     LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
+    LLM_ARCH_GEMMA3,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -62,6 +63,8 @@ enum llm_arch {
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
     LLM_ARCH_RWKV6QWEN2,
+    LLM_ARCH_RWKV7,
+    LLM_ARCH_ARWKV7,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
@@ -126,6 +129,10 @@ enum llm_kv {
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
+    LLM_KV_ATTENTION_DECAY_LORA_RANK,
+    LLM_KV_ATTENTION_ICLR_LORA_RANK,
+    LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
+    LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
@@ -249,8 +256,20 @@ enum llm_tensor {
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
+    LLM_TENSOR_TIME_MIX_A0,
+    LLM_TENSOR_TIME_MIX_A1,
+    LLM_TENSOR_TIME_MIX_A2,
+    LLM_TENSOR_TIME_MIX_V0,
+    LLM_TENSOR_TIME_MIX_V1,
+    LLM_TENSOR_TIME_MIX_V2,
+    LLM_TENSOR_TIME_MIX_G1,
+    LLM_TENSOR_TIME_MIX_G2,
+    LLM_TENSOR_TIME_MIX_K_K,
+    LLM_TENSOR_TIME_MIX_K_A,
+    LLM_TENSOR_TIME_MIX_R_K,
     LLM_TENSOR_TIME_MIX_LERP_X,
     LLM_TENSOR_TIME_MIX_LERP_W,
     LLM_TENSOR_TIME_MIX_LERP_K,

package/src/llama.cpp/src/llama-batch.h
@@ -42,9 +42,9 @@ struct llama_sbatch {
     bool logits_all; // TODO: remove once lctx.logits_all is removed too
 
     // sorted indices into the batch
-    std::vector<
+    std::vector<int64_t> ids;
     // batch indices of the output
-    std::vector<
+    std::vector<int64_t> out_ids;
     std::vector<llama_sbatch_seq> seq;
 
     const llama_batch * batch = nullptr;
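
The llama_sbatch change above only widens ids and out_ids to std::vector<int64_t>, but the surrounding comments ("sorted indices into the batch") hint at how the structure is used: the batch data stays in place and an index vector is sorted instead. A minimal, assumption-laden sketch of that idea, unrelated to the library's actual sorting criteria:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // Made-up per-token sequence ids standing in for a llama_batch.
    std::vector<int32_t> seq_id = {1, 0, 1, 0, 2};

    // Sorted indices into the batch, now carried as int64_t.
    std::vector<int64_t> ids(seq_id.size());
    std::iota(ids.begin(), ids.end(), 0);

    // Group tokens by sequence without moving the underlying batch data.
    std::stable_sort(ids.begin(), ids.end(), [&](int64_t a, int64_t b) {
        return seq_id[a] < seq_id[b];
    });

    for (int64_t i : ids) {
        std::printf("token %lld (seq %d)\n", (long long) i, seq_id[i]);
    }
    return 0;
}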