@fugood/llama.node 1.4.13 → 1.4.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +23 -2
- package/lib/index.js +2 -1
- package/lib/index.ts +8 -1
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -12
- package/src/LlamaContext.cpp +16 -4
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +3 -34
- package/src/llama.cpp/common/arg.cpp +183 -60
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +67 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +2 -1
- package/src/llama.cpp/common/common.h +12 -7
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +88 -369
- package/src/llama.cpp/common/download.h +32 -5
- package/src/llama.cpp/common/preset.cpp +87 -2
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +5 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-mmap.cpp +78 -42
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +225 -101
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +63 -27
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|
|
111
111
|
}
|
|
112
112
|
}
|
|
113
113
|
for (size_t i = 0; i < ret.size(); i++) {
|
|
114
|
-
size_t free
|
|
114
|
+
size_t free;
|
|
115
|
+
size_t total;
|
|
115
116
|
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
|
117
|
+
|
|
118
|
+
// devices can return 0 bytes for free and total memory if they do not
|
|
119
|
+
// have any to report. in this case, we will use the host memory as a fallback
|
|
120
|
+
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
|
121
|
+
if (free == 0 && total == 0) {
|
|
122
|
+
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
123
|
+
if (cpu_dev == nullptr) {
|
|
124
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
125
|
+
}
|
|
126
|
+
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
|
127
|
+
}
|
|
116
128
|
ret[i].free = free;
|
|
117
129
|
ret[i].total = total;
|
|
118
130
|
}
|
|
@@ -147,9 +159,8 @@ class llama_params_fit_exception : public std::runtime_error {
|
|
|
147
159
|
static void llama_params_fit_impl(
|
|
148
160
|
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
|
149
161
|
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
|
150
|
-
size_t
|
|
162
|
+
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
|
151
163
|
constexpr int64_t MiB = 1024*1024;
|
|
152
|
-
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
|
153
164
|
typedef std::vector<llama_device_memory_data> dmds_t;
|
|
154
165
|
const llama_model_params default_mparams = llama_model_default_params();
|
|
155
166
|
|
|
@@ -168,6 +179,12 @@ static void llama_params_fit_impl(
|
|
|
168
179
|
return;
|
|
169
180
|
}
|
|
170
181
|
|
|
182
|
+
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
|
183
|
+
margins.reserve(nd);
|
|
184
|
+
for (size_t id = 0; id < nd; id++) {
|
|
185
|
+
margins.push_back(margins_s[id]);
|
|
186
|
+
}
|
|
187
|
+
|
|
171
188
|
std::vector<std::string> dev_names;
|
|
172
189
|
{
|
|
173
190
|
dev_names.reserve(nd);
|
|
@@ -187,9 +204,10 @@ static void llama_params_fit_impl(
|
|
|
187
204
|
|
|
188
205
|
int64_t sum_free = 0;
|
|
189
206
|
int64_t sum_projected_free = 0;
|
|
190
|
-
int64_t min_projected_free = INT64_MAX;
|
|
191
207
|
int64_t sum_projected_used = 0;
|
|
192
208
|
int64_t sum_projected_model = 0;
|
|
209
|
+
std::vector<int64_t> projected_free_per_device;
|
|
210
|
+
projected_free_per_device.reserve(nd);
|
|
193
211
|
|
|
194
212
|
if (nd > 1) {
|
|
195
213
|
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
|
@@ -199,45 +217,63 @@ static void llama_params_fit_impl(
|
|
|
199
217
|
|
|
200
218
|
const int64_t projected_used = dmd.mb.total();
|
|
201
219
|
const int64_t projected_free = dmd.free - projected_used;
|
|
220
|
+
projected_free_per_device.push_back(projected_free);
|
|
202
221
|
|
|
203
222
|
sum_free += dmd.free;
|
|
204
223
|
sum_projected_used += projected_used;
|
|
205
224
|
sum_projected_free += projected_free;
|
|
206
|
-
min_projected_free = std::min(min_projected_free, projected_free);
|
|
207
225
|
sum_projected_model += dmd.mb.model;
|
|
208
226
|
|
|
209
227
|
if (nd > 1) {
|
|
210
|
-
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %
|
|
211
|
-
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB,
|
|
212
|
-
projected_free >= 0 ? "surplus" : "deficit");
|
|
228
|
+
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
|
229
|
+
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
|
213
230
|
}
|
|
214
231
|
}
|
|
215
232
|
assert(sum_free >= 0 && sum_projected_used >= 0);
|
|
216
233
|
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
|
217
234
|
__func__, sum_projected_used/MiB, sum_free/MiB);
|
|
218
|
-
if (
|
|
219
|
-
if (
|
|
235
|
+
if (nd == 1) {
|
|
236
|
+
if (projected_free_per_device[0] >= margins[0]) {
|
|
220
237
|
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
|
221
|
-
__func__,
|
|
238
|
+
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
|
239
|
+
return;
|
|
240
|
+
}
|
|
241
|
+
} else {
|
|
242
|
+
bool changes_needed = false;
|
|
243
|
+
for (size_t id = 0; id < nd; id++) {
|
|
244
|
+
if (projected_free_per_device[id] < margins[id]) {
|
|
245
|
+
changes_needed = true;
|
|
246
|
+
break;
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
if (!changes_needed) {
|
|
250
|
+
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
|
222
251
|
return;
|
|
223
252
|
}
|
|
224
|
-
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
|
|
225
|
-
__func__, min_projected_free/MiB, margin/MiB);
|
|
226
|
-
return;
|
|
227
253
|
}
|
|
228
254
|
|
|
229
255
|
// step 2: try reducing memory use by reducing the context size
|
|
230
256
|
|
|
231
257
|
{
|
|
232
|
-
int64_t global_surplus = sum_projected_free
|
|
258
|
+
int64_t global_surplus = sum_projected_free;
|
|
259
|
+
for (size_t id = 0; id < nd; id++) {
|
|
260
|
+
global_surplus -= margins[id];
|
|
261
|
+
}
|
|
233
262
|
if (global_surplus < 0) {
|
|
234
|
-
|
|
235
|
-
"%s: cannot
|
|
236
|
-
|
|
237
|
-
|
|
263
|
+
if (nd == 1) {
|
|
264
|
+
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
|
265
|
+
__func__, margins[0]/MiB, -global_surplus/MiB);
|
|
266
|
+
} else {
|
|
267
|
+
LLAMA_LOG_INFO(
|
|
268
|
+
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
|
269
|
+
__func__, -global_surplus/MiB);
|
|
270
|
+
}
|
|
238
271
|
if (cparams->n_ctx == 0) {
|
|
239
272
|
if (hp_nct > n_ctx_min) {
|
|
240
|
-
int64_t sum_used_target = sum_free
|
|
273
|
+
int64_t sum_used_target = sum_free;
|
|
274
|
+
for (size_t id = 0; id < nd; id++) {
|
|
275
|
+
sum_used_target -= margins[id];
|
|
276
|
+
}
|
|
241
277
|
if (nd > 1) {
|
|
242
278
|
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
|
243
279
|
// - for dense models only whole layers can be assigned to devices
|
|
@@ -448,9 +484,9 @@ static void llama_params_fit_impl(
|
|
|
448
484
|
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
|
|
449
485
|
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
450
486
|
|
|
451
|
-
for (
|
|
452
|
-
global_surplus_cpu_moe +=
|
|
453
|
-
global_surplus_cpu_moe -= int64_t(
|
|
487
|
+
for (size_t id = 0; id < nd; id++) {
|
|
488
|
+
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
|
489
|
+
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
|
454
490
|
}
|
|
455
491
|
|
|
456
492
|
if (global_surplus_cpu_moe > 0) {
|
|
@@ -469,7 +505,7 @@ static void llama_params_fit_impl(
|
|
|
469
505
|
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
|
470
506
|
targets.reserve(nd);
|
|
471
507
|
for (size_t id = 0; id < nd; id++) {
|
|
472
|
-
targets.push_back(dmds_full[id].free -
|
|
508
|
+
targets.push_back(dmds_full[id].free - margins[id]);
|
|
473
509
|
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
|
474
510
|
}
|
|
475
511
|
|
|
@@ -701,11 +737,11 @@ static void llama_params_fit_impl(
|
|
|
701
737
|
enum llama_params_fit_status llama_params_fit(
|
|
702
738
|
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
|
703
739
|
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
|
704
|
-
size_t
|
|
740
|
+
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
|
705
741
|
const int64_t t0_us = llama_time_us();
|
|
706
742
|
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
|
|
707
743
|
try {
|
|
708
|
-
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides,
|
|
744
|
+
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
|
709
745
|
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
|
710
746
|
} catch (const llama_params_fit_exception & e) {
|
|
711
747
|
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
|
@@ -794,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|
|
794
830
|
model.t_start_us = tm.t_start_us;
|
|
795
831
|
|
|
796
832
|
try {
|
|
797
|
-
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
|
833
|
+
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
|
798
834
|
|
|
799
835
|
ml.print_info();
|
|
800
836
|
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#include "models.h"
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
|
|
5
|
+
llm_graph_context(params) {
|
|
6
|
+
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
7
|
+
|
|
8
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
|
9
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
10
|
+
|
|
11
|
+
ggml_tensor * cur;
|
|
12
|
+
ggml_tensor * inpL;
|
|
13
|
+
|
|
14
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
15
|
+
|
|
16
|
+
// inp_pos - contains the positions
|
|
17
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
18
|
+
|
|
19
|
+
auto * inp_attn_iswa = build_attn_inp_kv_iswa();
|
|
20
|
+
|
|
21
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
22
|
+
|
|
23
|
+
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
|
|
24
|
+
for (int il = 0; il < n_transformer_layers; ++il) {
|
|
25
|
+
ggml_tensor * inpSA = inpL;
|
|
26
|
+
|
|
27
|
+
// use RoPE for SWA layers
|
|
28
|
+
const bool is_local_layer = hparams.is_swa(il);
|
|
29
|
+
|
|
30
|
+
// norm
|
|
31
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
32
|
+
cb(cur, "attn_norm", il);
|
|
33
|
+
|
|
34
|
+
// self-attention
|
|
35
|
+
{
|
|
36
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
37
|
+
|
|
38
|
+
// compute Q and K and RoPE them
|
|
39
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
40
|
+
cb(Qcur, "Qcur", il);
|
|
41
|
+
|
|
42
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
43
|
+
cb(Kcur, "Kcur", il);
|
|
44
|
+
|
|
45
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
46
|
+
cb(Vcur, "Vcur", il);
|
|
47
|
+
|
|
48
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
49
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
50
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
51
|
+
|
|
52
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
53
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
54
|
+
cb(Qcur, "Qcur_normed", il);
|
|
55
|
+
cb(Kcur, "Kcur_normed", il);
|
|
56
|
+
|
|
57
|
+
if (is_local_layer) {
|
|
58
|
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
|
|
59
|
+
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
|
60
|
+
|
|
61
|
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
|
|
62
|
+
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
|
63
|
+
}
|
|
64
|
+
cb(Qcur, "Qcur", il);
|
|
65
|
+
cb(Kcur, "Kcur", il);
|
|
66
|
+
cb(Vcur, "Vcur", il);
|
|
67
|
+
|
|
68
|
+
cur = build_attn(inp_attn_iswa,
|
|
69
|
+
model.layers[il].wo, NULL,
|
|
70
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
|
71
|
+
cb(cur, "attn_out", il);
|
|
72
|
+
}
|
|
73
|
+
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
|
74
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
75
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
76
|
+
}
|
|
77
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
78
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
79
|
+
|
|
80
|
+
// norm
|
|
81
|
+
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
82
|
+
cb(cur, "ffn_norm", il);
|
|
83
|
+
|
|
84
|
+
// feed-forward network
|
|
85
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
86
|
+
// dense branch
|
|
87
|
+
cur = build_ffn(cur,
|
|
88
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
89
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
90
|
+
model.layers[il].ffn_down, NULL, NULL, NULL,
|
|
91
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
92
|
+
cb(cur, "ffn_out", il);
|
|
93
|
+
} else {
|
|
94
|
+
// MoE branch
|
|
95
|
+
ggml_tensor * moe_out = build_moe_ffn(cur,
|
|
96
|
+
model.layers[il].ffn_gate_inp,
|
|
97
|
+
model.layers[il].ffn_up_exps,
|
|
98
|
+
model.layers[il].ffn_gate_exps,
|
|
99
|
+
model.layers[il].ffn_down_exps,
|
|
100
|
+
model.layers[il].ffn_exp_probs_b,
|
|
101
|
+
n_expert, n_expert_used,
|
|
102
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
103
|
+
true, hparams.expert_weights_scale,
|
|
104
|
+
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
105
|
+
il);
|
|
106
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
107
|
+
|
|
108
|
+
// FFN shared expert
|
|
109
|
+
{
|
|
110
|
+
ggml_tensor * ffn_shexp =
|
|
111
|
+
build_ffn(cur,
|
|
112
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
113
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
114
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
115
|
+
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
116
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
117
|
+
|
|
118
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
119
|
+
cb(cur, "ffn_out", il);
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
124
|
+
|
|
125
|
+
cur = build_cvec(cur, il);
|
|
126
|
+
cb(cur, "l_out", il);
|
|
127
|
+
|
|
128
|
+
// input for next layer
|
|
129
|
+
inpL = cur;
|
|
130
|
+
}
|
|
131
|
+
cur = inpL;
|
|
132
|
+
|
|
133
|
+
// final norm
|
|
134
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
135
|
+
|
|
136
|
+
cb(cur, "result_norm", -1);
|
|
137
|
+
res->t_embd = cur;
|
|
138
|
+
|
|
139
|
+
// lm_head
|
|
140
|
+
cur = build_lora_mm(model.output, cur);
|
|
141
|
+
|
|
142
|
+
cb(cur, "result_output", -1);
|
|
143
|
+
res->t_logits = cur;
|
|
144
|
+
|
|
145
|
+
ggml_build_forward_expand(gf, cur);
|
|
146
|
+
}
|
|
@@ -255,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
|
|
|
255
255
|
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
|
|
256
256
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
|
|
257
257
|
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
|
258
|
+
res->add_input(std::move(inp));
|
|
258
259
|
} else {
|
|
259
|
-
|
|
260
|
+
// Vision embedding path: use padding token (ID=0) embedding
|
|
261
|
+
// TODO: verify if this is the correct behavior in transformers implementation
|
|
262
|
+
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
|
|
263
|
+
|
|
264
|
+
// Extract and dequantize padding token embedding (row 0)
|
|
265
|
+
ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
|
|
266
|
+
inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
|
|
267
|
+
|
|
268
|
+
// Reshape to [n_embd_altup, n_layer, 1]
|
|
269
|
+
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
|
|
270
|
+
cb(inp_per_layer, "inp_per_layer_vision", -1);
|
|
260
271
|
}
|
|
261
|
-
res->add_input(std::move(inp));
|
|
262
272
|
return inp_per_layer;
|
|
263
273
|
}
|
|
264
274
|
|
|
@@ -276,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
|
|
|
276
286
|
-1); // [n_embd_altup, n_layer, n_tokens]
|
|
277
287
|
cb(per_layer_proj, "per_layer_proj", -1);
|
|
278
288
|
|
|
279
|
-
inp_per_layer = ggml_add(ctx0,
|
|
289
|
+
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
|
|
280
290
|
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
|
281
291
|
cb(inp_per_layer, "inp_per_layer", -1);
|
|
282
292
|
|
|
@@ -167,6 +167,10 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
167
167
|
llm_build_exaone(const llama_model & model, const llm_graph_params & params);
|
|
168
168
|
};
|
|
169
169
|
|
|
170
|
+
struct llm_build_exaone_moe : public llm_graph_context {
|
|
171
|
+
llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
|
|
172
|
+
};
|
|
173
|
+
|
|
170
174
|
struct llm_build_falcon : public llm_graph_context {
|
|
171
175
|
llm_build_falcon(const llama_model & model, const llm_graph_params & params);
|
|
172
176
|
};
|
|
@@ -466,7 +470,8 @@ private:
|
|
|
466
470
|
ggml_tensor * cur,
|
|
467
471
|
int il);
|
|
468
472
|
|
|
469
|
-
|
|
473
|
+
// returns pair of output and new state
|
|
474
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
|
|
470
475
|
ggml_tensor * q,
|
|
471
476
|
ggml_tensor * k,
|
|
472
477
|
ggml_tensor * v,
|
|
@@ -478,7 +483,8 @@ private:
|
|
|
478
483
|
ggml_tensor * diag_mask,
|
|
479
484
|
int il);
|
|
480
485
|
|
|
481
|
-
|
|
486
|
+
// returns pair of output and new state
|
|
487
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
|
|
482
488
|
ggml_tensor * q,
|
|
483
489
|
ggml_tensor * k,
|
|
484
490
|
ggml_tensor * v,
|
|
@@ -493,6 +499,11 @@ private:
|
|
|
493
499
|
ggml_tensor * gate,
|
|
494
500
|
int layer);
|
|
495
501
|
|
|
502
|
+
// returns pair of qkv, z
|
|
503
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
|
|
504
|
+
ggml_tensor * input,
|
|
505
|
+
int il);
|
|
506
|
+
|
|
496
507
|
const llama_model & model;
|
|
497
508
|
};
|
|
498
509
|
|