@fugood/llama.node 1.4.13 → 1.4.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/lib/binding.ts +23 -2
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +8 -1
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -12
  7. package/src/LlamaContext.cpp +16 -4
  8. package/src/llama.cpp/CMakeLists.txt +24 -8
  9. package/src/llama.cpp/common/CMakeLists.txt +3 -34
  10. package/src/llama.cpp/common/arg.cpp +183 -60
  11. package/src/llama.cpp/common/arg.h +0 -8
  12. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  13. package/src/llama.cpp/common/chat.cpp +67 -0
  14. package/src/llama.cpp/common/chat.h +1 -0
  15. package/src/llama.cpp/common/common.cpp +2 -1
  16. package/src/llama.cpp/common/common.h +12 -7
  17. package/src/llama.cpp/common/debug.cpp +165 -0
  18. package/src/llama.cpp/common/debug.h +43 -0
  19. package/src/llama.cpp/common/download.cpp +88 -369
  20. package/src/llama.cpp/common/download.h +32 -5
  21. package/src/llama.cpp/common/preset.cpp +87 -2
  22. package/src/llama.cpp/common/preset.h +10 -1
  23. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  24. package/src/llama.cpp/include/llama.h +5 -2
  25. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  27. package/src/llama.cpp/src/llama-arch.h +1 -0
  28. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  29. package/src/llama.cpp/src/llama-chat.h +1 -0
  30. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  31. package/src/llama.cpp/src/llama-mmap.cpp +78 -42
  32. package/src/llama.cpp/src/llama-mmap.h +5 -4
  33. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  34. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  35. package/src/llama.cpp/src/llama-model.cpp +225 -101
  36. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  37. package/src/llama.cpp/src/llama-sampling.cpp +1 -1
  38. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  39. package/src/llama.cpp/src/llama-vocab.h +1 -0
  40. package/src/llama.cpp/src/llama.cpp +63 -27
  41. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  42. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  43. package/src/llama.cpp/src/models/models.h +13 -2
  44. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
111
111
  }
112
112
  }
113
113
  for (size_t i = 0; i < ret.size(); i++) {
114
- size_t free, total;
114
+ size_t free;
115
+ size_t total;
115
116
  ggml_backend_dev_memory(model->devices[i], &free, &total);
117
+
118
+ // devices can return 0 bytes for free and total memory if they do not
119
+ // have any to report. in this case, we will use the host memory as a fallback
120
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
121
+ if (free == 0 && total == 0) {
122
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
123
+ if (cpu_dev == nullptr) {
124
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
125
+ }
126
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
127
+ }
116
128
  ret[i].free = free;
117
129
  ret[i].total = total;
118
130
  }
@@ -147,9 +159,8 @@ class llama_params_fit_exception : public std::runtime_error {
147
159
  static void llama_params_fit_impl(
148
160
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
149
161
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
150
- size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
162
+ size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
151
163
  constexpr int64_t MiB = 1024*1024;
152
- const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
153
164
  typedef std::vector<llama_device_memory_data> dmds_t;
154
165
  const llama_model_params default_mparams = llama_model_default_params();
155
166
 
@@ -168,6 +179,12 @@ static void llama_params_fit_impl(
168
179
  return;
169
180
  }
170
181
 
182
+ std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
183
+ margins.reserve(nd);
184
+ for (size_t id = 0; id < nd; id++) {
185
+ margins.push_back(margins_s[id]);
186
+ }
187
+
171
188
  std::vector<std::string> dev_names;
172
189
  {
173
190
  dev_names.reserve(nd);
@@ -187,9 +204,10 @@ static void llama_params_fit_impl(
187
204
 
188
205
  int64_t sum_free = 0;
189
206
  int64_t sum_projected_free = 0;
190
- int64_t min_projected_free = INT64_MAX;
191
207
  int64_t sum_projected_used = 0;
192
208
  int64_t sum_projected_model = 0;
209
+ std::vector<int64_t> projected_free_per_device;
210
+ projected_free_per_device.reserve(nd);
193
211
 
194
212
  if (nd > 1) {
195
213
  LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -199,45 +217,63 @@ static void llama_params_fit_impl(
199
217
 
200
218
  const int64_t projected_used = dmd.mb.total();
201
219
  const int64_t projected_free = dmd.free - projected_used;
220
+ projected_free_per_device.push_back(projected_free);
202
221
 
203
222
  sum_free += dmd.free;
204
223
  sum_projected_used += projected_used;
205
224
  sum_projected_free += projected_free;
206
- min_projected_free = std::min(min_projected_free, projected_free);
207
225
  sum_projected_model += dmd.mb.model;
208
226
 
209
227
  if (nd > 1) {
210
- LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
211
- __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
212
- projected_free >= 0 ? "surplus" : "deficit");
228
+ LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
229
+ __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
213
230
  }
214
231
  }
215
232
  assert(sum_free >= 0 && sum_projected_used >= 0);
216
233
  LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
217
234
  __func__, sum_projected_used/MiB, sum_free/MiB);
218
- if (min_projected_free >= margin) {
219
- if (nd == 1) {
235
+ if (nd == 1) {
236
+ if (projected_free_per_device[0] >= margins[0]) {
220
237
  LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
221
- __func__, min_projected_free/MiB, margin/MiB);
238
+ __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
239
+ return;
240
+ }
241
+ } else {
242
+ bool changes_needed = false;
243
+ for (size_t id = 0; id < nd; id++) {
244
+ if (projected_free_per_device[id] < margins[id]) {
245
+ changes_needed = true;
246
+ break;
247
+ }
248
+ }
249
+ if (!changes_needed) {
250
+ LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
222
251
  return;
223
252
  }
224
- LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
225
- __func__, min_projected_free/MiB, margin/MiB);
226
- return;
227
253
  }
228
254
 
229
255
  // step 2: try reducing memory use by reducing the context size
230
256
 
231
257
  {
232
- int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
258
+ int64_t global_surplus = sum_projected_free;
259
+ for (size_t id = 0; id < nd; id++) {
260
+ global_surplus -= margins[id];
261
+ }
233
262
  if (global_surplus < 0) {
234
- LLAMA_LOG_INFO(nd == 1 ?
235
- "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
236
- "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
237
- __func__, margin/MiB, -global_surplus/MiB);
263
+ if (nd == 1) {
264
+ LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
265
+ __func__, margins[0]/MiB, -global_surplus/MiB);
266
+ } else {
267
+ LLAMA_LOG_INFO(
268
+ "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
269
+ __func__, -global_surplus/MiB);
270
+ }
238
271
  if (cparams->n_ctx == 0) {
239
272
  if (hp_nct > n_ctx_min) {
240
- int64_t sum_used_target = sum_free - nd*margin_s;
273
+ int64_t sum_used_target = sum_free;
274
+ for (size_t id = 0; id < nd; id++) {
275
+ sum_used_target -= margins[id];
276
+ }
241
277
  if (nd > 1) {
242
278
  // for multiple devices we need to be more conservative in terms of how much context we think can fit:
243
279
  // - for dense models only whole layers can be assigned to devices
@@ -448,9 +484,9 @@ static void llama_params_fit_impl(
448
484
  const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
449
485
  path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
450
486
 
451
- for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
452
- global_surplus_cpu_moe += dmd.free;
453
- global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
487
+ for (size_t id = 0; id < nd; id++) {
488
+ global_surplus_cpu_moe += dmds_cpu_moe[id].free;
489
+ global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
454
490
  }
455
491
 
456
492
  if (global_surplus_cpu_moe > 0) {
@@ -469,7 +505,7 @@ static void llama_params_fit_impl(
469
505
  std::vector<int64_t> targets; // maximum acceptable memory use per device
470
506
  targets.reserve(nd);
471
507
  for (size_t id = 0; id < nd; id++) {
472
- targets.push_back(dmds_full[id].free - margin);
508
+ targets.push_back(dmds_full[id].free - margins[id]);
473
509
  LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
474
510
  }
475
511
 
@@ -701,11 +737,11 @@ static void llama_params_fit_impl(
701
737
  enum llama_params_fit_status llama_params_fit(
702
738
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
703
739
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
704
- size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
740
+ size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
705
741
  const int64_t t0_us = llama_time_us();
706
742
  llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
707
743
  try {
708
- llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
744
+ llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
709
745
  LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
710
746
  } catch (const llama_params_fit_exception & e) {
711
747
  LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
@@ -794,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
794
830
  model.t_start_us = tm.t_start_us;
795
831
 
796
832
  try {
797
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
833
+ llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
798
834
 
799
835
  ml.print_info();
800
836
 
@@ -0,0 +1,146 @@
1
+ #include "models.h"
2
+
3
+
4
+ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
5
+ llm_graph_context(params) {
6
+ const int64_t n_embd_head = hparams.n_embd_head_k;
7
+
8
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
9
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10
+
11
+ ggml_tensor * cur;
12
+ ggml_tensor * inpL;
13
+
14
+ inpL = build_inp_embd(model.tok_embd);
15
+
16
+ // inp_pos - contains the positions
17
+ ggml_tensor * inp_pos = build_inp_pos();
18
+
19
+ auto * inp_attn_iswa = build_attn_inp_kv_iswa();
20
+
21
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
22
+
23
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
24
+ for (int il = 0; il < n_transformer_layers; ++il) {
25
+ ggml_tensor * inpSA = inpL;
26
+
27
+ // use RoPE for SWA layers
28
+ const bool is_local_layer = hparams.is_swa(il);
29
+
30
+ // norm
31
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
32
+ cb(cur, "attn_norm", il);
33
+
34
+ // self-attention
35
+ {
36
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
37
+
38
+ // compute Q and K and RoPE them
39
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
40
+ cb(Qcur, "Qcur", il);
41
+
42
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
43
+ cb(Kcur, "Kcur", il);
44
+
45
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
46
+ cb(Vcur, "Vcur", il);
47
+
48
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
49
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
50
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
51
+
52
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
53
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
54
+ cb(Qcur, "Qcur_normed", il);
55
+ cb(Kcur, "Kcur_normed", il);
56
+
57
+ if (is_local_layer) {
58
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
59
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
60
+
61
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
62
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
63
+ }
64
+ cb(Qcur, "Qcur", il);
65
+ cb(Kcur, "Kcur", il);
66
+ cb(Vcur, "Vcur", il);
67
+
68
+ cur = build_attn(inp_attn_iswa,
69
+ model.layers[il].wo, NULL,
70
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
71
+ cb(cur, "attn_out", il);
72
+ }
73
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
74
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
75
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
76
+ }
77
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
78
+ cb(ffn_inp, "ffn_inp", il);
79
+
80
+ // norm
81
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
82
+ cb(cur, "ffn_norm", il);
83
+
84
+ // feed-forward network
85
+ if (model.layers[il].ffn_gate_inp == nullptr) {
86
+ // dense branch
87
+ cur = build_ffn(cur,
88
+ model.layers[il].ffn_up, NULL, NULL,
89
+ model.layers[il].ffn_gate, NULL, NULL,
90
+ model.layers[il].ffn_down, NULL, NULL, NULL,
91
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
92
+ cb(cur, "ffn_out", il);
93
+ } else {
94
+ // MoE branch
95
+ ggml_tensor * moe_out = build_moe_ffn(cur,
96
+ model.layers[il].ffn_gate_inp,
97
+ model.layers[il].ffn_up_exps,
98
+ model.layers[il].ffn_gate_exps,
99
+ model.layers[il].ffn_down_exps,
100
+ model.layers[il].ffn_exp_probs_b,
101
+ n_expert, n_expert_used,
102
+ LLM_FFN_SILU, hparams.expert_weights_norm,
103
+ true, hparams.expert_weights_scale,
104
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
105
+ il);
106
+ cb(moe_out, "ffn_moe_out", il);
107
+
108
+ // FFN shared expert
109
+ {
110
+ ggml_tensor * ffn_shexp =
111
+ build_ffn(cur,
112
+ model.layers[il].ffn_up_shexp, NULL, NULL,
113
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
114
+ model.layers[il].ffn_down_shexp, NULL, NULL,
115
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
116
+ cb(ffn_shexp, "ffn_shexp", il);
117
+
118
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
119
+ cb(cur, "ffn_out", il);
120
+ }
121
+ }
122
+
123
+ cur = ggml_add(ctx0, cur, ffn_inp);
124
+
125
+ cur = build_cvec(cur, il);
126
+ cb(cur, "l_out", il);
127
+
128
+ // input for next layer
129
+ inpL = cur;
130
+ }
131
+ cur = inpL;
132
+
133
+ // final norm
134
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
135
+
136
+ cb(cur, "result_norm", -1);
137
+ res->t_embd = cur;
138
+
139
+ // lm_head
140
+ cur = build_lora_mm(model.output, cur);
141
+
142
+ cb(cur, "result_output", -1);
143
+ res->t_logits = cur;
144
+
145
+ ggml_build_forward_expand(gf, cur);
146
+ }
@@ -255,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
255
255
  inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
256
256
  inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
257
257
  cb(inp_per_layer, "inp_per_layer_selected", -1);
258
+ res->add_input(std::move(inp));
258
259
  } else {
259
- GGML_ABORT("TODO: support embd input");
260
+ // Vision embedding path: use padding token (ID=0) embedding
261
+ // TODO: verify if this is the correct behavior in transformers implementation
262
+ const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
263
+
264
+ // Extract and dequantize padding token embedding (row 0)
265
+ ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
266
+ inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
267
+
268
+ // Reshape to [n_embd_altup, n_layer, 1]
269
+ inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
270
+ cb(inp_per_layer, "inp_per_layer_vision", -1);
260
271
  }
261
- res->add_input(std::move(inp));
262
272
  return inp_per_layer;
263
273
  }
264
274
 
@@ -276,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
276
286
  -1); // [n_embd_altup, n_layer, n_tokens]
277
287
  cb(per_layer_proj, "per_layer_proj", -1);
278
288
 
279
- inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
289
+ inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
280
290
  inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
281
291
  cb(inp_per_layer, "inp_per_layer", -1);
282
292
 
@@ -167,6 +167,10 @@ struct llm_build_exaone : public llm_graph_context {
167
167
  llm_build_exaone(const llama_model & model, const llm_graph_params & params);
168
168
  };
169
169
 
170
+ struct llm_build_exaone_moe : public llm_graph_context {
171
+ llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
172
+ };
173
+
170
174
  struct llm_build_falcon : public llm_graph_context {
171
175
  llm_build_falcon(const llama_model & model, const llm_graph_params & params);
172
176
  };
@@ -466,7 +470,8 @@ private:
466
470
  ggml_tensor * cur,
467
471
  int il);
468
472
 
469
- ggml_tensor * build_delta_net_chunking(
473
+ // returns pair of output and new state
474
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
470
475
  ggml_tensor * q,
471
476
  ggml_tensor * k,
472
477
  ggml_tensor * v,
@@ -478,7 +483,8 @@ private:
478
483
  ggml_tensor * diag_mask,
479
484
  int il);
480
485
 
481
- ggml_tensor * build_delta_net_autoregressive(
486
+ // returns pair of output and new state
487
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
482
488
  ggml_tensor * q,
483
489
  ggml_tensor * k,
484
490
  ggml_tensor * v,
@@ -493,6 +499,11 @@ private:
493
499
  ggml_tensor * gate,
494
500
  int layer);
495
501
 
502
+ // returns pair of qkv, z
503
+ std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
504
+ ggml_tensor * input,
505
+ int il);
506
+
496
507
  const llama_model & model;
497
508
  };
498
509