@fugood/llama.node 1.1.4 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/src/LlamaContext.cpp +3 -0
  4. package/src/llama.cpp/common/arg.cpp +60 -7
  5. package/src/llama.cpp/common/chat.cpp +6 -6
  6. package/src/llama.cpp/common/common.cpp +1 -0
  7. package/src/llama.cpp/common/common.h +14 -5
  8. package/src/llama.cpp/common/speculative.cpp +135 -54
  9. package/src/llama.cpp/common/speculative.h +8 -1
  10. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  15. package/src/llama.cpp/include/llama.h +8 -4
  16. package/src/llama.cpp/src/llama-arch.cpp +40 -0
  17. package/src/llama.cpp/src/llama-arch.h +2 -0
  18. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  19. package/src/llama.cpp/src/llama-chat.cpp +20 -1
  20. package/src/llama.cpp/src/llama-chat.h +1 -0
  21. package/src/llama.cpp/src/llama-context.cpp +11 -2
  22. package/src/llama.cpp/src/llama-context.h +4 -1
  23. package/src/llama.cpp/src/llama-graph.cpp +57 -139
  24. package/src/llama.cpp/src/llama-graph.h +31 -32
  25. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
  26. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  27. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  28. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  29. package/src/llama.cpp/src/llama-model.cpp +400 -21
  30. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  31. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  32. package/src/llama.cpp/src/llama-vocab.h +1 -0
@@ -289,7 +289,10 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
+
+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;
 
     // perf
     mutable int64_t t_start_us = 0;
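This hunk (apparently src/llama-context.h) flips the default of supports_set_rows to true and adds a graph_reuse_disable flag tied to the LLAMA_GRAPH_REUSE_DISABLE environment variable. A minimal sketch, assuming a POSIX environment, of how an application embedding this package could opt out of graph reuse; the helper name is illustrative, not part of the library API, and the variable presumably has to be set before the llama context is created, since the flag lives on the context:

    #include <cstdlib>   // setenv (POSIX)

    // Illustrative helper (not library API): ask llama.cpp to skip graph reuse
    // for this process. Call it before the context is created, since the flag
    // is presumably read from the environment at construction time.
    static void disable_graph_reuse() {
        setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", /*overwrite=*/1);
    }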
@@ -188,38 +188,23 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
 
 void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     const int64_t n_tokens = ubatch->n_tokens;
-    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
     const int64_t n_seqs_unq = ubatch->n_seqs_unq;
 
     if (cparams.embeddings && (
-                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
-                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK
-                )) {
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
+            )) {
         GGML_ASSERT(cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
 
         uint32_t * data = (uint32_t *) cls->data;
         memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
 
-        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[i][s];
-                const int32_t seq_idx = ubatch->seq_idx[seq_id];
-
-                data[seq_idx] = i;
-            }
-        }
-    }
-
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
-        GGML_ASSERT(cls);
-        GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
-
-        uint32_t * data = (uint32_t *) cls->data;
-        memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
+        std::vector<int> target_pos(n_seqs_unq, -1);
+        std::vector<int> target_row(n_seqs_unq, -1);
 
-        std::vector<int> last_pos(n_seqs_unq, -1);
-        std::vector<int> last_row(n_seqs_unq, -1);
+        bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
 
         for (int i = 0; i < n_tokens; ++i) {
             const llama_pos pos = ubatch->pos[i];
@@ -228,16 +213,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
                 const llama_seq_id seq_id = ubatch->seq_id[i][s];
                 const int32_t seq_idx = ubatch->seq_idx[seq_id];
 
-                if (pos >= last_pos[seq_idx]) {
-                    last_pos[seq_idx] = pos;
-                    last_row[seq_idx] = i;
+                if (
+                    (target_pos[seq_idx] == -1) ||
+                    ( last && pos >= target_pos[seq_idx]) ||
+                    (!last && pos <  target_pos[seq_idx])
+                ) {
+                    target_pos[seq_idx] = pos;
+                    target_row[seq_idx] = i;
                 }
             }
         }
 
         for (int s = 0; s < n_seqs_unq; ++s) {
-            if (last_row[s] >= 0) {
-                data[s] = last_row[s];
+            if (target_row[s] >= 0) {
+                data[s] = target_row[s];
             }
         }
     }
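The rewritten llm_graph_input_cls::set_input merges the previously separate CLS/RANK and LAST branches into a single scan: for every unique sequence it tracks one target position and row, keeping the highest position when pooling is LAST and the lowest one otherwise. A standalone sketch of that selection rule on plain arrays, simplified to one sequence id per token (the real code also loops over ubatch->n_seq_id[i]); names are illustrative:

    #include <cstdint>
    #include <vector>

    // Pick one row per sequence: the token with the highest position when
    // last == true (LAST pooling), or the lowest position otherwise (CLS/RANK).
    std::vector<int> pick_rows(const std::vector<int64_t> & pos,
                               const std::vector<int>     & seq_idx,
                               int n_seqs_unq, bool last) {
        std::vector<int64_t> target_pos(n_seqs_unq, -1);
        std::vector<int>     target_row(n_seqs_unq, -1);

        for (int i = 0; i < (int) pos.size(); ++i) {
            const int s = seq_idx[i];
            if (target_pos[s] == -1 ||
                ( last && pos[i] >= target_pos[s]) ||
                (!last && pos[i] <  target_pos[s])) {
                target_pos[s] = pos[i];
                target_row[s] = i;
            }
        }
        return target_row;  // stays -1 where a sequence had no tokens in the batch
    }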
@@ -796,13 +785,20 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         bool scale_w,
         float w_scale,
         llama_expert_gating_func_type gating_op,
-        int il) const {
+        int il,
+        ggml_tensor * probs_in) const {
     const int64_t n_embd = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
 
-    ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
-    cb(logits, "ffn_moe_logits", il);
+    ggml_tensor * logits = nullptr;
+
+    if (probs_in == nullptr) {
+        logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
+        cb(logits, "ffn_moe_logits", il);
+    } else {
+        logits = probs_in;
+    }
 
     ggml_tensor * probs = nullptr;
     switch (gating_op) {
@@ -895,6 +891,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
+        case LLM_FFN_RELU:
+            if (gate_exps) {
+                cur = ggml_reglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_reglu", il);
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cb(cur, "ffn_moe_relu", il);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -938,100 +942,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     return moe_out;
 }
 
-ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
-        ggml_tensor * cur,
-        ggml_tensor * probs,
-        ggml_tensor * up_exps,
-        ggml_tensor * gate_exps,
-        ggml_tensor * down_exps,
-        ggml_tensor * exp_probs_b,
-        int64_t n_expert,
-        int64_t n_expert_used,
-        llama_expert_gating_func_type gating_op,
-        int il) const {
-    const int64_t n_embd = cur->ne[0];
-    const int64_t n_tokens = cur->ne[1];
-
-    // add experts selection bias - introduced in DeepSeek V3
-    // leave probs unbiased as it's later used to get expert weights
-    ggml_tensor * selection_probs = probs;
-    if (exp_probs_b != nullptr) {
-        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
-        cb(selection_probs, "ffn_moe_probs_biased", il);
-    }
-
-    // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
-    cb(selected_experts->src[0], "ffn_moe_argsort", il);
-    cb(selected_experts, "ffn_moe_topk", il);
-
-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-    cb(weights, "ffn_moe_weights", il);
-
-    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
-        weights = ggml_soft_max(ctx0, weights);
-    } else {
-        weights = ggml_sigmoid(ctx0, weights);
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
-        cb(weights_sum, "ffn_moe_weights_sum", il);
-
-        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
-        cb(weights, "ffn_moe_weights_norm", il);
-    }
-
-    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
-
-    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
-
-    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(up, "ffn_moe_up", il);
-
-    ggml_tensor * experts = nullptr;
-    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(cur, "ffn_moe_gate", il);
-
-    cur = ggml_reglu_split(ctx0, cur, up);
-    cb(cur, "ffn_moe_reglu", il);
-
-    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
-    cb(experts, "ffn_moe_down", il);
-
-    experts = ggml_mul(ctx0, experts, weights);
-    cb(cur, "ffn_moe_weighted", il);
-
-    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
-
-    assert(n_expert_used > 0);
-
-    // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
-
-        ggml_build_forward_expand(gf, cur_experts[i]);
-    }
-
-    // aggregate experts
-    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
-    // to avoid potentially a large number of add nodes during warmup
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
-    ggml_tensor * moe_out = cur_experts[0];
-
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
-        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
-    }
-
-    if (n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx0, moe_out);
-    }
-
-    cb(moe_out, "ffn_moe_out", il);
-
-    return moe_out;
-}
-
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     const int64_t n_embd = hparams.n_embd;
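Together with the header change further down, these hunks fold build_moe_ffn_from_probs into build_moe_ffn: precomputed router probabilities are now passed through the new trailing probs_in argument (default nullptr), and the ReLU/ReGLU expert activation goes through the new LLM_FFN_RELU case instead of a dedicated helper. A self-contained sketch of the dispatch idea, using plain ggml_mul_mat as a stand-in for the LoRA-aware build_lora_mm; the function name is illustrative, not part of the library:

    #include "ggml.h"

    // If the caller already has router probabilities (probs_in), skip the gating
    // matmul; otherwise compute the [n_expert, n_tokens] logits from gate_inp.
    static ggml_tensor * moe_router_logits(ggml_context * ctx0,
                                           ggml_tensor  * gate_inp,
                                           ggml_tensor  * cur,
                                           ggml_tensor  * probs_in) {
        if (probs_in != nullptr) {
            return probs_in;                      // reuse what the caller computed
        }
        return ggml_mul_mat(ctx0, gate_inp, cur); // plain stand-in for build_lora_mm
    }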
@@ -1655,16 +1565,17 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
 
 ggml_tensor * llm_graph_context::build_rs(
         ggml_tensor * s,
-        ggml_tensor * state_copy,
+        ggml_tensor * state_copy_main,
+        ggml_tensor * state_copy_extra,
         int32_t state_size,
         int32_t n_seqs,
-        uint32_t n_kv,
-        uint32_t kv_head,
-        uint32_t kv_size,
+        uint32_t n_rs,
+        uint32_t rs_head,
+        uint32_t rs_size,
         int32_t rs_zero,
         const llm_graph_get_rows_fn & get_state_rows) const {
 
-    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_size);
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
 
     // Clear a single state which will then be copied to the other cleared states.
     // Note that this is a no-op when the view is zero-sized.
@@ -1672,39 +1583,44 @@ ggml_tensor * llm_graph_context::build_rs(
     ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
 
     // copy states
-    // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
-    // {state_size, kv_size} -> {state_size, n_seqs}
-    ggml_tensor * output_states = get_state_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
+    // NOTE: assuming the copy destinations are ALL contained between rs_head and rs_head + n_rs
+    // {state_size, rs_size} -> {state_size, n_seqs}
+    ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main);
     ggml_build_forward_expand(gf, output_states);
 
-    // copy extra states which won't be changed further (between n_seqs and n_kv)
-    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
+    // copy extra states which won't be changed further (between n_seqs and n_rs)
+    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra);
     ggml_build_forward_expand(gf,
         ggml_cpy(ctx0,
            states_extra,
-            ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
+            ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
 
     return output_states;
 }
 
 static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
         ggml_context * ctx0,
+        const llama_ubatch & ubatch,
        const llama_memory_recurrent_context * mctx_cur) {
 
     auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
 
-    const auto n_rs = mctx_cur->get_n_rs();
+    const int64_t n_rs = mctx_cur->get_n_rs();
+    const int64_t n_seqs = ubatch.n_seqs;
 
     inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
     ggml_set_input(inp->s_copy);
 
+    inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
+    inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
+
     return inp;
 }
 
 llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
     const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
 
-    auto inp = build_rs_inp_impl(ctx0, mctx_cur);
+    auto inp = build_rs_inp_impl(ctx0, ubatch, mctx_cur);
 
     return (llm_graph_input_rs *) res->add_input(std::move(inp));
 }
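build_rs_inp_impl now receives the ubatch so it can slice s_copy once into two views: the first n_seqs entries, whose states the graph actually rewrites, and the remaining n_rs - n_seqs entries, which are only copied forward. build_rs then consumes the two views directly instead of re-deriving them per layer. A minimal ggml sketch of the same split, assuming a valid ggml_context * ctx0 and the n_rs/n_seqs values from the hunk above:

    // Split an I32 copy-index tensor into "main" and "extra" views, as done once
    // per graph in build_rs_inp_impl above.
    ggml_tensor * s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
    ggml_set_input(s_copy);

    // rows that become graph outputs
    ggml_tensor * s_copy_main  = ggml_view_1d(ctx0, s_copy, n_seqs, 0);
    // rows that are only carried over unchanged
    ggml_tensor * s_copy_extra = ggml_view_1d(ctx0, s_copy, n_rs - n_seqs,
                                              n_seqs * s_copy->nb[0]);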
@@ -1717,7 +1633,9 @@ ggml_tensor * llm_graph_context::build_rs(
        const llm_graph_get_rows_fn & get_state_rows) const {
     const auto * kv_state = inp->mctx;
 
-    return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
+    return build_rs(s, inp->s_copy_main, inp->s_copy_extra, state_size, n_seqs,
+            kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(),
+            get_state_rows);
 }
 
 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
@@ -1764,7 +1682,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
 
-    auto inp_rs = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
+    auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
 
     auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
@@ -144,7 +144,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +158,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +177,8 @@ public:
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };
@@ -192,7 +192,7 @@ public:
 
    ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +204,7 @@ public:
 
    ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -214,7 +214,12 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy; // I32 [kv_size]
+    ggml_tensor * s_copy; // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main;  // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
 };
@@ -247,8 +252,8 @@ public:
     ggml_tensor * kq_mask = nullptr;     // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +283,11 @@ public:
     ggml_tensor * self_kq_mask = nullptr;     // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    //       need to carry these parameters with them. otherwise, they can point to freed
+    //       llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
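These input structs now hold llama_hparams and llama_cparams by value instead of by reference; as the added comment explains, a reused graph can outlive the llm_graph_params of the batch that created it, so a stored reference could dangle (stack-use-after-return). A generic illustration of that hazard, unrelated to the llama.cpp types:

    #include <string>

    struct Params      { std::string name; };
    struct InputByRef  { const Params & p; };  // dangles if the Params dies first
    struct InputByCopy { Params p; };          // owns its own copy, stays valid

    InputByCopy make_input() {
        Params tmp{"batch-local params"};      // destroyed when make_input returns
        // InputByRef bad{tmp};                // would be left dangling
        return InputByCopy{tmp};               // the copy keeps the data alive
    }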
@@ -318,8 +326,8 @@ public:
     ggml_tensor * self_kq_mask_swa = nullptr;     // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };
@@ -415,7 +423,9 @@ struct llm_graph_params {
             (!ubatch.embd && !other.ubatch.embd)
         );
 
-        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
            if (!ubatch.data) {
                // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                // therefore we cannot perform the sequence id check. normally should never happen
@@ -623,19 +633,8 @@ struct llm_graph_context {
            bool scale_w,
            float w_scale,
            llama_expert_gating_func_type gating_op,
-            int il) const;
-
-    ggml_tensor * build_moe_ffn_from_probs(
-            ggml_tensor * cur,
-            ggml_tensor * probs,
-            ggml_tensor * up_exps,
-            ggml_tensor * gate_exps,
-            ggml_tensor * down_exps,
-            ggml_tensor * exp_probs_b,
-            int64_t n_expert,
-            int64_t n_expert_used,
-            llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
     //
     // inputs
@@ -727,7 +726,6 @@ struct llm_graph_context {
     // recurrent
     //
 
-    // TODO: avoid notion of "kv"
     // TODO: move this implementation to llama_memory_recurrent.
     // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
     // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -735,12 +733,13 @@ struct llm_graph_context {
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
             ggml_tensor * s,
-            ggml_tensor * state_copy,
+            ggml_tensor * state_copy_main,
+            ggml_tensor * state_copy_extra,
            int32_t state_size,
            int32_t n_seqs,
-            uint32_t n_kv,
-            uint32_t kv_head,
-            uint32_t kv_size,
+            uint32_t n_rs,
+            uint32_t rs_head,
+            uint32_t rs_size,
            int32_t rs_zero,
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
@@ -183,7 +183,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
            ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
            ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +193,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
 
     if (!supports_set_rows) {
         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
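With the member default flipped to true (next hunk), the constructor now only overrides supports_set_rows when LLAMA_SET_ROWS is actually present: an unset variable keeps the default, and LLAMA_SET_ROWS=0 becomes the explicit opt-out. A tiny standalone demo of that fallback pattern:

    #include <cstdio>
    #include <cstdlib>

    // The compiled-in default wins when the variable is unset; the environment
    // only overrides it when present. Run with and without LLAMA_SET_ROWS=0.
    int main() {
        bool supports_set_rows = true; // mirrors the new header default
        if (const char * v = std::getenv("LLAMA_SET_ROWS")) {
            supports_set_rows = std::atoi(v) != 0;
        }
        std::printf("supports_set_rows = %s\n", supports_set_rows ? "true" : "false");
        return 0;
    }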
@@ -230,7 +230,7 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
 
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         /* common */
         uint32_t n_seq_max,
         bool offload,
+        bool unified,
         /* layer filters */
         layer_filter_cb && filter_attn,
         layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         type_v,
         v_trans,
         offload,
-        1,
+        unified,
         kv_size,
         n_seq_max,
         n_pad,
@@ -39,6 +39,7 @@ public:
         /* common */
         uint32_t n_seq_max,
         bool offload,
+        bool unified,
         /* layer filters */
         layer_filter_cb && filter_attn = nullptr,
         layer_filter_cb && filter_recr = nullptr);