@fugood/llama.node 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +60 -7
- package/src/llama.cpp/common/chat.cpp +6 -6
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +14 -5
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/include/llama.h +8 -4
- package/src/llama.cpp/src/llama-arch.cpp +40 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +20 -1
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +11 -2
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -139
- package/src/llama.cpp/src/llama-graph.h +31 -32
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +400 -21
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
@@ -289,7 +289,10 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows =
+    bool supports_set_rows = true;
+
+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;
 
     // perf
     mutable int64_t t_start_us = 0;
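The hunk above flips the default of `supports_set_rows` to `true` and introduces a `graph_reuse_disable` flag. Judging by the `// env:` comments and the `LLAMA_SET_ROWS` handling that appears later in this diff, both are meant to be overridable from the environment. A minimal sketch of that getenv pattern, assuming the same convention for `LLAMA_GRAPH_REUSE_DISABLE` (its actual wiring is not shown in this section):

```cpp
#include <cstdlib>  // std::getenv, std::atoi

// Sketch of the env-override pattern used in this diff: keep the compiled-in
// default unless the environment variable is set. The kv-cache hunk further
// down does exactly this for LLAMA_SET_ROWS; the LLAMA_GRAPH_REUSE_DISABLE
// wiring is assumed here, not shown in this section.
static bool env_flag(const char * name, bool def) {
    const char * val = std::getenv(name);
    return val ? std::atoi(val) != 0 : def;
}

int main() {
    const bool supports_set_rows   = env_flag("LLAMA_SET_ROWS",           /*def =*/ true);
    const bool graph_reuse_disable = env_flag("LLAMA_GRAPH_REUSE_DISABLE", /*def =*/ false);
    return supports_set_rows && !graph_reuse_disable ? 0 : 1;
}
```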
@@ -188,38 +188,23 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
 
 void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     const int64_t n_tokens = ubatch->n_tokens;
-    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
     const int64_t n_seqs_unq = ubatch->n_seqs_unq;
 
     if (cparams.embeddings && (
-        …
-        …
-        …
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
+            )) {
         GGML_ASSERT(cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
 
         uint32_t * data = (uint32_t *) cls->data;
         memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
 
-        …
-        …
-                const llama_seq_id seq_id = ubatch->seq_id[i][s];
-                const int32_t seq_idx = ubatch->seq_idx[seq_id];
-        …
-                data[seq_idx] = i;
-            }
-        }
-    }
-
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
-        GGML_ASSERT(cls);
-        GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
-
-        uint32_t * data = (uint32_t *) cls->data;
-        memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
+        std::vector<int> target_pos(n_seqs_unq, -1);
+        std::vector<int> target_row(n_seqs_unq, -1);
 
-        …
-        std::vector<int> last_row(n_seqs_unq, -1);
+        bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
 
         for (int i = 0; i < n_tokens; ++i) {
             const llama_pos pos = ubatch->pos[i];
@@ -228,16 +213,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
                 const llama_seq_id seq_id = ubatch->seq_id[i][s];
                 const int32_t seq_idx = ubatch->seq_idx[seq_id];
 
-                if (
-                …
-                …
+                if (
+                    (target_pos[seq_idx] == -1) ||
+                    ( last && pos >= target_pos[seq_idx]) ||
+                    (!last && pos < target_pos[seq_idx])
+                ) {
+                    target_pos[seq_idx] = pos;
+                    target_row[seq_idx] = i;
                 }
             }
         }
 
         for (int s = 0; s < n_seqs_unq; ++s) {
-            if (
-                data[s] =
+            if (target_row[s] >= 0) {
+                data[s] = target_row[s];
             }
         }
     }
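The rewritten `llm_graph_input_cls::set_input` replaces the separate CLS/RANK and LAST pooling branches with a single pass that tracks, per sequence, either the first or the last token depending on the pooling type. A standalone sketch of that selection rule, with plain vectors standing in for the `llama_ubatch` fields (names here are illustrative only):

```cpp
#include <cstdio>
#include <vector>

// Sketch of the selection logic added above: for each sequence keep the row of
// its last token when "last" pooling is used, otherwise the row of its first
// token (CLS/RANK pooling).
int main() {
    const std::vector<int> pos    = {0, 1, 2, 0, 1};  // token positions
    const std::vector<int> seq_id = {0, 0, 0, 1, 1};  // one sequence id per token
    const int  n_seqs = 2;
    const bool last   = true;  // LLAMA_POOLING_TYPE_LAST

    std::vector<int> target_pos(n_seqs, -1);
    std::vector<int> target_row(n_seqs, -1);

    for (int i = 0; i < (int) pos.size(); ++i) {
        const int s = seq_id[i];
        if (target_pos[s] == -1 ||
            ( last && pos[i] >= target_pos[s]) ||
            (!last && pos[i] <  target_pos[s])) {
            target_pos[s] = pos[i];
            target_row[s] = i;
        }
    }

    for (int s = 0; s < n_seqs; ++s) {
        std::printf("seq %d -> row %d\n", s, target_row[s]);  // seq 0 -> 2, seq 1 -> 4
    }
}
```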
@@ -796,13 +785,20 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         bool scale_w,
         float w_scale,
         llama_expert_gating_func_type gating_op,
-        int il
+        int il,
+        ggml_tensor * probs_in) const {
     const int64_t n_embd = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
 
-    ggml_tensor * logits =
-    …
+    ggml_tensor * logits = nullptr;
+
+    if (probs_in == nullptr) {
+        logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
+        cb(logits, "ffn_moe_logits", il);
+    } else {
+        logits = probs_in;
+    }
 
     ggml_tensor * probs = nullptr;
     switch (gating_op) {
@@ -895,6 +891,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
+        case LLM_FFN_RELU:
+            if (gate_exps) {
+                cur = ggml_reglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_reglu", il);
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cb(cur, "ffn_moe_relu", il);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
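The new `LLM_FFN_RELU` case uses `ggml_reglu_split` when a gate projection is present. Assuming that op follows the usual ReGLU definition (elementwise `relu(gate) * up`, analogous to the GEGLU/SwiGLU split variants), the math reduces to the following scalar sketch:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Scalar sketch of ReGLU as assumed above: relu(gate) * up, elementwise.
// This mirrors what the ggml_reglu_split call is expected to compute on tensors.
std::vector<float> reglu(const std::vector<float> & gate, const std::vector<float> & up) {
    std::vector<float> out(gate.size());
    for (size_t i = 0; i < gate.size(); ++i) {
        out[i] = std::max(0.0f, gate[i]) * up[i];
    }
    return out;
}

int main() {
    const auto y = reglu({-1.0f, 2.0f}, {3.0f, 4.0f});
    std::printf("%.1f %.1f\n", y[0], y[1]);  // 0.0 8.0
}
```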
@@ -938,100 +942,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     return moe_out;
 }
 
-ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
-        ggml_tensor * cur,
-        ggml_tensor * probs,
-        ggml_tensor * up_exps,
-        ggml_tensor * gate_exps,
-        ggml_tensor * down_exps,
-        ggml_tensor * exp_probs_b,
-        int64_t n_expert,
-        int64_t n_expert_used,
-        llama_expert_gating_func_type gating_op,
-        int il) const {
-    const int64_t n_embd = cur->ne[0];
-    const int64_t n_tokens = cur->ne[1];
-
-    // add experts selection bias - introduced in DeepSeek V3
-    // leave probs unbiased as it's later used to get expert weights
-    ggml_tensor * selection_probs = probs;
-    if (exp_probs_b != nullptr) {
-        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
-        cb(selection_probs, "ffn_moe_probs_biased", il);
-    }
-
-    // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
-    cb(selected_experts->src[0], "ffn_moe_argsort", il);
-    cb(selected_experts, "ffn_moe_topk", il);
-
-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-    cb(weights, "ffn_moe_weights", il);
-
-    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
-        weights = ggml_soft_max(ctx0, weights);
-    } else {
-        weights = ggml_sigmoid(ctx0, weights);
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
-        cb(weights_sum, "ffn_moe_weights_sum", il);
-
-        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
-        cb(weights, "ffn_moe_weights_norm", il);
-    }
-
-    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
-
-    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
-
-    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(up, "ffn_moe_up", il);
-
-    ggml_tensor * experts = nullptr;
-    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(cur, "ffn_moe_gate", il);
-
-    cur = ggml_reglu_split(ctx0, cur, up);
-    cb(cur, "ffn_moe_reglu", il);
-
-    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
-    cb(experts, "ffn_moe_down", il);
-
-    experts = ggml_mul(ctx0, experts, weights);
-    cb(cur, "ffn_moe_weighted", il);
-
-    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
-
-    assert(n_expert_used > 0);
-
-    // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
-
-        ggml_build_forward_expand(gf, cur_experts[i]);
-    }
-
-    // aggregate experts
-    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
-    // to avoid potentially a large number of add nodes during warmup
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
-    ggml_tensor * moe_out = cur_experts[0];
-
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
-        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
-    }
-
-    if (n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx0, moe_out);
-    }
-
-    cb(moe_out, "ffn_moe_out", il);
-
-    return moe_out;
-}
-
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     const int64_t n_embd = hparams.n_embd;
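The deleted `build_moe_ffn_from_probs` selected the top-k experts from precomputed probabilities and normalized their weights with either a softmax or a sigmoid plus renormalization, depending on the gating function; this behaviour now lives in `build_moe_ffn` via the `probs_in` path. A plain scalar sketch of that routing step for one token (no ggml, illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// Scalar sketch of the routing done by the removed helper: pick the k most
// probable experts, then normalize their weights either with a softmax or with
// sigmoid + renormalization (the two gating modes shown in the removed code).
int main() {
    const std::vector<float> probs = {0.1f, 2.0f, 0.5f, 1.5f};  // per-expert scores
    const int  k = 2;
    const bool softmax_gating = true;

    // top-k expert indices (the ggml_top_k step)
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });
    idx.resize(k);

    // gather the selected scores and normalize them into expert weights
    std::vector<float> w(k);
    if (softmax_gating) {
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) { w[i] = std::exp(probs[idx[i]]); sum += w[i]; }
        for (auto & x : w) { x /= sum; }
    } else {
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) { w[i] = 1.0f / (1.0f + std::exp(-probs[idx[i]])); sum += w[i]; }
        for (auto & x : w) { x /= sum; }
    }

    for (int i = 0; i < k; ++i) {
        std::printf("expert %d weight %.3f\n", idx[i], w[i]);
    }
}
```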
@@ -1655,16 +1565,17 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
 
 ggml_tensor * llm_graph_context::build_rs(
         ggml_tensor * s,
-        ggml_tensor *
+        ggml_tensor * state_copy_main,
+        ggml_tensor * state_copy_extra,
         int32_t state_size,
         int32_t n_seqs,
-        uint32_t
-        uint32_t
-        uint32_t
+        uint32_t n_rs,
+        uint32_t rs_head,
+        uint32_t rs_size,
         int32_t rs_zero,
         const llm_graph_get_rows_fn & get_state_rows) const {
 
-    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size,
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
 
     // Clear a single state which will then be copied to the other cleared states.
     // Note that this is a no-op when the view is zero-sized.
@@ -1672,39 +1583,44 @@ ggml_tensor * llm_graph_context::build_rs(
     ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
 
     // copy states
-    // NOTE: assuming the copy destinations are ALL contained between
-    // {state_size,
-    ggml_tensor * output_states = get_state_rows(ctx0, states,
+    // NOTE: assuming the copy destinations are ALL contained between rs_head and rs_head + n_rs
+    // {state_size, rs_size} -> {state_size, n_seqs}
+    ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main);
     ggml_build_forward_expand(gf, output_states);
 
-    // copy extra states which won't be changed further (between n_seqs and
-    ggml_tensor * states_extra = ggml_get_rows(ctx0, states,
+    // copy extra states which won't be changed further (between n_seqs and n_rs)
+    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra);
     ggml_build_forward_expand(gf,
         ggml_cpy(ctx0,
             states_extra,
-            ggml_view_1d(ctx0, s, state_size*(
+            ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
 
     return output_states;
 }
 
 static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
         ggml_context * ctx0,
+        const llama_ubatch & ubatch,
         const llama_memory_recurrent_context * mctx_cur) {
 
     auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
 
-    const
+    const int64_t n_rs = mctx_cur->get_n_rs();
+    const int64_t n_seqs = ubatch.n_seqs;
 
     inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
     ggml_set_input(inp->s_copy);
 
+    inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
+    inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
+
     return inp;
 }
 
 llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
     const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
 
-    auto inp = build_rs_inp_impl(ctx0, mctx_cur);
+    auto inp = build_rs_inp_impl(ctx0, ubatch, mctx_cur);
 
     return (llm_graph_input_rs *) res->add_input(std::move(inp));
 }
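`build_rs_inp_impl` now creates the `s_copy_main` / `s_copy_extra` views once per graph instead of letting `build_rs` slice `s_copy` itself: the first view covers the `n_seqs` states the current ubatch actually updates, the second the remaining `n_rs - n_seqs` states that are only copied forward. A plain-index sketch of that split (illustrative, no ggml):

```cpp
#include <cstdio>
#include <vector>

// Sketch of the s_copy split introduced above: one I32 buffer of n_rs source
// row indices, viewed as a "main" part (rows the ubatch updates) and an
// "extra" part (rows that are only copied forward unchanged).
int main() {
    const int n_rs   = 5;  // total recurrent states tracked by the memory
    const int n_seqs = 2;  // sequences present in the current ubatch

    std::vector<int> s_copy(n_rs);
    for (int i = 0; i < n_rs; ++i) { s_copy[i] = i; }  // filled by set_input in practice

    // s_copy_main  = view of the first n_seqs entries (offset 0)
    // s_copy_extra = view of the remaining n_rs - n_seqs entries
    const int * s_copy_main  = s_copy.data();
    const int * s_copy_extra = s_copy.data() + n_seqs;

    for (int i = 0; i < n_seqs; ++i)        { std::printf("main[%d]  = %d\n", i, s_copy_main[i]); }
    for (int i = 0; i < n_rs - n_seqs; ++i) { std::printf("extra[%d] = %d\n", i, s_copy_extra[i]); }
}
```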
@@ -1717,7 +1633,9 @@ ggml_tensor * llm_graph_context::build_rs(
         const llm_graph_get_rows_fn & get_state_rows) const {
     const auto * kv_state = inp->mctx;
 
-    return build_rs(s, inp->
+    return build_rs(s, inp->s_copy_main, inp->s_copy_extra, state_size, n_seqs,
+            kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(),
+            get_state_rows);
 }
 
 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
@@ -1764,7 +1682,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
 
-    auto inp_rs = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
+    auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
 
     auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
@@ -144,7 +144,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +158,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +177,8 @@ public:
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams
-    const llama_cparams
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };
@@ -192,7 +192,7 @@ public:
 
     ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +204,7 @@ public:
 
     ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -214,7 +214,12 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy;
+    ggml_tensor * s_copy; // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main; // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
 };
@@ -247,8 +252,8 @@ public:
     ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams
-    const llama_cparams
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +283,11 @@ public:
     ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
 
-    …
-    …
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    //       need to carry these parameters with them. otherwise, they can point to freed
+    //       llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
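The comment added to `llm_graph_input_attn_kv_unified` explains why the inputs now store `hparams` / `cparams` by value: a reused graph can outlive the `llm_graph_params` of the batch that built it, so a stored reference could dangle. A minimal illustration of that hazard, using hypothetical `params` / `input_*` types rather than the llama.cpp ones:

```cpp
#include <cstdio>
#include <memory>

// Hypothetical illustration of the lifetime issue described in the comment
// above: holding a reference ties the input's lifetime to the params object
// that built it, while holding a copy keeps the reused graph self-contained.
struct params { int n_ctx; };

struct input_by_ref  { const params & p; };  // dangles if the params object dies first
struct input_by_copy { params p;         };  // safe to keep around for graph reuse

std::unique_ptr<input_by_copy> build_input() {
    params tmp{4096};                                             // lives only for this batch
    return std::make_unique<input_by_copy>(input_by_copy{tmp});   // the copy survives
}

int main() {
    auto inp = build_input();
    std::printf("n_ctx = %d\n", inp->p.n_ctx);  // fine: the copy is owned by inp
}
```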
@@ -318,8 +326,8 @@ public:
     ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams
-    const llama_cparams
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };
@@ -415,7 +423,9 @@ struct llm_graph_params {
             (!ubatch.embd && !other.ubatch.embd)
         );
 
-        …
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
             if (!ubatch.data) {
                 // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                 // therefore we cannot perform the sequence id check. normally should never happen
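The added check only allows ubatch reuse under `equal_seqs()` when the participating sequences match, since different sequences imply different attention streams. A small sketch of such a "same participating sequences" comparison, with plain containers standing in for the `llama_ubatch` fields:

```cpp
#include <cstdio>
#include <set>
#include <vector>

// Sketch of the idea behind the added check: two ubatches can only share a
// graph if the set of sequence ids they touch is identical.
static bool same_seqs(const std::vector<int> & a, const std::vector<int> & b) {
    return std::set<int>(a.begin(), a.end()) == std::set<int>(b.begin(), b.end());
}

int main() {
    std::printf("%d\n", same_seqs({0, 0, 1}, {1, 0}));  // 1: both touch sequences {0, 1}
    std::printf("%d\n", same_seqs({0, 1},    {0, 2}));  // 0: different sequence sets
}
```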
@@ -623,19 +633,8 @@ struct llm_graph_context {
             bool scale_w,
             float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il
-
-    ggml_tensor * build_moe_ffn_from_probs(
-            ggml_tensor * cur,
-            ggml_tensor * probs,
-            ggml_tensor * up_exps,
-            ggml_tensor * gate_exps,
-            ggml_tensor * down_exps,
-            ggml_tensor * exp_probs_b,
-            int64_t n_expert,
-            int64_t n_expert_used,
-            llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
     //
     // inputs
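In the header, `build_moe_ffn_from_probs` is dropped and `build_moe_ffn` instead takes a trailing `probs_in` defaulted to `nullptr`, so existing call sites keep compiling unchanged while new callers can pass precomputed router probabilities. A generic example of that defaulted-trailing-parameter pattern (names are illustrative, not the llama.cpp API):

```cpp
#include <cstdio>

// Illustrative only: a trailing pointer parameter defaulted to nullptr keeps
// old call sites valid while letting new callers inject a precomputed value,
// mirroring how probs_in was added to build_moe_ffn above.
static int ffn(int x, const int * probs_in = nullptr) {
    return probs_in ? *probs_in : x * 2;  // compute unless a value was supplied
}

int main() {
    const int precomputed = 7;
    std::printf("%d %d\n", ffn(3), ffn(3, &precomputed));  // 6 7
}
```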
@@ -727,7 +726,6 @@ struct llm_graph_context {
     // recurrent
     //
 
-    // TODO: avoid notion of "kv"
     // TODO: move this implementation to llama_memory_recurrent.
     // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
     // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -735,12 +733,13 @@ struct llm_graph_context {
     //   `llama_memory_recurrent`
     ggml_tensor * build_rs(
             ggml_tensor * s,
-            ggml_tensor *
+            ggml_tensor * state_copy_main,
+            ggml_tensor * state_copy_extra,
             int32_t state_size,
             int32_t n_seqs,
-            uint32_t
-            uint32_t
-            uint32_t
+            uint32_t n_rs,
+            uint32_t rs_head,
+            uint32_t rs_size,
             int32_t rs_zero,
             const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
@@ -183,7 +183,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
             (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +193,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 :
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
 
     if (!supports_set_rows) {
         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     /* common */
         uint32_t n_seq_max,
         bool offload,
+        bool unified,
     /* layer filters */
         layer_filter_cb && filter_attn,
         layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         type_v,
         v_trans,
         offload,
-        …
+        unified,
         kv_size,
         n_seq_max,
         n_pad,