@fugood/llama.node 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +322 -70
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +154 -13
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
- package/src/llama.cpp/include/llama.h +8 -0
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +568 -41
- package/src/llama.cpp/src/llama-model.h +18 -0
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0
package/src/llama.cpp/src/llama-graph.cpp

@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     std::vector<int> target_pos(n_seqs_unq, -1);
     std::vector<int> target_row(n_seqs_unq, -1);

-    bool last =
+    const bool last = (
+        cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+        (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+    );

     for (int i = 0; i < n_tokens; ++i) {
         const llama_pos pos = ubatch->pos[i];
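Illustrative example (not part of the package diff): a minimal standalone sketch of the last-token selection that llm_graph_input_cls::set_input now also applies for RANK pooling on Qwen3 — each sequence's pooled row is the batch row holding its highest position. Only the target_pos/target_row naming mirrors the diff; the function and values below are made up.

#include <cstdio>
#include <vector>

// For each sequence, find the batch row that holds its highest position.
std::vector<int> last_token_rows(const std::vector<int> & seq_id,
                                 const std::vector<int> & pos,
                                 int n_seqs) {
    std::vector<int> target_pos(n_seqs, -1);
    std::vector<int> target_row(n_seqs, -1);
    for (int i = 0; i < (int) pos.size(); ++i) {
        const int s = seq_id[i];
        if (pos[i] >= target_pos[s]) {
            target_pos[s] = pos[i];
            target_row[s] = i;
        }
    }
    return target_row;
}

int main() {
    // two sequences interleaved in one batch of 5 tokens
    const std::vector<int> seq = {0, 0, 1, 0, 1};
    const std::vector<int> pos = {0, 1, 0, 2, 1};
    for (int row : last_token_rows(seq, pos, 2)) {
        std::printf("last-token row: %d\n", row); // prints rows 3 and 4
    }
    return 0;
}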
@@ -920,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         selection_probs = logits;
     }

+    if (arch == LLM_ARCH_GROVEMOE) {
+        selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);

-
-
+    if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
+        // TODO: Use scalar div instead when/if implemented
+        ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
+        selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
+        probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
+    } else {
+        probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
+    }
+
+    ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);

+
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
         weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]

@@ -952,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(weights, "ffn_moe_weights_scaled", il);
     }

+    //call early so that topk-moe can be used
+    ggml_build_forward_expand(gf, weights);
+
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

     if (weight_before_ffn) {
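Illustrative example (not part of the package diff): the index arithmetic behind the GroveMoE branch above. ggml has no scalar integer divide, so the diff casts the selected expert ids to F32, scales them by 1.0f / n_group_experts, and casts back to I32; for non-negative ids and a power-of-two group size (the value 4 below is made up) this matches plain integer division.

#include <cstdio>
#include <vector>

int main() {
    const int n_group_experts = 4;                   // experts per group (made-up value)
    const std::vector<int> selected = {5, 12, 0, 7}; // top-k expert ids from the router

    for (int e : selected) {
        // float path used in the diff: cast, scale by 1/n, truncate back to int ...
        const int via_float = (int) ((float) e * (1.0f / (float) n_group_experts));
        // ... which for these values equals plain integer division
        const int via_int = e / n_group_experts;
        std::printf("expert %2d -> group expert %d (%d)\n", e, via_float, via_int);
    }
    return 0;
}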
@@ -1177,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
 }

 ggml_tensor * llm_graph_context::build_inp_cls() const {
-    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
+    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);

     auto & cur = inp->cls;


@@ -1833,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }

+void llm_graph_context::build_dense_out(
+        ggml_tensor * dense_2,
+        ggml_tensor * dense_3) const {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+        return;
+    }
+    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    cb(cur, "result_embd_pooled", -1);
+    res->t_embd_pooled = cur;
+    ggml_build_forward_expand(gf, cur);
+}
+
+
 void llm_graph_context::build_pooling(
         ggml_tensor * cls,
         ggml_tensor * cls_b,
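Illustrative example (not part of the package diff): what the new build_dense_out graph computes, reduced to plain vectors — the pooled embedding is pushed through two bias-free dense layers (the 2_Dense and 3_Dense modules of a sentence-transformers checkpoint). The dimensions and weights below are invented for the example.

#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>; // [rows][cols]

// y = W * x
std::vector<float> matvec(const Matrix & w, const std::vector<float> & x) {
    std::vector<float> y(w.size(), 0.0f);
    for (size_t r = 0; r < w.size(); ++r) {
        for (size_t c = 0; c < x.size(); ++c) {
            y[r] += w[r][c] * x[c];
        }
    }
    return y;
}

int main() {
    const std::vector<float> pooled = {0.5f, -1.0f, 2.0f};   // pooled embedding, n_embd = 3
    const Matrix dense_2 = {{1, 0, 0}, {0, 1, 1}};           // 3 -> 2 (dense_2_feat_in/out)
    const Matrix dense_3 = {{1, 1}, {0, 2}, {1, 0}, {0, 1}}; // 2 -> 4 (dense_3_feat_in/out)

    const std::vector<float> out = matvec(dense_3, matvec(dense_2, pooled));
    for (float v : out) std::printf("%g ", v); // final embedding, dense_3_feat_out = 4
    std::printf("\n");
    return 0;
}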
@@ -1877,34 +1914,32 @@ void llm_graph_context::build_pooling(
         case LLAMA_POOLING_TYPE_RANK:
             {
                 ggml_tensor * inp_cls = build_inp_cls();
-
+                cur = ggml_get_rows(ctx0, inp, inp_cls);

+                // classification head
+                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
                 if (cls) {
-
-                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                    cur = ggml_mul_mat(ctx0, cls, inp);
+                    cur = ggml_mul_mat(ctx0, cls, cur);
                     if (cls_b) {
                         cur = ggml_add(ctx0, cur, cls_b);
                     }
                     cur = ggml_tanh(ctx0, cur);
+                }

-
-
-
-
-
-
-                        }
-                    }
-                } else if (cls_out) {
-                    // Single layer classification head (direct projection)
-                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-                    cur = ggml_mul_mat(ctx0, cls_out, inp);
+                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                // Single layer classification head (direct projection)
+                // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
+                if (cls_out) {
+                    cur = ggml_mul_mat(ctx0, cls_out, cur);
                     if (cls_out_b) {
                         cur = ggml_add(ctx0, cur, cls_out_b);
                     }
-                }
-
+                }
+
+                // softmax for qwen3 reranker
+                if (arch == LLM_ARCH_QWEN3) {
+                    cur = ggml_soft_max(ctx0, cur);
                 }
             } break;
         default:
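Illustrative example (not part of the package diff): the softmax that the RANK pooling path now applies for LLM_ARCH_QWEN3 rerankers, written out for a plain vector of classification-head logits. The two-logit interpretation and the values are only assumptions for the example.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// numerically stable softmax over a vector of logits
std::vector<float> softmax(std::vector<float> x) {
    const float max_x = *std::max_element(x.begin(), x.end());
    float sum = 0.0f;
    for (float & v : x) { v = std::exp(v - max_x); sum += v; } // subtract max for stability
    for (float & v : x) v /= sum;
    return x;
}

int main() {
    // made-up classification-head logits for one query/document pair
    const std::vector<float> p = softmax({2.1f, -0.3f});
    std::printf("scores: %.3f %.3f\n", p[0], p[1]);
    return 0;
}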
package/src/llama.cpp/src/llama-graph.h

@@ -206,7 +206,7 @@ public:

 class llm_graph_input_cls : public llm_graph_input_i {
 public:
-    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
     virtual ~llm_graph_input_cls() = default;

     void set_input(const llama_ubatch * ubatch) override;

@@ -214,6 +214,7 @@ public:
     ggml_tensor * cls; // I32 [n_batch]

     const llama_cparams cparams;
+    const llm_arch arch;
 };

 class llm_graph_input_rs : public llm_graph_input_i {

@@ -813,6 +814,14 @@ struct llm_graph_context {
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
+
+    //
+    // dense (out)
+    //
+
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
 };

 // TODO: better name
package/src/llama.cpp/src/llama-hparams.h

@@ -42,7 +42,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
-
+    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head

@@ -69,10 +69,13 @@ struct llama_hparams {
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
     uint32_t n_ff_shexp = 0;
+    uint32_t n_ff_chexp = 0;
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups = 0;
+    uint32_t n_group_experts = 0;

-    float
+    float expert_group_scale = 0.05f;
+    float expert_weights_scale = 0.0f;
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;

@@ -166,6 +169,18 @@ struct llama_hparams {
     uint32_t laurel_rank = 64;
     uint32_t n_embd_altup = 256;

+    // needed for sentence-transformers dense layers
+    uint32_t dense_2_feat_in = 0;  // in_features of the 2_Dense
+    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
+    uint32_t dense_3_feat_in = 0;  // in_features of the 3_Dense
+    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense
+
+    // xIELU
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
+
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
package/src/llama.cpp/src/llama-kv-cache-iswa.cpp

@@ -220,7 +220,7 @@ bool llama_kv_cache_iswa::get_can_shift() const {
 }

 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags &
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_write(io, seq_id, flags);
     }


@@ -228,7 +228,7 @@ void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id
 }

 void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags &
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_read(io, seq_id, flags);
     }

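Illustrative example (not part of the package diff): the flag test used in both hunks above, shown as a standalone bitmask check. The enum name and bit value below are stand-ins (the diff uses LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); the point is that the base cache is only serialized when the partial-only bit is clear.

#include <cstdint>
#include <cstdio>

// stand-in flag; the real llama.cpp constant may use a different bit value
enum state_seq_flags : uint32_t {
    STATE_SEQ_FLAGS_PARTIAL_ONLY = 1u << 0,
};

void state_write(uint32_t flags) {
    if ((flags & STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
        std::printf("writing base cache state\n"); // skipped for partial-only saves
    }
    std::printf("writing remaining state\n");      // always written
}

int main() {
    state_write(0);                            // full save: both parts
    state_write(STATE_SEQ_FLAGS_PARTIAL_ONLY); // partial save: base part skipped
    return 0;
}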
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }

-        ggml_tensor * k;
-        ggml_tensor * v;
-
-        k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
-        v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);

         ggml_format_name(k, "cache_k_l%d", il);
         ggml_format_name(v, "cache_v_l%d", il);
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -73,7 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }

         if (ubatch.n_tokens == 0) {
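Illustrative example (not part of the package diff): a loose sketch of what a "sequential equal split" means — the batch is cut into contiguous micro-batches of at most n_ubatch tokens, taken in order. The real balloc.split_equal() does considerably more bookkeeping (sequence ids, outputs, per-sequence balancing); this only shows the chunking idea.

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// cut n_tokens into contiguous chunks of at most n_ubatch tokens, in order
std::vector<std::pair<int, int>> split_sequential(int n_tokens, int n_ubatch) {
    std::vector<std::pair<int, int>> chunks; // (first token, count)
    for (int i = 0; i < n_tokens; i += n_ubatch) {
        chunks.push_back({i, std::min(n_ubatch, n_tokens - i)});
    }
    return chunks;
}

int main() {
    for (const auto & c : split_sequential(10, 4)) {
        std::printf("ubatch: tokens [%d, %d)\n", c.first, c.first + c.second);
    }
    return 0;
}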
@@ -175,17 +177,17 @@ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdo
 }

 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-
-
-
-    mem_recr->state_write(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_write(io, seq_id, flags);
+    }
+    mem_recr->state_write(io, seq_id, flags);
 }

 void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-
-
-
-    mem_recr->state_read(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_read(io, seq_id, flags);
+    }
+    mem_recr->state_read(io, seq_id, flags);
 }

 llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -136,6 +136,7 @@ void llama_memory_recurrent::clear(bool data) {
 }

 bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
     uint32_t new_head = size;

     if (p0 < 0) {

@@ -156,7 +157,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
             // partial intersection is invalid
-            if ((0 < p0 && p0
+            if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }
             // invalidate tails which will be cleared
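Illustrative example (not part of the package diff): why seq_rm treats a partial intersection as invalid. A recurrent cell keeps a single rolled-up state at one position, so a removal range can only be honored if it leaves the cell untouched or wipes its whole history; cutting into the middle cannot be represented. The predicate below is an illustration, not the exact expression from the diff.

#include <cstdio>

// can the range [p0, p1) be removed from a cell whose state sits at cell_pos?
bool can_remove(int cell_pos, int p0, int p1) {
    const bool untouched = p1 <= 0 || p0 > cell_pos; // range ends before / starts after the cell
    const bool wipes_all = p0 <= 0 && p1 > cell_pos; // range covers the whole history
    return untouched || wipes_all;
}

int main() {
    std::printf("%d\n", can_remove(5, 0, 10)); // 1: removes everything
    std::printf("%d\n", can_remove(5, 3, 10)); // 0: partial cut -> seq_rm would return false
    std::printf("%d\n", can_remove(5, 6, 10)); // 1: nothing stored in that range
    return 0;
}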
@@ -167,6 +169,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     } else {
         // seq_id is negative, then the range should include everything or nothing
         if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+            //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
             return false;
         }
     }

@@ -379,7 +382,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }

         if (ubatch.n_tokens == 0) {
@@ -856,9 +861,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
     if (dest_seq_id != -1) {
         // single sequence
-
         seq_rm(dest_seq_id, -1, -1);

+        if (cell_count == 0) {
+            return true;
+        }
+
         llama_batch_allocr balloc(hparams.n_pos_per_embd());

         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
package/src/llama.cpp/src/llama-model-loader.cpp

@@ -465,6 +465,8 @@ namespace GGUFMeta {
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+

 llama_model_loader::llama_model_loader(
         const std::string & fname,