@fugood/llama.node 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/src/LlamaContext.cpp +20 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/common/arg.cpp +13 -4
- package/src/llama.cpp/common/chat.cpp +33 -2
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -197
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
- package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
- package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
- package/src/llama.cpp/src/llama-memory.h +3 -8
- package/src/llama.cpp/src/llama-model.cpp +449 -246
- package/src/llama.cpp/src/llama-model.h +2 -0
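
The hunks below cover three of the listed files. The headline change is llama.cpp dropping the "unified" prefix from its KV cache types (`llama_kv_cache_unified` → `llama_kv_cache`, `llama_kv_cache_unified_iswa` → `llama_kv_cache_iswa`, plus the matching `*_context` and graph-input types) and folding `build_attn_with_sinks` into `build_attn` by threading a `sinks` tensor through the attention builders. As a reading aid, here is a minimal, self-contained sketch of the delegation pattern the iSWA cache implements — stand-in types only, not the real llama.cpp headers:

```cpp
// Minimal sketch of the iSWA delegation pattern (stand-in types, not the
// real llama.cpp headers): one cache for the non-SWA (full-context)
// layers, one for the sliding-window layers; mutations go to both.
#include <cstdint>
#include <memory>

struct kv_cache {                        // stand-in for llama_kv_cache
    virtual ~kv_cache() = default;
    virtual void clear(bool data) = 0;
    virtual bool seq_rm(int32_t seq_id, int32_t p0, int32_t p1) = 0;
};

class kv_cache_iswa {                    // stand-in for llama_kv_cache_iswa
public:
    kv_cache_iswa(std::unique_ptr<kv_cache> base, std::unique_ptr<kv_cache> swa)
        : kv_base(std::move(base)), kv_swa(std::move(swa)) {}

    void clear(bool data) {
        kv_base->clear(data);
        kv_swa ->clear(data);
    }

    // succeeds only if the removal succeeds in both caches
    bool seq_rm(int32_t seq_id, int32_t p0, int32_t p1) {
        bool res = true;
        res = res & kv_base->seq_rm(seq_id, p0, p1);
        res = res & kv_swa ->seq_rm(seq_id, p0, p1);
        return res;
    }

private:
    std::unique_ptr<kv_cache> kv_base;   // non-SWA layers
    std::unique_ptr<kv_cache> kv_swa;    // SWA layers
};
```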
package/src/llama.cpp/src/llama-graph.h:

```diff
@@ -19,8 +19,8 @@ struct llama_cparams;
 
 struct llama_memory_context_i;
 
-class llama_kv_cache_unified_context;
-class llama_kv_cache_unified_iswa_context;
+class llama_kv_cache_context;
+class llama_kv_cache_iswa_context;
 class llama_memory_recurrent_context;
 class llama_memory_hybrid_context;
 
@@ -152,7 +152,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 public:
     llm_graph_input_pos_bucket_kv(
             const llama_hparams & hparams,
-            const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {}
+            const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
     virtual ~llm_graph_input_pos_bucket_kv() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
@@ -161,7 +161,7 @@ public:
 
     const llama_hparams hparams;
 
-    const llama_kv_cache_unified_context * mctx;
+    const llama_kv_cache_context * mctx;
 };
 
 class llm_graph_input_out_ids : public llm_graph_input_i {
@@ -257,17 +257,17 @@ public:
     const llama_cparams cparams;
 };
 
-class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
+class llm_graph_input_attn_kv : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_kv_unified(
+    llm_graph_input_attn_kv(
             const llama_hparams & hparams,
             const llama_cparams & cparams,
-            const llama_kv_cache_unified_context * mctx) :
+            const llama_kv_cache_context * mctx) :
         hparams(hparams),
         cparams(cparams),
         mctx(mctx) {
     }
-    ~llm_graph_input_attn_kv_unified() = default;
+    ~llm_graph_input_attn_kv() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
@@ -290,20 +290,20 @@ public:
     const llama_hparams hparams;
     const llama_cparams cparams;
 
-    const llama_kv_cache_unified_context * mctx;
+    const llama_kv_cache_context * mctx;
 };
 
-class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_kv_unified_iswa(
+    llm_graph_input_attn_kv_iswa(
             const llama_hparams & hparams,
             const llama_cparams & cparams,
-            const llama_kv_cache_unified_iswa_context * mctx) :
+            const llama_kv_cache_iswa_context * mctx) :
         hparams(hparams),
         cparams(cparams),
         mctx(mctx) {
     }
-    ~llm_graph_input_attn_kv_unified_iswa() = default;
+    ~llm_graph_input_attn_kv_iswa() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
@@ -330,7 +330,7 @@ public:
     const llama_hparams hparams;
     const llama_cparams cparams;
 
-    const llama_kv_cache_unified_iswa_context * mctx;
+    const llama_kv_cache_iswa_context * mctx;
 };
 
 class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -351,7 +351,7 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
-        std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
+        std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
         std::unique_ptr<llm_graph_input_rs> inp_rs,
         const llama_memory_hybrid_context * mctx) :
         inp_attn(std::move(inp_attn)),
@@ -361,11 +361,11 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
-    std::unique_ptr<llm_graph_input_rs>               inp_rs;
+    std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
+    std::unique_ptr<llm_graph_input_rs>      inp_rs;
 
-    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
-    llm_graph_input_rs *              get_recr() const { return inp_rs.get(); }
+    llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
+    llm_graph_input_rs *      get_recr() const { return inp_rs.get(); }
 
     const llama_memory_hybrid_context * mctx;
 };
@@ -680,14 +680,14 @@ struct llm_graph_context {
     //
 
     ggml_tensor * build_attn_mha(
-        ggml_tensor * q,     // [n_embd_head_q, n_head_q, n_tokens]
-        ggml_tensor * k,     // [n_embd_head_k, n_head_k, n_tokens]
-        ggml_tensor * v,     // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
-        ggml_tensor * kq_b,
-        ggml_tensor * kq_mask,
-        ggml_tensor * sinks,
-        ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-              float   kq_scale) const;
+            ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
+            ggml_tensor * kq_b,
+            ggml_tensor * kq_mask,
+            ggml_tensor * sinks,   // [n_head_q]
+            ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale) const;
 
     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
 
@@ -699,50 +699,39 @@ struct llm_graph_context {
             ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
             ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
                     int   il) const;
 
-    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
+    llm_graph_input_attn_kv * build_attn_inp_kv() const;
 
     ggml_tensor * build_attn(
-            llm_graph_input_attn_kv_unified * inp,
+            llm_graph_input_attn_kv * inp,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
             ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
             ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
                     int   il) const;
 
-    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+    llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
 
     // note: if k_cur or v_cur are not provided, they will not be stored in the memory
     ggml_tensor * build_attn(
-            llm_graph_input_attn_kv_unified_iswa * inp,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
-            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-                  float   kq_scale,
-                    int   il) const;
-
-    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
-    ggml_tensor * build_attn_with_sinks(
-            llm_graph_input_attn_kv_unified_iswa * inp,
+            llm_graph_input_attn_kv_iswa * inp,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
             ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
             ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
             ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
                     int   il) const;
 
@@ -756,6 +745,7 @@ struct llm_graph_context {
             ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
             ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
                     int   il) const;
@@ -765,7 +755,7 @@ struct llm_graph_context {
     //
 
     // TODO: move this implementation to llama_memory_recurrent.
-    //       this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
+    //       this is analogous to llama_kv_cache::cpy_k / cpy_v
     //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
     //       implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
     //       `llama_memory_recurrent`
```
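
The `sinks` tensor ([n_head_q]) added to every `build_attn` overload carries one attention-sink logit per query head. As a toy illustration of the general attention-sink idea (not ggml code): the sink joins the softmax denominator but contributes no value row, so it only scales down the weights placed on real tokens:

```cpp
// Toy illustration of an attention sink (not ggml): the per-head sink
// logit joins the softmax normalization but has no value vector, so it
// only reduces the probability mass assigned to real tokens.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<float> softmax_with_sink(const std::vector<float> & scores, float sink) {
    float mx = sink;
    for (float s : scores) mx = std::max(mx, s);

    double den = std::exp(sink - mx);               // sink enters the denominator...
    for (float s : scores) den += std::exp(s - mx);

    std::vector<float> w(scores.size());
    for (size_t i = 0; i < scores.size(); ++i) {
        w[i] = (float) (std::exp(scores[i] - mx) / den);  // ...but emits no output row
    }
    return w;                                       // sums to less than 1 by design
}

int main() {
    for (float v : softmax_with_sink({1.0f, 2.0f, 3.0f}, /*sink=*/2.0f)) {
        std::printf("%.3f\n", v);
    }
    return 0;
}
```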
package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp}:

```diff
@@ -1,4 +1,4 @@
-#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache-iswa.h"
 
 #include "llama-impl.h"
 #include "llama-batch.h"
@@ -8,10 +8,10 @@
 #include <cassert>
 
 //
-// llama_kv_cache_unified_iswa
+// llama_kv_cache_iswa
 //
 
-llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
+llama_kv_cache_iswa::llama_kv_cache_iswa(
         const llama_model & model,
                 ggml_type   type_k,
                 ggml_type   type_v,
@@ -23,8 +23,8 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
                  uint32_t   n_seq_max,
                  uint32_t   n_ubatch,
                  uint32_t   n_pad) : hparams(model.hparams), unified(unified) {
-    llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
-    llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
+    llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
+    llama_kv_cache::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t size_base = kv_size;
 
@@ -40,25 +40,25 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
 
-    kv_base = std::make_unique<llama_kv_cache_unified>(
+    kv_base = std::make_unique<llama_kv_cache>(
             model, std::move(filter_base), type_k, type_v,
             v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
-    kv_swa = std::make_unique<llama_kv_cache_unified>(
+    kv_swa = std::make_unique<llama_kv_cache>(
             model, std::move(filter_swa), type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type);
 }
 
-void llama_kv_cache_unified_iswa::clear(bool data) {
+void llama_kv_cache_iswa::clear(bool data) {
     kv_base->clear(data);
     kv_swa ->clear(data);
 }
 
-bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
     bool res = true;
 
     res = res & kv_base->seq_rm(seq_id, p0, p1);
@@ -67,36 +67,36 @@ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llam
     return res;
 }
 
-void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
     kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
     kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
 }
 
-void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
+void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
     kv_base->seq_keep(seq_id);
     kv_swa ->seq_keep(seq_id);
 }
 
-void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
     kv_base->seq_add(seq_id, p0, p1, shift);
     kv_swa ->seq_add(seq_id, p0, p1, shift);
 }
 
-void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
     kv_base->seq_div(seq_id, p0, p1, d);
     kv_swa ->seq_div(seq_id, p0, p1, d);
 }
 
-llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
     // the base cache is a superset of the SWA cache, so we can just check the SWA cache
     return kv_swa->seq_pos_min(seq_id);
 }
 
-llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);
 
     // first try simple split
@@ -136,7 +136,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
 
         assert(sinfos_base.size() == sinfos_swa.size());
 
-        return std::make_unique<llama_kv_cache_unified_iswa_context>(
+        return std::make_unique<llama_kv_cache_iswa_context>(
                 this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
     } while (false);
 
@@ -172,29 +172,29 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
 
         assert(sinfos_base.size() == sinfos_swa.size());
 
-        return std::make_unique<llama_kv_cache_unified_iswa_context>(
+        return std::make_unique<llama_kv_cache_iswa_context>(
                 this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
     } while (false);
 
     // TODO: if we fail again, we should attempt different splitting strategies
     //       but to do that properly, we first have to refactor the batches to be more flexible
 
-    return std::make_unique<llama_kv_cache_unified_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+    return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified_iswa::init_full() {
-    return std::make_unique<llama_kv_cache_unified_iswa_context>(this);
+llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
+    return std::make_unique<llama_kv_cache_iswa_context>(this);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
-    return std::make_unique<llama_kv_cache_unified_iswa_context>(this, lctx, optimize);
+llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
 }
 
-bool llama_kv_cache_unified_iswa::get_can_shift() const {
+bool llama_kv_cache_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }
 
-void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
         kv_base->state_write(io, seq_id, flags);
     }
@@ -202,7 +202,7 @@ void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_i
     kv_swa->state_write(io, seq_id, flags);
 }
 
-void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
         kv_base->state_read(io, seq_id, flags);
     }
@@ -210,29 +210,29 @@ void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id
     kv_swa->state_read(io, seq_id, flags);
 }
 
-llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
+llama_kv_cache * llama_kv_cache_iswa::get_base() const {
     return kv_base.get();
 }
 
-llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
+llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
     return kv_swa.get();
 }
 
 //
-// llama_kv_cache_unified_iswa_context
+// llama_kv_cache_iswa_context
 //
 
-llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(llama_memory_status status) : status(status) {}
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {}
 
-llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
-        llama_kv_cache_unified_iswa * kv) :
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+        llama_kv_cache_iswa * kv) :
     ctx_base(kv->get_base()->init_full()),
     ctx_swa (kv->get_swa ()->init_full()),
     status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
 }
 
-llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
-        llama_kv_cache_unified_iswa * kv,
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+        llama_kv_cache_iswa * kv,
         llama_context * lctx,
         bool optimize) :
     ctx_base(kv->get_base()->init_update(lctx, optimize)),
@@ -240,21 +240,21 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
     status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
 }
 
-llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
-        llama_kv_cache_unified_iswa * kv,
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+        llama_kv_cache_iswa * kv,
         slot_info_vec_t sinfos_base,
        slot_info_vec_t sinfos_swa,
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
-    ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
+    ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
+    ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
     status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
 }
 
-llama_kv_cache_unified_iswa_context::~llama_kv_cache_unified_iswa_context() = default;
+llama_kv_cache_iswa_context::~llama_kv_cache_iswa_context() = default;
 
-bool llama_kv_cache_unified_iswa_context::next() {
+bool llama_kv_cache_iswa_context::next() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     ctx_base->next();
@@ -267,7 +267,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
     return true;
 }
 
-bool llama_kv_cache_unified_iswa_context::apply() {
+bool llama_kv_cache_iswa_context::apply() {
     assert(!llama_memory_status_is_fail(status));
 
     bool res = true;
@@ -278,24 +278,24 @@ bool llama_kv_cache_unified_iswa_context::apply() {
     return res;
 }
 
-llama_memory_status llama_kv_cache_unified_iswa_context::get_status() const {
+llama_memory_status llama_kv_cache_iswa_context::get_status() const {
     return status;
 }
 
-const llama_ubatch & llama_kv_cache_unified_iswa_context::get_ubatch() const {
+const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     return ubatches[i_next];
 }
 
-const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_base() const {
+const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
-    return static_cast<const llama_kv_cache_unified_context *>(ctx_base.get());
+    return static_cast<const llama_kv_cache_context *>(ctx_base.get());
 }
 
-const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_swa() const {
+const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
-    return static_cast<const llama_kv_cache_unified_context *>(ctx_swa.get());
+    return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
 }
```
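
Note the `LLAMA_STATE_SEQ_FLAGS_SWA_ONLY` gating in `state_write`/`state_read` above: the base (full-context) cache is serialized only when the flag is clear, while the SWA cache is always written. A stand-in sketch of that gating (hypothetical types and flag value; only the bit test mirrors the real code):

```cpp
// Stand-in sketch of the SWA-only state gating (hypothetical types; the
// bit test mirrors llama_kv_cache_iswa::state_write above).
#include <cstdint>

using state_seq_flags = uint32_t;
// assumed flag value for illustration; the real constant is
// LLAMA_STATE_SEQ_FLAGS_SWA_ONLY in llama.h
constexpr state_seq_flags STATE_SEQ_FLAGS_SWA_ONLY = 1u << 0;

struct cache { void state_write() { /* serialize cells */ } };

void iswa_state_write(cache & base, cache & swa, state_seq_flags flags) {
    if ((flags & STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
        base.state_write();  // full-context cache: skipped when SWA-only
    }
    swa.state_write();       // SWA cache: always written
}
```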
package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h}:

```diff
@@ -1,32 +1,32 @@
 #pragma once
 
-#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache.h"
 
 #include <vector>
 
 //
-// llama_kv_cache_unified_iswa
+// llama_kv_cache_iswa
 //
 
-// utilizes two instances of llama_kv_cache_unified
+// utilizes two instances of llama_kv_cache
 // the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
 
-class llama_kv_cache_unified_iswa : public llama_memory_i {
+class llama_kv_cache_iswa : public llama_memory_i {
 public:
-    llama_kv_cache_unified_iswa(
+    llama_kv_cache_iswa(
             const llama_model & model,
                     ggml_type   type_k,
                     ggml_type   type_v,
                          bool   v_trans,
                          bool   offload,
                          bool   swa_full,
-                         bool   unified,
+                         bool   ,
                      uint32_t   kv_size,
                      uint32_t   n_seq_max,
                      uint32_t   n_ubatch,
                      uint32_t   n_pad);
 
-    ~llama_kv_cache_unified_iswa() = default;
+    ~llama_kv_cache_iswa() = default;
 
     //
     // llama_memory_i
@@ -60,46 +60,46 @@ public:
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
-    // llama_kv_cache_unified_iswa specific API
+    // llama_kv_cache_iswa specific API
     //
 
-    llama_kv_cache_unified * get_base() const;
-    llama_kv_cache_unified * get_swa () const;
+    llama_kv_cache * get_base() const;
+    llama_kv_cache * get_swa () const;
 
 private:
     const llama_hparams & hparams;
 
     const bool unified;
 
-    std::unique_ptr<llama_kv_cache_unified> kv_base;
-    std::unique_ptr<llama_kv_cache_unified> kv_swa;
+    std::unique_ptr<llama_kv_cache> kv_base;
+    std::unique_ptr<llama_kv_cache> kv_swa;
 };
 
-class llama_kv_cache_unified_iswa_context : public llama_memory_context_i {
+class llama_kv_cache_iswa_context : public llama_memory_context_i {
 public:
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
 
     // used for errors
-    llama_kv_cache_unified_iswa_context(llama_memory_status status);
+    llama_kv_cache_iswa_context(llama_memory_status status);
 
     // used to create a full-cache context
-    llama_kv_cache_unified_iswa_context(
-            llama_kv_cache_unified_iswa * kv);
+    llama_kv_cache_iswa_context(
+            llama_kv_cache_iswa * kv);
 
     // used to create an update context
-    llama_kv_cache_unified_iswa_context(
-            llama_kv_cache_unified_iswa * kv,
+    llama_kv_cache_iswa_context(
+            llama_kv_cache_iswa * kv,
             llama_context * lctx,
             bool optimize);
 
     // used to create a batch processing context from a batch
-    llama_kv_cache_unified_iswa_context(
-            llama_kv_cache_unified_iswa * kv,
+    llama_kv_cache_iswa_context(
+            llama_kv_cache_iswa * kv,
             slot_info_vec_t sinfos_base,
             slot_info_vec_t sinfos_swa,
             std::vector<llama_ubatch> ubatches);
 
-    virtual ~llama_kv_cache_unified_iswa_context();
+    virtual ~llama_kv_cache_iswa_context();
 
     //
     // llama_memory_context_i
@@ -112,14 +112,14 @@ public:
     const llama_ubatch & get_ubatch() const override;
 
     //
-    // llama_kv_cache_unified_iswa_context specific API
+    // llama_kv_cache_iswa_context specific API
     //
 
-    const llama_kv_cache_unified_context * get_base() const;
-    const llama_kv_cache_unified_context * get_swa() const;
+    const llama_kv_cache_context * get_base() const;
+    const llama_kv_cache_context * get_swa() const;
 
 private:
-    //llama_kv_cache_unified_iswa * kv;
+    //llama_kv_cache_iswa * kv;
 
     // the index of the next ubatch to process
     size_t i_next = 0;
```