@fugood/llama.node 1.2.2 → 1.2.4
This diff shows the changes between these publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +423 -186
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +154 -13
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +23 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/common/sampling.cpp +1 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +18 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +27 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
- package/src/llama.cpp/include/llama.h +23 -11
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +157 -0
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +10 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +10 -5
- package/src/llama.cpp/src/llama-kv-cache.h +2 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +19 -3
- package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +582 -45
- package/src/llama.cpp/src/llama-model.h +23 -1
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0
package/src/llama.cpp/src/llama-context.cpp

@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //

@@ -2331,6 +2346,12 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
 
+    if (params.pooling_type != model->hparams.pooling_type) {
+        //user-specified pooling-type is different from the model default
+        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+            model->hparams.pooling_type, params.pooling_type);
+    }
+
     try {
         auto * ctx = new llama_context(*model, params);
         return ctx;

@@ -2765,6 +2786,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            " - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            " - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
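For orientation, the hunks above add an internal `llama_context::memory_breakdown()` map plus a `llama_memory_breakdown_print` entry point in the bundled llama.cpp. The following is an illustrative sketch only (not part of the diff) of how an application built against this llama.cpp revision might call it; it assumes the usual llama.cpp C API loaders and uses a hypothetical model path.

// Illustrative sketch, not part of the package. Assumes the standard llama.cpp
// C API entry points; "model.gguf" is a hypothetical path.
#include "llama.h"

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_context * ctx = llama_init_from_model(model, cparams);

    // New in the bundled llama.cpp: logs a per-device table with
    // total/free and self = model + context + compute columns (in MiB).
    llama_memory_breakdown_print(ctx);

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}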
package/src/llama.cpp/src/llama-context.h

@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(

@@ -144,6 +152,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
package/src/llama.cpp/src/llama-graph.cpp

@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     std::vector<int> target_pos(n_seqs_unq, -1);
     std::vector<int> target_row(n_seqs_unq, -1);
 
-    bool last =
+    const bool last = (
+        cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+        (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+    );
 
     for (int i = 0; i < n_tokens; ++i) {
         const llama_pos pos = ubatch->pos[i];

@@ -920,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         selection_probs = logits;
     }
 
+    if (arch == LLM_ARCH_GROVEMOE) {
+        selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
    cb(selected_experts, "ffn_moe_topk", il);
 
-
-
+    if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
+        // TODO: Use scalar div instead when/if implemented
+        ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
+        selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
+        probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
+    } else {
+        probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
+    }
+
+    ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
         weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]

@@ -952,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(weights, "ffn_moe_weights_scaled", il);
     }
 
+    //call early so that topk-moe can be used
+    ggml_build_forward_expand(gf, weights);
+
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     if (weight_before_ffn) {

@@ -1177,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_cls() const {
-    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
+    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
 
     auto & cur = inp->cls;
 

@@ -1833,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
 
+void llm_graph_context::build_dense_out(
+        ggml_tensor * dense_2,
+        ggml_tensor * dense_3) const {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+        return;
+    }
+    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    cb(cur, "result_embd_pooled", -1);
+    res->t_embd_pooled = cur;
+    ggml_build_forward_expand(gf, cur);
+}
+
+
 void llm_graph_context::build_pooling(
         ggml_tensor * cls,
         ggml_tensor * cls_b,

@@ -1877,34 +1914,32 @@ void llm_graph_context::build_pooling(
         case LLAMA_POOLING_TYPE_RANK:
             {
                 ggml_tensor * inp_cls = build_inp_cls();
-
+                cur = ggml_get_rows(ctx0, inp, inp_cls);
 
+                // classification head
+                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
                 if (cls) {
-
-                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                    cur = ggml_mul_mat(ctx0, cls, inp);
+                    cur = ggml_mul_mat(ctx0, cls, cur);
                     if (cls_b) {
                         cur = ggml_add(ctx0, cur, cls_b);
                     }
                     cur = ggml_tanh(ctx0, cur);
+                }
 
-
-
-
-
-
-
-                    }
-                }
-                } else if (cls_out) {
-                    // Single layer classification head (direct projection)
-                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-                    cur = ggml_mul_mat(ctx0, cls_out, inp);
+                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                // Single layer classification head (direct projection)
+                // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
+                if (cls_out) {
+                    cur = ggml_mul_mat(ctx0, cls_out, cur);
                     if (cls_out_b) {
                         cur = ggml_add(ctx0, cur, cls_out_b);
                     }
-                }
-
+                }
+
+                // softmax for qwen3 reranker
+                if (arch == LLM_ARCH_QWEN3) {
+                    cur = ggml_soft_max(ctx0, cur);
                 }
             } break;
         default:
package/src/llama.cpp/src/llama-graph.h

@@ -206,7 +206,7 @@ public:
 
 class llm_graph_input_cls : public llm_graph_input_i {
 public:
-    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
     virtual ~llm_graph_input_cls() = default;
 
     void set_input(const llama_ubatch * ubatch) override;

@@ -214,6 +214,7 @@ public:
     ggml_tensor * cls; // I32 [n_batch]
 
     const llama_cparams cparams;
+    const llm_arch arch;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {

@@ -813,6 +814,14 @@ struct llm_graph_context {
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
+
+    //
+    // dense (out)
+    //
+
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
 };
 
 // TODO: better name
package/src/llama.cpp/src/llama-hparams.h

@@ -42,7 +42,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
-
+    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head

@@ -69,10 +69,13 @@ struct llama_hparams {
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
     uint32_t n_ff_shexp = 0;
+    uint32_t n_ff_chexp = 0;
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups = 0;
+    uint32_t n_group_experts = 0;
 
-    float
+    float expert_group_scale = 0.05f;
+    float expert_weights_scale = 0.0f;
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;

@@ -166,6 +169,18 @@ struct llama_hparams {
     uint32_t laurel_rank = 64;
     uint32_t n_embd_altup = 256;
 
+    // needed for sentence-transformers dense layers
+    uint32_t dense_2_feat_in = 0; // in_features of the 2_Dense
+    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
+    uint32_t dense_3_feat_in = 0; // in_features of the 3_Dense
+    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense
+
+    // xIELU
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
+
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
package/src/llama.cpp/src/llama-kv-cache-iswa.cpp

@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);
 

@@ -212,7 +220,7 @@ bool llama_kv_cache_iswa::get_can_shift() const {
 }
 
 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags &
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_write(io, seq_id, flags);
     }
 

@@ -220,7 +228,7 @@ void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id
 }
 
 void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags &
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_read(io, seq_id, flags);
     }
 
package/src/llama.cpp/src/llama-kv-cache-iswa.h

@@ -56,6 +56,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }
 
-        ggml_tensor * k;
-        ggml_tensor * v;
-
-        k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
-        v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
 
         ggml_format_name(k, "cache_k_l%d", il);
         ggml_format_name(v, "cache_v_l%d", il);

@@ -473,6 +470,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
package/src/llama.cpp/src/llama-kv-cache.h

@@ -121,6 +121,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -73,7 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }
 
         if (ubatch.n_tokens == 0) {

@@ -166,18 +168,26 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-
-
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
 
-
-
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_write(io, seq_id, flags);
+    }
+    mem_recr->state_write(io, seq_id, flags);
 }
 
 void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-
-
-
-    mem_recr->state_read(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_read(io, seq_id, flags);
+    }
+    mem_recr->state_read(io, seq_id, flags);
 }
 
 llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -68,6 +68,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -136,6 +136,7 @@ void llama_memory_recurrent::clear(bool data) {
 }
 
 bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
     uint32_t new_head = size;
 
     if (p0 < 0) {

@@ -156,7 +157,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
             // partial intersection is invalid
-            if ((0 < p0 && p0
+            if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }
             // invalidate tails which will be cleared

@@ -167,6 +169,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     } else {
         // seq_id is negative, then the range should include everything or nothing
         if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+            //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
             return false;
         }
     }

@@ -359,6 +362,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();

@@ -371,7 +382,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }
 
         if (ubatch.n_tokens == 0) {

@@ -848,9 +861,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
     if (dest_seq_id != -1) {
         // single sequence
-
         seq_rm(dest_seq_id, -1, -1);
 
+        if (cell_count == 0) {
+            return true;
+        }
+
         llama_batch_allocr balloc(hparams.n_pos_per_embd());
 
         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 

@@ -50,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     bool prepare(const std::vector<llama_ubatch> & ubatches);
 
     // find a contiguous slot of memory cells and emplace the ubatch there
package/src/llama.cpp/src/llama-memory.h

@@ -2,6 +2,7 @@
 
 #include "llama.h"
 
+#include <map>
 #include <memory>
 #include <functional>
 

@@ -108,6 +109,8 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
     //
     // state write/read
     //
package/src/llama.cpp/src/llama-model-loader.cpp

@@ -465,6 +465,8 @@ namespace GGUFMeta {
     // TODO: this is not very clever - figure out something better
     template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+
 
 llama_model_loader::llama_model_loader(
         const std::string & fname,