@fugood/llama.node 1.2.0-rc.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/chat.cpp +139 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -0
- package/src/llama.cpp/src/llama-arch.cpp +1 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -3
- package/src/llama.cpp/src/llama-graph.cpp +3 -2
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
- package/src/llama.cpp/src/llama-kv-cache.h +8 -0
- package/src/llama.cpp/src/llama-model.cpp +58 -96
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama.cpp +53 -10
|
@@ -483,11 +483,16 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
|
|
|
483
483
|
/**
|
|
484
484
|
* @see https://github.com/ggml-org/llama.cpp/pull/14037
|
|
485
485
|
*/
|
|
486
|
-
inline static float
|
|
486
|
+
inline static float vec_hsum_f32x4(float32x4_t v) {
|
|
487
487
|
float32x4_t v_temp = v + vec_reve(v);
|
|
488
488
|
return v_temp[0] + v_temp[1];
|
|
489
489
|
}
|
|
490
490
|
|
|
491
|
+
inline static int32_t vec_hsum_i32x4(int32x4_t v) {
|
|
492
|
+
int32x4_t v_temp = v + vec_reve(v);
|
|
493
|
+
return v_temp[0] + v_temp[1];
|
|
494
|
+
}
|
|
495
|
+
|
|
491
496
|
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
|
492
497
|
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
|
|
493
498
|
return acc + (vec_unpackh(p) + vec_unpackl(p));
|
|
@@ -373,6 +373,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|
|
373
373
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
|
374
374
|
.nrows = 1,
|
|
375
375
|
},
|
|
376
|
+
[GGML_TYPE_I32] = {
|
|
377
|
+
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
|
|
378
|
+
},
|
|
376
379
|
};
|
|
377
380
|
|
|
378
381
|
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
|
|
@@ -2696,7 +2699,10 @@ struct ggml_cplan ggml_graph_plan(
|
|
|
2696
2699
|
if (ggml_is_quantized(node->type) ||
|
|
2697
2700
|
// F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
|
|
2698
2701
|
(node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
|
|
2699
|
-
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)
|
|
2702
|
+
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
|
|
2703
|
+
// conversion between F32 and I32
|
|
2704
|
+
(node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
|
|
2705
|
+
(node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
|
|
2700
2706
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
|
2701
2707
|
}
|
|
2702
2708
|
} break;
|
|
@@ -3258,6 +3264,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
|
|
|
3258
3264
|
}
|
|
3259
3265
|
}
|
|
3260
3266
|
|
|
3267
|
+
void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
|
|
3268
|
+
int64_t i = 0;
|
|
3269
|
+
for (; i < n; ++i) {
|
|
3270
|
+
y[i] = x[i];
|
|
3271
|
+
}
|
|
3272
|
+
}
|
|
3273
|
+
|
|
3261
3274
|
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
|
|
3262
3275
|
int64_t i = 0;
|
|
3263
3276
|
#if defined(__AVX2__)
|
|
@@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
|
|
|
190
190
|
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
|
191
191
|
/* .event_record = */ NULL,
|
|
192
192
|
/* .event_wait = */ NULL,
|
|
193
|
+
/* .optimize_graph = */ NULL,
|
|
193
194
|
};
|
|
194
195
|
|
|
195
196
|
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
@@ -515,9 +515,6 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
|
|
515
515
|
op->src[0]->buffer &&
|
|
516
516
|
(ggml_n_dims(op->src[0]) == 2) &&
|
|
517
517
|
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
|
|
518
|
-
if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
|
|
519
|
-
return false;
|
|
520
|
-
}
|
|
521
518
|
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
|
522
519
|
return false;
|
|
523
520
|
}
|
|
@@ -776,6 +776,24 @@ static void ggml_compute_forward_dup_f32(
|
|
|
776
776
|
id += ne00 * (ne01 - ir1);
|
|
777
777
|
}
|
|
778
778
|
}
|
|
779
|
+
} else if (dst->type == GGML_TYPE_I32) {
|
|
780
|
+
size_t id = 0;
|
|
781
|
+
int32_t * dst_ptr = (int32_t *) dst->data;
|
|
782
|
+
|
|
783
|
+
for (int i03 = 0; i03 < ne03; i03++) {
|
|
784
|
+
for (int i02 = 0; i02 < ne02; i02++) {
|
|
785
|
+
id += ne00 * ir0;
|
|
786
|
+
for (int i01 = ir0; i01 < ir1; i01++) {
|
|
787
|
+
for (int i00 = 0; i00 < ne00; i00++) {
|
|
788
|
+
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
789
|
+
|
|
790
|
+
dst_ptr[id] = *src0_ptr;
|
|
791
|
+
id++;
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
id += ne00 * (ne01 - ir1);
|
|
795
|
+
}
|
|
796
|
+
}
|
|
779
797
|
} else {
|
|
780
798
|
GGML_ABORT("fatal error"); // TODO: implement
|
|
781
799
|
}
|
|
@@ -947,6 +965,144 @@ static void ggml_compute_forward_dup_f32(
|
|
|
947
965
|
}
|
|
948
966
|
}
|
|
949
967
|
}
|
|
968
|
+
} else if (dst->type == GGML_TYPE_I32) {
|
|
969
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
970
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
971
|
+
i10 += ne00 * ir0;
|
|
972
|
+
while (i10 >= ne0) {
|
|
973
|
+
i10 -= ne0;
|
|
974
|
+
if (++i11 == ne1) {
|
|
975
|
+
i11 = 0;
|
|
976
|
+
if (++i12 == ne2) {
|
|
977
|
+
i12 = 0;
|
|
978
|
+
if (++i13 == ne3) {
|
|
979
|
+
i13 = 0;
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
}
|
|
984
|
+
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
|
985
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
986
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
987
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
|
988
|
+
|
|
989
|
+
*(int32_t *) dst_ptr = *(const float *) src0_ptr;
|
|
990
|
+
|
|
991
|
+
if (++i10 == ne0) {
|
|
992
|
+
i10 = 0;
|
|
993
|
+
if (++i11 == ne1) {
|
|
994
|
+
i11 = 0;
|
|
995
|
+
if (++i12 == ne2) {
|
|
996
|
+
i12 = 0;
|
|
997
|
+
if (++i13 == ne3) {
|
|
998
|
+
i13 = 0;
|
|
999
|
+
}
|
|
1000
|
+
}
|
|
1001
|
+
}
|
|
1002
|
+
}
|
|
1003
|
+
}
|
|
1004
|
+
}
|
|
1005
|
+
i10 += ne00 * (ne01 - ir1);
|
|
1006
|
+
while (i10 >= ne0) {
|
|
1007
|
+
i10 -= ne0;
|
|
1008
|
+
if (++i11 == ne1) {
|
|
1009
|
+
i11 = 0;
|
|
1010
|
+
if (++i12 == ne2) {
|
|
1011
|
+
i12 = 0;
|
|
1012
|
+
if (++i13 == ne3) {
|
|
1013
|
+
i13 = 0;
|
|
1014
|
+
}
|
|
1015
|
+
}
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
1018
|
+
}
|
|
1019
|
+
}
|
|
1020
|
+
} else {
|
|
1021
|
+
GGML_ABORT("fatal error"); // TODO: implement
|
|
1022
|
+
}
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
static void ggml_compute_forward_dup_i32(
|
|
1026
|
+
const ggml_compute_params * params,
|
|
1027
|
+
ggml_tensor * dst) {
|
|
1028
|
+
|
|
1029
|
+
const ggml_tensor * src0 = dst->src[0];
|
|
1030
|
+
|
|
1031
|
+
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
|
1032
|
+
|
|
1033
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
|
1034
|
+
|
|
1035
|
+
const int ith = params->ith; // thread index
|
|
1036
|
+
const int nth = params->nth; // number of threads
|
|
1037
|
+
|
|
1038
|
+
// parallelize by rows
|
|
1039
|
+
const int nr = ne01;
|
|
1040
|
+
// number of rows per thread
|
|
1041
|
+
const int dr = (nr + nth - 1) / nth;
|
|
1042
|
+
// row range for this thread
|
|
1043
|
+
const int ir0 = dr * ith;
|
|
1044
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
|
1045
|
+
|
|
1046
|
+
// dst counters
|
|
1047
|
+
|
|
1048
|
+
int64_t i10 = 0;
|
|
1049
|
+
int64_t i11 = 0;
|
|
1050
|
+
int64_t i12 = 0;
|
|
1051
|
+
int64_t i13 = 0;
|
|
1052
|
+
|
|
1053
|
+
// TODO: not optimal, but works
|
|
1054
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
1055
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
1056
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
1057
|
+
i10 += ne00 * ir0;
|
|
1058
|
+
while (i10 >= ne0) {
|
|
1059
|
+
i10 -= ne0;
|
|
1060
|
+
if (++i11 == ne1) {
|
|
1061
|
+
i11 = 0;
|
|
1062
|
+
if (++i12 == ne2) {
|
|
1063
|
+
i12 = 0;
|
|
1064
|
+
if (++i13 == ne3) {
|
|
1065
|
+
i13 = 0;
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
1070
|
+
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
|
1071
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
1072
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
1073
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
|
1074
|
+
|
|
1075
|
+
*(float *) dst_ptr = *(const int32_t *) src0_ptr;
|
|
1076
|
+
|
|
1077
|
+
if (++i10 == ne0) {
|
|
1078
|
+
i10 = 0;
|
|
1079
|
+
if (++i11 == ne1) {
|
|
1080
|
+
i11 = 0;
|
|
1081
|
+
if (++i12 == ne2) {
|
|
1082
|
+
i12 = 0;
|
|
1083
|
+
if (++i13 == ne3) {
|
|
1084
|
+
i13 = 0;
|
|
1085
|
+
}
|
|
1086
|
+
}
|
|
1087
|
+
}
|
|
1088
|
+
}
|
|
1089
|
+
}
|
|
1090
|
+
}
|
|
1091
|
+
i10 += ne00 * (ne01 - ir1);
|
|
1092
|
+
while (i10 >= ne0) {
|
|
1093
|
+
i10 -= ne0;
|
|
1094
|
+
if (++i11 == ne1) {
|
|
1095
|
+
i11 = 0;
|
|
1096
|
+
if (++i12 == ne2) {
|
|
1097
|
+
i12 = 0;
|
|
1098
|
+
if (++i13 == ne3) {
|
|
1099
|
+
i13 = 0;
|
|
1100
|
+
}
|
|
1101
|
+
}
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
}
|
|
1105
|
+
}
|
|
950
1106
|
} else {
|
|
951
1107
|
GGML_ABORT("fatal error"); // TODO: implement
|
|
952
1108
|
}
|
|
@@ -1177,6 +1333,10 @@ void ggml_compute_forward_dup(
|
|
|
1177
1333
|
{
|
|
1178
1334
|
ggml_compute_forward_dup_f32(params, dst);
|
|
1179
1335
|
} break;
|
|
1336
|
+
case GGML_TYPE_I32:
|
|
1337
|
+
{
|
|
1338
|
+
ggml_compute_forward_dup_i32(params, dst);
|
|
1339
|
+
} break;
|
|
1180
1340
|
default:
|
|
1181
1341
|
{
|
|
1182
1342
|
if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
|
|
@@ -8438,6 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32(
|
|
|
8438
8598
|
embed_data[j + half] = sinf(arg);
|
|
8439
8599
|
}
|
|
8440
8600
|
if (dim % 2 != 0 && ith == 0) {
|
|
8601
|
+
embed_data[2 * half] = 0.f;
|
|
8441
8602
|
embed_data[dim] = 0.f;
|
|
8442
8603
|
}
|
|
8443
8604
|
}
|
|
@@ -137,6 +137,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
137
137
|
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
|
138
138
|
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
|
139
139
|
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
|
140
|
+
{ LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
|
|
140
141
|
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
|
|
141
142
|
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
|
|
142
143
|
{ LLM_KV_SWIN_NORM, "%s.swin_norm" },
|
|
@@ -285,8 +285,8 @@ llama_context::llama_context(
|
|
|
285
285
|
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
|
286
286
|
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
|
287
287
|
|
|
288
|
-
// avoid reserving graphs with zero outputs
|
|
289
|
-
n_outputs =
|
|
288
|
+
// avoid reserving graphs with zero outputs - assume one output per sequence
|
|
289
|
+
n_outputs = n_seqs;
|
|
290
290
|
|
|
291
291
|
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
|
292
292
|
|
|
@@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute(
|
|
|
1447
1447
|
if (backend_cpu != nullptr) {
|
|
1448
1448
|
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
|
|
1449
1449
|
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
|
|
1450
|
-
set_threadpool_fn
|
|
1450
|
+
if (set_threadpool_fn) {
|
|
1451
|
+
set_threadpool_fn(backend_cpu, tp);
|
|
1452
|
+
}
|
|
1451
1453
|
}
|
|
1452
1454
|
|
|
1453
1455
|
// set the number of threads for all the backends
|
|
@@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|
|
1273
1273
|
// split the batch into streams if needed
|
|
1274
1274
|
const auto n_stream = k->ne[3];
|
|
1275
1275
|
|
|
1276
|
-
q =
|
|
1276
|
+
q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
|
|
1277
1277
|
|
|
1278
1278
|
q = ggml_permute(ctx0, q, 0, 2, 1, 3);
|
|
1279
1279
|
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
|
|
@@ -1431,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
|
1431
1431
|
|
|
1432
1432
|
// [TAG_NO_CACHE_PAD]
|
|
1433
1433
|
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
|
|
1434
|
-
|
|
1434
|
+
// but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
|
|
1435
|
+
//assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
|
|
1435
1436
|
|
|
1436
1437
|
ggml_tensor * q = q_cur;
|
|
1437
1438
|
ggml_tensor * k = k_cur;
|
|
@@ -159,6 +159,7 @@ struct llama_hparams {
|
|
|
159
159
|
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
|
160
160
|
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
|
161
161
|
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
|
162
|
+
uint32_t dec_n_layer = 0;
|
|
162
163
|
|
|
163
164
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
164
165
|
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
|
@@ -1018,16 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
|
|
|
1018
1018
|
|
|
1019
1019
|
const int32_t ikv = map_layer_ids.at(il);
|
|
1020
1020
|
|
|
1021
|
-
|
|
1021
|
+
ggml_tensor * k = layers[ikv].k;
|
|
1022
|
+
|
|
1023
|
+
const int64_t n_embd_head = k_cur->ne[0];
|
|
1024
|
+
const int64_t n_head = k_cur->ne[1];
|
|
1025
|
+
const int64_t n_tokens = k_cur->ne[2];
|
|
1026
|
+
|
|
1027
|
+
const int64_t n_embd_gqa = n_embd_head*n_head;
|
|
1022
1028
|
|
|
1023
|
-
|
|
1029
|
+
// we can merge dims 0 and 1
|
|
1030
|
+
// TODO: add ggml helper function for this?
|
|
1031
|
+
GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);
|
|
1024
1032
|
|
|
1025
|
-
k_cur =
|
|
1033
|
+
k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);
|
|
1026
1034
|
|
|
1027
|
-
|
|
1028
|
-
|
|
1035
|
+
const int64_t n_stream = k->ne[2];
|
|
1036
|
+
|
|
1037
|
+
if (n_stream > 1) {
|
|
1038
|
+
const int64_t kv_size = get_size();
|
|
1039
|
+
|
|
1040
|
+
assert(n_embd_gqa == k->ne[0]);
|
|
1041
|
+
assert(kv_size == k->ne[1]);
|
|
1042
|
+
|
|
1043
|
+
// merge the buffer across all streams because the idxs are global
|
|
1044
|
+
k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
|
|
1029
1045
|
}
|
|
1030
1046
|
|
|
1047
|
+
// store the current K values into the cache
|
|
1031
1048
|
return ggml_set_rows(ctx, k, k_cur, k_idxs);
|
|
1032
1049
|
}
|
|
1033
1050
|
|
|
@@ -1038,28 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
|
|
|
1038
1055
|
|
|
1039
1056
|
auto * v = layers[ikv].v;
|
|
1040
1057
|
|
|
1041
|
-
const int64_t
|
|
1042
|
-
const int64_t
|
|
1058
|
+
const int64_t n_embd_head = v_cur->ne[0];
|
|
1059
|
+
const int64_t n_head = v_cur->ne[1];
|
|
1060
|
+
const int64_t n_tokens = v_cur->ne[2];
|
|
1061
|
+
|
|
1062
|
+
const int64_t n_embd_gqa = n_embd_head*n_head;
|
|
1043
1063
|
|
|
1044
|
-
|
|
1064
|
+
// we can merge dims 0 and 1
|
|
1065
|
+
GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
|
|
1045
1066
|
|
|
1067
|
+
const int64_t n_stream = v->ne[2];
|
|
1068
|
+
|
|
1069
|
+
// take this branch when FA is enabled (the V cache is not transposed)
|
|
1046
1070
|
if (!v_trans) {
|
|
1047
|
-
|
|
1048
|
-
|
|
1071
|
+
v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
|
|
1072
|
+
|
|
1073
|
+
if (n_stream > 1) {
|
|
1074
|
+
const int64_t kv_size = get_size();
|
|
1075
|
+
|
|
1076
|
+
assert(n_embd_gqa == v->ne[0]);
|
|
1077
|
+
assert(kv_size == v->ne[1]);
|
|
1078
|
+
|
|
1079
|
+
// merge the buffer across all streams because the idxs are global
|
|
1080
|
+
v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
|
|
1049
1081
|
}
|
|
1050
1082
|
|
|
1051
1083
|
return ggml_set_rows(ctx, v, v_cur, v_idxs);
|
|
1052
1084
|
}
|
|
1053
1085
|
|
|
1086
|
+
if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
|
|
1087
|
+
// we can merge dims 0, 1 and 2
|
|
1088
|
+
v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
|
|
1089
|
+
} else {
|
|
1090
|
+
// otherwise -> make a copy to get contiguous data
|
|
1091
|
+
v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
|
|
1092
|
+
}
|
|
1093
|
+
|
|
1054
1094
|
// [TAG_V_CACHE_VARIABLE]
|
|
1055
|
-
if (
|
|
1056
|
-
v_cur = ggml_pad(ctx, v_cur, v->ne[0] -
|
|
1095
|
+
if (n_embd_gqa < v->ne[0]) {
|
|
1096
|
+
v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
|
|
1057
1097
|
}
|
|
1058
1098
|
|
|
1059
|
-
// the row
|
|
1060
|
-
ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v
|
|
1099
|
+
// in this branch the v_idxs are constructed in such a way that each row is a single head element
|
|
1100
|
+
ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));
|
|
1061
1101
|
|
|
1062
|
-
v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur
|
|
1102
|
+
v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));
|
|
1063
1103
|
|
|
1064
1104
|
return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
|
|
1065
1105
|
}
|
|
@@ -317,9 +317,17 @@ public:
|
|
|
317
317
|
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
|
318
318
|
|
|
319
319
|
// store k_cur and v_cur in the cache based on the provided head location
|
|
320
|
+
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
|
|
321
|
+
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
|
|
322
|
+
// - k_idxs [n_tokens]
|
|
323
|
+
// - v_cur [n_embd_head_v, n_head_v, n_tokens]
|
|
324
|
+
// - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending on whether the V cache is transposed
|
|
320
325
|
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
|
|
321
326
|
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
|
|
322
327
|
|
|
328
|
+
// create destination indices for each head of the current batch for where it would be written in the KV cache
|
|
329
|
+
// the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
|
|
330
|
+
// helps understand the implementation logic of cpy_k and cpy_v
|
|
323
331
|
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
|
324
332
|
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
|
325
333
|
|