@fugood/llama.node 1.1.6 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaCompletionWorker.cpp +73 -20
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +124 -40
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +312 -9
- package/src/llama.cpp/common/chat.h +4 -1
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +41 -7
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +28 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/include/llama.h +25 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +2 -4
- package/src/llama.cpp/src/llama-context.cpp +29 -17
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +1 -0
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     const int ncols_interleaved = 4;
     const int blocklen = 4;
 
-    assert
-    assert
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
 
     UNUSED(s);
     UNUSED(bs);
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-
-
-    int sumi;
+    float sumf[8];
+    int sumi;
 
-
-
-
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
 
-
-
-
-
-
-
-
-
-
-                }
-                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }
 
@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     const int ncols_interleaved = 4;
     const int blocklen = 4;
 
-    assert
-    assert
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
 
-    UNUSED(s);
     UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
     UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
 
-
-
-    int sumi;
+    float sumf[4];
+    int sumi;
 
-
-
-
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
 
-
-
-
-
-
-
-
-
-
-                }
-                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[8];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }
 
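As a side note on the generic IQ4_NL kernels above: every byte of b_ptr[l].qs packs two 4-bit indices that are looked up in the 16-entry kvalues_iq4nl codebook, the low nibble feeding activation position i and the high nibble position i + qk/2. A minimal standalone sketch of that decode step, using a placeholder codebook (the real kvalues_iq4nl values live in ggml and are not reproduced in this diff):

/* Standalone sketch of the per-byte decode used by the generic IQ4_NL kernels above.
 * The table below is a placeholder; the real codebook is ggml's kvalues_iq4nl. */
#include <stdint.h>
#include <stdio.h>

static const int8_t kvalues_placeholder[16] = {
    -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7,  /* placeholder values */
};

int main(void) {
    const uint8_t qs_byte = 0x93;                        /* packed pair: high nibble 0x9, low nibble 0x3 */
    const int v0 = kvalues_placeholder[qs_byte & 0x0F];  /* low nibble  -> activation position i         */
    const int v1 = kvalues_placeholder[qs_byte >> 4];    /* high nibble -> activation position i + qk/2  */
    printf("v0 = %d, v1 = %d\n", v0, v1);                /* each then multiplies an int8 activation and accumulates */
    return 0;
}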
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }
 
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
 } // extern "C"
 
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
 
 static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     GGML_ASSERT(interleave_block == 4);
 
-
-
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
+
     block_iq4_nl dst_tmp[4];
+
     int nrow = ggml_nrows(t);
     int nrows_interleaved = 4;
-    int nblocks = t->ne[0] /
+    int nblocks = t->ne[0] / QK4_NL;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
 
@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     GGML_UNUSED(data_size);
 }
 
+static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 4 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 8;
+            int src_offset = (i / 8) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 8);
+
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
+
+    block_iq4_nl dst_tmp[8];
+
+    int nrow = ggml_nrows(t);
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_NL;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (t->ne[1] % nrows_interleaved != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 namespace ggml::cpu::repack {
 // repack
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
 // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 //}
 
+template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
 
     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
 
     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
             }
         }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_avx2()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &iq4_nl_8x8_q8_0;
+            }
+        }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
                 return &iq4_nl_4x4_q8_0;
@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {
 
 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
 
+struct block_iq4_nlx8 {
+    ggml_half d[8];           // deltas for 8 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 #if defined(__cplusplus)
 } // extern "C"
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra :
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }
 
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra :
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> &
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
 
 #endif
@@ -870,6 +870,29 @@ extern "C" {
             size_t n_token_capacity,
             size_t * n_token_count_out);
 
+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            size_t size,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            size_t size,
+            llama_seq_id dest_seq_id,
+            llama_state_seq_flags flags);
+
     //
     // Decoding
     //
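The three *_ext declarations above extend the existing llama_state_seq_* calls with a llama_state_seq_flags argument; LLAMA_STATE_SEQ_FLAGS_SWA_ONLY is the only flag defined in this hunk, and its exact semantics are not spelled out in this diff. A minimal usage sketch against these signatures, with error handling omitted:

#include "llama.h"
#include <stdint.h>
#include <stdlib.h>

/* Sketch: serialize one sequence's state with the new flag-aware API and
 * restore it into another sequence. Pass 0 for flags to keep the old behavior. */
static void copy_seq_state(struct llama_context * ctx, llama_seq_id src_seq, llama_seq_id dst_seq,
                           llama_state_seq_flags flags) {
    const size_t size = llama_state_seq_get_size_ext(ctx, src_seq, flags);
    uint8_t * buf = malloc(size);
    if (buf == NULL) {
        return;
    }

    llama_state_seq_get_data_ext(ctx, buf, size, src_seq, flags);
    llama_state_seq_set_data_ext(ctx, buf, size, dst_seq, flags);

    free(buf);
}

Calling copy_seq_state(ctx, 0, 1, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) would restrict the copy to whatever subset of the state the SWA-only flag selects, while flags = 0 reproduces the pre-1.1.7 behavior of the non-_ext wrappers.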
@@ -1437,6 +1460,8 @@ extern "C" {
 
         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };
 
     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
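The new optimizer_type field lets callers choose the optimizer before llama_opt_init; the enum itself comes from the ggml-opt.h changes listed at the top of this diff but not shown in this section. A hedged sketch of wiring it up (GGML_OPT_OPTIMIZER_TYPE_ADAMW is an assumed constant name from that header, and the other llama_opt_params fields are left zero-initialized):

#include "llama.h"

/* Sketch: initialize llama's training context with an explicit optimizer choice.
 * GGML_OPT_OPTIMIZER_TYPE_ADAMW is assumed to be defined by the updated ggml-opt.h;
 * get_opt_pars/get_opt_pars_ud are the caller-supplied callback shown in the hunk above. */
static void init_training(struct llama_context * lctx, struct llama_model * model,
                          ggml_opt_get_optimizer_params get_opt_pars, void * get_opt_pars_ud) {
    struct llama_opt_params lopt_params = {0};

    lopt_params.get_opt_pars    = get_opt_pars;
    lopt_params.get_opt_pars_ud = get_opt_pars_ud;
    lopt_params.optimizer_type  = GGML_OPT_OPTIMIZER_TYPE_ADAMW; /* assumed constant */

    llama_opt_init(lctx, model, lopt_params);
}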
@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
 
 llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
     if (sequential && has_cpl) {
-        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
 
         return {};
     }
@@ -193,11 +193,11 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
-    } else if (tmpl_contains("<|
+    } else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
     } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
         return LLM_CHAT_TEMPLATE_OPENAI_MOE;
-    } else if (tmpl_contains("<|
+    } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
@@ -625,8 +625,6 @@ int32_t llm_chat_apply_template(
     } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
         // Yandex template ("\n\n" is defined as EOT token)
 
-        ss << "<s>";
-
         for (size_t i = 0; i < chat.size(); i++) {
             std::string role(chat[i]->role);
             if (role == "user") {
@@ -1657,30 +1657,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
     }
 }
 
-size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
+size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
     llama_io_write_dummy io;
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
+size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
     llama_io_write_buffer io(dst, size);
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
+size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
     llama_io_read_buffer io(src, size);
     try {
-        return state_seq_read_data(io, seq_id);
+        return state_seq_read_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
         return 0;
@@ -1778,7 +1778,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
     {
         const size_t state_size = file.size() - file.tell();
         llama_io_read_file io(&file);
-        const size_t nread = state_seq_read_data(io, seq_id);
+        const size_t nread = state_seq_read_data(io, seq_id, 0);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;
@@ -1802,7 +1802,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
 
     // save the context state using stream saving
     llama_io_write_file io(&file);
-    state_seq_write_data(io, seq_id);
+    state_seq_write_data(io, seq_id, 0);
 
     const size_t res = file.tell();
     GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
@@ -1971,21 +1971,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_write(io, seq_id);
+        memory->state_write(io, seq_id, flags);
     }
 
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
    if (memory) {
-        memory->state_read(io, seq_id);
+        memory->state_read(io, seq_id, flags);
     }
 
     return io.n_bytes();
@@ -2048,7 +2048,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
     opt_params.opt_period = n_batch / n_ubatch;
     opt_params.get_opt_pars = lopt_params.get_opt_pars;
     opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
-
+    opt_params.optimizer = lopt_params.optimizer_type;
     opt_ctx = ggml_opt_init(opt_params);
 
     llama_opt_param_filter param_filter = lopt_params.param_filter;
@@ -2801,19 +2801,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
 }
 
 size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
-    return ctx
+    return llama_state_seq_get_size_ext(ctx, seq_id, 0);
 }
 
 size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
+}
+
+size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
+}
+
+size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    return ctx->state_seq_get_size(seq_id, flags);
+}
+
+size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_get_data(seq_id, dst, size);
+    return ctx->state_seq_get_data(seq_id, dst, size, flags);
 }
 
-size_t
+size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_set_data(seq_id, src, size);
+    return ctx->state_seq_set_data(seq_id, src, size, flags);
 }
 
 size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
@@ -111,9 +111,9 @@ struct llama_context {
     size_t state_get_data( uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);
 
-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
 
     bool state_load_file(
             const char * filepath,
@@ -152,6 +152,7 @@ struct llama_context {
 
     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
 
+    // TODO: more flexible combinations of logical/physical batch size and context size
    void opt_epoch(
             ggml_opt_dataset_t dataset,
             ggml_opt_result_t result_train,
@@ -212,8 +213,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i & io);
 
-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
 
     //
     // members