@fugood/llama.node 1.1.8 → 1.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +9 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +14 -1
- package/src/llama.cpp/common/arg.cpp +6 -4
- package/src/llama.cpp/common/chat.cpp +34 -3
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +1 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -192
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
- package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
- package/src/llama.cpp/src/llama-memory.h +11 -8
- package/src/llama.cpp/src/llama-model.cpp +396 -187
- package/src/llama.cpp/src/llama-model.h +1 -0
package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -23,6 +23,27 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(__VXE__) || defined(__VXE2__)
+#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
+#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+#define B8(c,s  ) B7(c,s, c), B7(c,s, s)
+
+// precomputed tables for expanding 8bits to 8 bytes:
+static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
+static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
+
+// permute mask for byteswapping
+static const uint8x16_t v_kperm = (const uint8x16_t){
+    7, 6, 5, 4, 3, 2, 1, 0,
+    15, 14, 13, 12, 11, 10, 9, 8
+};
+#endif
+
 void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
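
For readers of the diff: a brief worked note on what the new lookup tables hold, derived from the macro definitions above (explanatory only, not part of the package):

// Each entry expands the 8 bits of the index into 8 bytes, least-significant
// bit in the least-significant byte:
//   table_b2b_0[i]: byte k is 0x10 when bit k of i is 1, else 0x00  -> "( b ) << 4"
//   table_b2b_1[i]: byte k is 0x10 when bit k of i is 0, else 0x00  -> "(!b) << 4"
// For example, table_b2b_1[0x03] == 0x1010101010100000ULL (bits 0 and 1 set,
// so the two lowest bytes are 0x00). The dot-product kernels in the next hunk
// use these tables to splat the 32 "high" bits (qh) of a q5 block across 32 bytes.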
package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -241,6 +262,301 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0.0f;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    float32x4_t v_sum0 = vec_splats(0.0f);
+    float32x4_t v_sum1 = vec_splats(0.0f);
+
+    uint32_t qh0, qh1;
+    uint64_t tmp0[4], tmp1[4];
+
+    const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
+
+    #pragma GCC unroll 4
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
+
+        memcpy(&qh0, x0->qh, sizeof(qh0));
+        memcpy(&qh1, x1->qh, sizeof(qh1));
+
+        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
+        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
+        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
+        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
+
+        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
+        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
+        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
+        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
+
+        int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
+        int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
+        int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
+        int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
+
+        // required for fixing the byteorder
+        v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
+        v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
+        v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
+        v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
+
+        const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
+        const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);
+
+        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+        const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
+        const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
+        const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
+        const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);
+
+        const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs);
+        const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
+        const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs);
+        const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);
+
+        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
+        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
+
+        const float32x4_t v_xy0f = vec_float(v_xy0);
+        const float32x4_t v_xy1f = vec_float(v_xy1);
+
+        const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
+
+        v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
+        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
+    }
+
+    sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
+
+    #pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        uint64_t tmp[4];
+        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_1[(qh >> 24)       ];
+
+        int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
+        int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
+
+        // required for fixing the byteorder
+        v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
+        v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
+
+        const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
+        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+        const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
+        const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);
+
+        const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
+
+        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
+        const float32x4_t v_xyf = vec_float(v_xy);
+
+        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
+
+        sumf += vec_hsum(v_acc);
+    }
+
+    *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_1);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0.0f;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    float32x4_t v_sum0 = vec_splats(0.0f);
+    float32x4_t v_sum1 = vec_splats(0.0f);
+
+    float summs0 = 0.0f;
+    float summs1 = 0.0f;
+
+    uint32_t qh0;
+    uint32_t qh1;
+
+    uint64_t tmp0[4];
+    uint64_t tmp1[4];
+
+    const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
+
+    #pragma GCC unroll 4
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
+
+        summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
+        summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
+
+        memcpy(&qh0, x0->qh, sizeof(qh0));
+        memcpy(&qh1, x1->qh, sizeof(qh1));
+
+        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
+        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
+        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
+        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
+
+        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
+        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
+        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
+        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
+
+        int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
+        int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
+        int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
+        int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
+
+        // required for fixing the byteorder
+        v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
+        v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
+        v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
+        v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
+
+        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
+        const uint8x16_t v_x1 = vec_xl(0, x1->qs);
+
+        const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+        const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+        const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+        const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+        const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
+        const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
+        const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
+        const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);
+
+        const int8x16_t v_y0l = vec_xl(0      , y0->qs);
+        const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
+        const int8x16_t v_y1l = vec_xl(0      , y1->qs);
+        const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);
+
+        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
+        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
+
+        const float32x4_t v_xy0f = vec_float(v_xy0);
+        const float32x4_t v_xy1f = vec_float(v_xy1);
+
+        const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
+
+        v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
+        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
+    }
+
+    sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+
+    #pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+
+        float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
+
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        uint64_t tmp[4];
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];
+
+        int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
+        int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
+
+        // required for fixing the byteorder
+        v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
+        v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
+
+        const uint8x16_t v_x = vec_xl(0, x0->qs);
+        const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+        const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+        const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
+        const int8x16_t v_xhf = vec_or(v_xh, v_qhh);
+
+        const int8x16_t v_yl = vec_xl(0      , y0->qs);
+        const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);
+
+        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
+        const float32x4_t v_xyf = vec_float(v_xy);
+
+        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
+
+        sumf += vec_hsum(v_acc) + summs;
+    }
+
+    *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
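
As an aid to reviewing the vectorized paths above, here is a rough scalar equivalent of one q5_0 × q8_0 block (an illustrative sketch based on the q5_0/q8_0 block layout, not code shipped in the package). It also shows why the q5_0 kernel can subtract the table_b2b_1 mask directly: for a low nibble q and high bit b, (q | (b << 4)) - 16 == q - ((!b) << 4).

#include <stdint.h>

// Illustrative scalar sketch of one q5_0 x q8_0 block (32 values):
//   d_x : fp16 scale of the q5_0 block, qh : 32 high bits, qs : 16 packed low nibbles
//   d_y : scale of the q8_0 block,      y  : 32 int8 quants
static float dot_q5_0_q8_0_block(float d_x, uint32_t qh, const uint8_t qs[16],
                                 float d_y, const int8_t y[32]) {
    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int b_lo = (qh >> j)        & 1;  // high bit of value j
        const int b_hi = (qh >> (j + 16)) & 1;  // high bit of value j + 16
        // (q | (b << 4)) - 16 == q - ((!b) << 4): this is what the VXE path
        // computes with table_b2b_1 and vec_sub
        const int x_lo = (qs[j] & 0x0F) - ((!b_lo) << 4);
        const int x_hi = (qs[j] >>   4) - ((!b_hi) << 4);
        sumi += x_lo * y[j] + x_hi * y[j + 16];
    }
    return d_x * d_y * (float) sumi;
}

The q5_1 kernel instead ORs in table_b2b_0 (the "( b ) << 4" table) because q5_1 values are unsigned and carry a separate minimum m, which is folded in through the summs0/summs1 terms.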
package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h
@@ -150,8 +150,6 @@
 #elif defined(__s390x__)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
-#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -486,6 +486,14 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
     return v_abo + v_abe;
 }
 
+/**
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
+ */
+inline float vec_hsum(float32x4_t v) {
+    float32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
 inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
     return acc + (vec_unpackh(p) + vec_unpackl(p));
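
A quick sanity check of the new vec_hsum helper, worked by hand (explanatory only, not part of the package):

// For v = {1, 2, 3, 4}:
//   vec_reve(v)              = {4, 3, 2, 1}
//   v_temp = v + vec_reve(v) = {5, 5, 5, 5}
//   v_temp[0] + v_temp[1]    = 10 = 1 + 2 + 3 + 4
// The reversed add folds the two halves of the vector together, and the final
// scalar add collapses the remaining pair, yielding the horizontal sum of all
// four lanes. The q5_0/q5_1 kernels above use it to reduce their accumulators.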
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1880,6 +1880,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_conv_2d(params, tensor);
             } break;
+        case GGML_OP_CONV_3D:
+            {
+                ggml_compute_forward_conv_3d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2252,6 +2256,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
         case GGML_OP_CONV_2D:
+        case GGML_OP_CONV_3D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2773,6 +2778,7 @@ struct ggml_cplan ggml_graph_plan(
                     }
                 } break;
             case GGML_OP_CONV_2D:
+            case GGML_OP_CONV_3D:
                 {
                     cur = GGML_IM2COL_WORK_SIZE;
                 } break;
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -7207,6 +7207,148 @@ void ggml_compute_forward_conv_2d(
     ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
 }
 
+// ggml_compute_forward_conv_3d
+
+static void ggml_compute_forward_conv_3d_impl(const ggml_compute_params * params,
+                                              const ggml_tensor *         kernel,
+                                              const ggml_tensor *         src,
+                                              ggml_tensor *               dst,
+                                              ggml_type                   kernel_type) {
+
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32);
+    GGML_ASSERT(kernel->type == kernel_type);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
+
+    const int32_t s0 = dst->op_params[0];
+    const int32_t s1 = dst->op_params[1];
+    const int32_t s2 = dst->op_params[2];
+    const int32_t p0 = dst->op_params[3];
+    const int32_t p1 = dst->op_params[4];
+    const int32_t p2 = dst->op_params[5];
+    const int32_t d0 = dst->op_params[6];
+    const int32_t d1 = dst->op_params[7];
+    const int32_t d2 = dst->op_params[8];
+    const int32_t c  = dst->op_params[9];
+    const int32_t n  = dst->op_params[10];
+    const int32_t oc = dst->op_params[11];
+
+    const int64_t src_w = src->ne[0];
+    const int64_t src_h = src->ne[1];
+    const int64_t src_d = src->ne[2];
+    const int64_t knl_w = kernel->ne[0];
+    const int64_t knl_h = kernel->ne[1];
+    const int64_t knl_d = kernel->ne[2];
+    const int64_t dst_w = dst->ne[0];
+    const int64_t dst_h = dst->ne[1];
+    const int64_t dst_d = dst->ne[2];
+
+    const float * src_data = (float *) src->data;
+    void        * knl_data = kernel->data;
+    float       * dst_data = (float *) dst->data;
+
+    const int64_t knl_n_per_channel = knl_w * knl_h * knl_d;
+    const int64_t knl_n_total       = knl_n_per_channel * c;
+    const int64_t patch_total       = n * dst_w * dst_h * dst_d;
+
+    const int64_t space_per_patch   = knl_n_total * traits->type_size + oc * sizeof(float);
+    const int64_t batch_size        = params->wsize / space_per_patch;
+    const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+    const int64_t batch_n           = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+    GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+    void * tmp = params->wdata;
+
+    for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+        const int64_t patch_start_batch = batch_i * patches_per_batch;
+        const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch, patch_total);
+        const int64_t patch_n_in_batch  = patch_end_batch - patch_start_batch;
+
+        const int64_t patch_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth;
+        const int64_t patch_start      = patch_start_batch + params->ith * patch_per_thread;
+        const int64_t patch_end        = std::min(patch_start + patch_per_thread, patch_end_batch);
+
+        for (int64_t p = patch_start; p < patch_end; ++p) {
+            const int64_t p_in_batch = p % (dst_w * dst_h * dst_d);
+            const int64_t p_in_depth = p_in_batch % (dst_w * dst_h);
+            const int64_t batch_idx  = p / (dst_w * dst_h * dst_d);
+            const int64_t dst_z      = p_in_batch / (dst_w * dst_h);
+            const int64_t dst_y      = p_in_depth / dst_w;
+            const int64_t dst_x      = p_in_depth % dst_w;
+
+            char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n_total * traits->type_size;
+
+            for (int64_t ic = 0; ic < c; ++ic) {
+                for (int64_t kz = 0; kz < knl_d; ++kz) {
+                    for (int64_t ky = 0; ky < knl_h; ++ky) {
+                        for (int64_t kx = 0; kx < knl_w; ++kx) {
+                            const int64_t sz = dst_z * s2 + kz * d2 - p2;
+                            const int64_t sy = dst_y * s1 + ky * d1 - p1;
+                            const int64_t sx = dst_x * s0 + kx * d0 - p0;
+
+                            int64_t dst_idx = ic * knl_n_per_channel + kz * (knl_h * knl_w) + ky * knl_w + kx;
+
+                            float src_val;
+                            if (sz < 0 || sz >= src_d || sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+                                src_val = 0.0f;
+                            } else {
+                                const int64_t cn_idx = batch_idx * c + ic;
+                                const float * src_ptr = (const float *)((const char *)src_data + sx*src->nb[0] + sy*src->nb[1] + sz*src->nb[2] + cn_idx*src->nb[3]);
+                                src_val = *src_ptr;
+                            }
+
+                            char * element_ptr = dst_row + dst_idx * traits->type_size;
+                            if (kernel_type == GGML_TYPE_F32) {
+                                *(float *)element_ptr = src_val;
+                            } else if (kernel_type == GGML_TYPE_F16) {
+                                *(ggml_fp16_t *)element_ptr = GGML_CPU_FP32_TO_FP16(src_val);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        ggml_barrier(params->threadpool);
+
+        float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n_total * traits->type_size);
+        ggml_call_mul_mat(kernel_type, params, patch_n_in_batch, oc, knl_n_total, tmp, knl_data, gemm_output);
+
+        ggml_barrier(params->threadpool);
+
+        const int64_t permute_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth;
+        const int64_t permute_start      = params->ith * permute_per_thread;
+        const int64_t permute_end        = std::min(permute_start + permute_per_thread, patch_n_in_batch);
+
+        for (int64_t i = permute_start; i < permute_end; ++i) {
+            const int64_t p          = patch_start_batch + i;
+            const int64_t p_in_batch = p % (dst_w * dst_h * dst_d);
+            const int64_t p_in_depth = p_in_batch % (dst_w * dst_h);
+            const int64_t batch_idx  = p / (dst_w * dst_h * dst_d);
+            const int64_t dst_z      = p_in_batch / (dst_w * dst_h);
+            const int64_t dst_y      = p_in_depth / dst_w;
+            const int64_t dst_x      = p_in_depth % dst_w;
+
+            for (int64_t ioc = 0; ioc < oc; ++ioc) {
+                const float   value   = gemm_output[i * oc + ioc];
+                const int64_t ocn_idx = batch_idx * oc + ioc;
+                float * dst_ptr = (float *)((char *)dst_data + dst_x*dst->nb[0] + dst_y*dst->nb[1] + dst_z*dst->nb[2] + ocn_idx*dst->nb[3]);
+                *dst_ptr = value;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_3d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type);
+}
+
 // ggml_compute_forward_conv_transpose_2d
 
 void ggml_compute_forward_conv_transpose_2d(
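
To make the im2col + GEMM structure of ggml_compute_forward_conv_3d_impl easier to follow, a small sketch of the shapes involved (the output-extent formula below is the usual convolution geometry and is stated as an assumption; the graph-building code that sets dst->ne is not part of this diff):

#include <stdint.h>

// Assumed output extent per spatial dimension of a strided, padded, dilated conv.
static int64_t conv_out_extent(int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
    return (in + 2*p - d*(k - 1) - 1) / s + 1;
}
// e.g. in = 16, k = 3, s = 1, p = 1, d = 1  ->  16 (same-size output)
//
// The im2col scratch built above holds one row per output voxel:
//   rows    = n * dst_w * dst_h * dst_d   (patch_total)
//   columns = c * knl_w * knl_h * knl_d   (knl_n_total)
// ggml_call_mul_mat then multiplies it against the kernel to produce oc floats
// per row, which the final loop scatters back into dst using its nb[] strides.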
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h
@@ -70,6 +70,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
package/src/llama.cpp/include/llama.h
@@ -64,8 +64,6 @@ extern "C" {
 
     typedef struct llama_memory_i * llama_memory_t;
 
-    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
-
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
    typedef int32_t llama_seq_id;
@@ -314,7 +312,7 @@
         float    yarn_beta_fast; // YaRN low correction dim
         float    yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx;  // YaRN original context size
-        float    defrag_thold;   // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float    defrag_thold;   // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
@@ -469,8 +467,6 @@
     LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
-    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
 
@@ -667,111 +663,6 @@
     // Check if the memory supports shifting
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
 
-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    DEPRECATED(LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx),
-            "Use llama_memory_clear() instead");
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    // seq_id < 0 : match any sequence
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-            "Use llama_memory_seq_rm() instead");
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-            "Use llama_memory_seq_cp() instead");
-
-    // Removes all tokens that do not belong to the specified sequence
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-            "Use llama_memory_seq_keep() instead");
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-            "Use llama_memory_seq_add() instead");
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-            "Use llama_memory_seq_div() instead");
-
-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-            "Use llama_memory_seq_pos_min() instead");
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-            "Use llama_memory_seq_pos_max() instead");
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-    // Check if the context supports KV cache shifting
-    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-            "use llama_memory_can_shift() instead");
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-            "simply remove this call, updates are applied lazily on the next llama_decode()");
-
     //
     // State / sessions
     //