@fugood/llama.node 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/src/LlamaContext.cpp +20 -0
  8. package/src/common.hpp +8 -1
  9. package/src/llama.cpp/common/arg.cpp +13 -4
  10. package/src/llama.cpp/common/chat.cpp +33 -2
  11. package/src/llama.cpp/common/common.cpp +0 -15
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  14. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  22. package/src/llama.cpp/include/llama.h +1 -110
  23. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  24. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  25. package/src/llama.cpp/src/llama-arch.h +1 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +5 -197
  29. package/src/llama.cpp/src/llama-context.h +2 -7
  30. package/src/llama.cpp/src/llama-cparams.h +0 -1
  31. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  32. package/src/llama.cpp/src/llama-graph.h +36 -46
  33. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
  34. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
  35. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
  36. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
  37. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
  40. package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
  41. package/src/llama.cpp/src/llama-memory.h +3 -8
  42. package/src/llama.cpp/src/llama-model.cpp +449 -246
  43. package/src/llama.cpp/src/llama-model.h +2 -0
@@ -23,6 +23,27 @@
 
  #define UNUSED GGML_UNUSED
 
+ #if defined(__VXE__) || defined(__VXE2__)
+ #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
+ #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+ #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+ #define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+ #define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+ #define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+ #define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+ #define B8(c,s ) B7(c,s, c), B7(c,s, s)
+
+ // precomputed tables for expanding 8bits to 8 bytes:
+ static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
+ static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
+
+ // permute mask for byteswapping
+ static const uint8x16_t v_kperm = (const uint8x16_t){
+      7,  6,  5,  4,  3,  2,  1,  0,
+     15, 14, 13, 12, 11, 10,  9,  8
+ };
+ #endif
+
  void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
      assert(QK8_0 == 32);
      assert(k % QK8_0 == 0);
@@ -241,6 +262,301 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  #endif
  }
 
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+     const int qk = QK8_0;
+     const int nb = n / qk;
+
+     assert(n % qk == 0);
+     assert(qk == QK5_0);
+     assert(nrc == 1);
+     UNUSED(nrc);
+     UNUSED(bx);
+     UNUSED(by);
+     UNUSED(bs);
+
+     const block_q5_0 * GGML_RESTRICT x = vx;
+     const block_q8_0 * GGML_RESTRICT y = vy;
+
+     int ib = 0;
+     float sumf = 0.0f;
+
+ #if defined(__VXE__) || defined(__VXE2__)
+     float32x4_t v_sum0 = vec_splats(0.0f);
+     float32x4_t v_sum1 = vec_splats(0.0f);
+
+     uint32_t qh0, qh1;
+     uint64_t tmp0[4], tmp1[4];
+
+     const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
+
+     #pragma GCC unroll 4
+     for (; ib + 1 < nb; ib += 2) {
+         const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
+         const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
+         const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+         const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
+
+         memcpy(&qh0, x0->qh, sizeof(qh0));
+         memcpy(&qh1, x1->qh, sizeof(qh1));
+
+         tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
+         tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
+         tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
+         tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
+
+         tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
+         tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
+         tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
+         tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
+
+         int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
+         int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
+         int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
+         int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
+
+         // required for fixing the byteorder
+         v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
+         v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
+         v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
+         v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
+
+         const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
+         const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);
+
+         int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+         int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+         int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+         int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+         const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
+         const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
+         const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
+         const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);
+
+         const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs);
+         const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
+         const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs);
+         const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);
+
+         const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
+         const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
+
+         const float32x4_t v_xy0f = vec_float(v_xy0);
+         const float32x4_t v_xy1f = vec_float(v_xy1);
+
+         const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+         const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
+
+         v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
+         v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
+     }
+
+     sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
+
+     #pragma GCC unroll 4
+     for (; ib < nb; ++ib) {
+         const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+         const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+
+         uint32_t qh;
+         memcpy(&qh, x0->qh, sizeof(qh));
+
+         uint64_t tmp[4];
+         tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
+         tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
+         tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
+         tmp[3] = table_b2b_1[(qh >> 24)       ];
+
+         int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
+         int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
+
+         // required for fixing the byteorder
+         v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
+         v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
+
+         const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
+         int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+         int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+         const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
+         const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);
+
+         const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs);
+         const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
+
+         const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
+         const float32x4_t v_xyf = vec_float(v_xy);
+
+         const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+         const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
+
+         sumf += vec_hsum(v_acc);
+     }
+
+     *s = sumf;
+ #else
+     UNUSED(nb);
+     UNUSED(x);
+     UNUSED(y);
+     UNUSED(ib);
+     UNUSED(sumf);
+     ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
+ }
+
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+     const int qk = QK8_1;
+     const int nb = n / qk;
+
+     assert(n % qk == 0);
+     assert(qk == QK5_1);
+     assert(nrc == 1);
+     UNUSED(nrc);
+     UNUSED(bx);
+     UNUSED(by);
+     UNUSED(bs);
+
+     const block_q5_1 * GGML_RESTRICT x = vx;
+     const block_q8_1 * GGML_RESTRICT y = vy;
+
+     int ib = 0;
+     float sumf = 0.0f;
+
+ #if defined(__VXE__) || defined(__VXE2__)
+     float32x4_t v_sum0 = vec_splats(0.0f);
+     float32x4_t v_sum1 = vec_splats(0.0f);
+
+     float summs0 = 0.0f;
+     float summs1 = 0.0f;
+
+     uint32_t qh0;
+     uint32_t qh1;
+
+     uint64_t tmp0[4];
+     uint64_t tmp1[4];
+
+     const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
+
+     #pragma GCC unroll 4
+     for (; ib + 1 < nb; ib += 2) {
+         const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
+         const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
+         const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
+         const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
+
+         summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
+         summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
+
+         memcpy(&qh0, x0->qh, sizeof(qh0));
+         memcpy(&qh1, x1->qh, sizeof(qh1));
+
+         tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
+         tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
+         tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
+         tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
+
+         tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
+         tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
+         tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
+         tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
+
+         int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
+         int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
+         int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
+         int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
+
+         // required for fixing the byteorder
+         v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
+         v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
+         v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
+         v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
+
+         const uint8x16_t v_x0 = vec_xl(0, x0->qs);
+         const uint8x16_t v_x1 = vec_xl(0, x1->qs);
+
+         const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+         const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+         const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+         const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+         const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
+         const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
+         const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
+         const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);
+
+         const int8x16_t v_y0l = vec_xl(0, y0->qs);
+         const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
+         const int8x16_t v_y1l = vec_xl(0, y1->qs);
+         const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);
+
+         const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
+         const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
+
+         const float32x4_t v_xy0f = vec_float(v_xy0);
+         const float32x4_t v_xy1f = vec_float(v_xy1);
+
+         const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+         const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
+
+         v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
+         v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
+     }
+
+     sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+
+     #pragma GCC unroll 4
+     for (; ib < nb; ++ib) {
+         const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+         const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+
+         float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
+
+         uint32_t qh;
+         memcpy(&qh, x0->qh, sizeof(qh));
+
+         uint64_t tmp[4];
+         tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+         tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+         tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+         tmp[3] = table_b2b_0[(qh >> 24)       ];
+
+         int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
+         int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
+
+         // required for fixing the byteorder
+         v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
+         v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
+
+         const uint8x16_t v_x = vec_xl(0, x0->qs);
+         const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+         const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+         const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
+         const int8x16_t v_xhf = vec_or(v_xh, v_qhh);
+
+         const int8x16_t v_yl = vec_xl(0, y0->qs);
+         const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);
+
+         const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
+         const float32x4_t v_xyf = vec_float(v_xy);
+
+         const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+         // accumulate into a zero-initialized accumulator, as in the q5_0 kernel above
+         const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
+
+         sumf += vec_hsum(v_acc) + summs;
+     }
+
+     *s = sumf;
+ #else
+     UNUSED(nb);
+     UNUSED(x);
+     UNUSED(y);
+     UNUSED(ib);
+     UNUSED(sumf);
+     ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
+ }
+
  void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
      const int qk = QK8_0;
      const int nb = n / qk;
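The two kernels above vectorize the 5-bit dot products for s390x VXE/VXE2. The table comments ("( b ) << 4" / "(!b) << 4") point at the trick: q5_0 values are signed, so instead of OR-ing in the fifth bit and then subtracting 16, the kernel subtracts the expanded (!bit) << 4 bytes with a single vec_sub; q5_1 values are unsigned plus a per-block minimum, so the fifth bit is OR-ed in via table_b2b_0 and the minimum is folded into the summs terms. A scalar sketch of the per-element identity, assuming the standard ggml q5_0/q5_1 block layout (low nibble in qs[], fifth bit in the qh bitfield), which is not shown in this diff:

    #include <stdint.h>

    // Illustration only, not code from the package.
    static inline int32_t q5_0_dequant_elem(uint8_t nibble, uint32_t bit) {
        // Signed q5_0 value: ((nibble | (bit << 4)) - 16) == nibble - ((!bit) << 4),
        // which is what vec_sub(v_x*, <table_b2b_1 expansion>) computes lane by lane.
        return (int32_t)(nibble | (bit << 4)) - 16;
    }

    static inline uint32_t q5_1_dequant_elem(uint8_t nibble, uint32_t bit) {
        // Unsigned q5_1 value: the fifth bit is simply OR-ed in (table_b2b_0);
        // the per-block minimum m is added separately (the summs terms above).
        return nibble | (bit << 4);
    }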
@@ -73,7 +73,6 @@
  #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
  #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
  #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
- #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -151,8 +150,6 @@
  #elif defined(__s390x__)
  // quants.c
  #define quantize_row_q8_K_generic quantize_row_q8_K
- #define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
- #define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
  #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
  #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
  #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -486,6 +486,14 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
      return v_abo + v_abe;
  }
 
+ /**
+  * @see https://github.com/ggml-org/llama.cpp/pull/14037
+  */
+ inline float vec_hsum(float32x4_t v) {
+     float32x4_t v_temp = v + vec_reve(v);
+     return v_temp[0] + v_temp[1];
+ }
+
  inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
      const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
      return acc + (vec_unpackh(p) + vec_unpackl(p));
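The new vec_hsum helper is the horizontal reduction used by the q5_0/q5_1 kernels: adding a vector to its lane-reversed copy leaves the full four-lane sum split across the first two lanes. A plain-C equivalent, for illustration only:

    // v + vec_reve(v) == { v0+v3, v1+v2, v2+v1, v3+v0 }, so lanes 0 and 1
    // together already contain every input exactly once.
    static inline float hsum4_reference(const float v[4]) {
        return (v[0] + v[3]) + (v[1] + v[2]);
    }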
@@ -1880,6 +1880,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
          {
              ggml_compute_forward_conv_2d(params, tensor);
          } break;
+     case GGML_OP_CONV_3D:
+         {
+             ggml_compute_forward_conv_3d(params, tensor);
+         } break;
      case GGML_OP_CONV_2D_DW:
          {
              ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2252,6 +2256,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
      case GGML_OP_IM2COL:
      case GGML_OP_IM2COL_BACK:
      case GGML_OP_CONV_2D:
+     case GGML_OP_CONV_3D:
      case GGML_OP_CONV_2D_DW:
      case GGML_OP_CONV_TRANSPOSE_1D:
      case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2773,6 +2778,7 @@ struct ggml_cplan ggml_graph_plan(
              }
          } break;
      case GGML_OP_CONV_2D:
+     case GGML_OP_CONV_3D:
          {
              cur = GGML_IM2COL_WORK_SIZE;
          } break;
@@ -7207,6 +7207,148 @@ void ggml_compute_forward_conv_2d(
      ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
  }
 
+ // ggml_compute_forward_conv_3d
+
+ static void ggml_compute_forward_conv_3d_impl(const ggml_compute_params * params,
+                                               const ggml_tensor * kernel,
+                                               const ggml_tensor * src,
+                                               ggml_tensor * dst,
+                                               ggml_type kernel_type) {
+
+     GGML_ASSERT(ggml_is_contiguous(kernel));
+     GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32);
+     GGML_ASSERT(kernel->type == kernel_type);
+
+     const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
+
+     const int32_t s0 = dst->op_params[0];
+     const int32_t s1 = dst->op_params[1];
+     const int32_t s2 = dst->op_params[2];
+     const int32_t p0 = dst->op_params[3];
+     const int32_t p1 = dst->op_params[4];
+     const int32_t p2 = dst->op_params[5];
+     const int32_t d0 = dst->op_params[6];
+     const int32_t d1 = dst->op_params[7];
+     const int32_t d2 = dst->op_params[8];
+     const int32_t c  = dst->op_params[9];
+     const int32_t n  = dst->op_params[10];
+     const int32_t oc = dst->op_params[11];
+
+     const int64_t src_w = src->ne[0];
+     const int64_t src_h = src->ne[1];
+     const int64_t src_d = src->ne[2];
+     const int64_t knl_w = kernel->ne[0];
+     const int64_t knl_h = kernel->ne[1];
+     const int64_t knl_d = kernel->ne[2];
+     const int64_t dst_w = dst->ne[0];
+     const int64_t dst_h = dst->ne[1];
+     const int64_t dst_d = dst->ne[2];
+
+     const float * src_data = (float *) src->data;
+     void * knl_data = kernel->data;
+     float * dst_data = (float *) dst->data;
+
+     const int64_t knl_n_per_channel = knl_w * knl_h * knl_d;
+     const int64_t knl_n_total = knl_n_per_channel * c;
+     const int64_t patch_total = n * dst_w * dst_h * dst_d;
+
+     const int64_t space_per_patch = knl_n_total * traits->type_size + oc * sizeof(float);
+     const int64_t batch_size = params->wsize / space_per_patch;
+     const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+     const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+     GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+     void * tmp = params->wdata;
+
+     for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+         const int64_t patch_start_batch = batch_i * patches_per_batch;
+         const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch, patch_total);
+         const int64_t patch_n_in_batch  = patch_end_batch - patch_start_batch;
+
+         const int64_t patch_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth;
+         const int64_t patch_start      = patch_start_batch + params->ith * patch_per_thread;
+         const int64_t patch_end        = std::min(patch_start + patch_per_thread, patch_end_batch);
+
+         for (int64_t p = patch_start; p < patch_end; ++p) {
+             const int64_t p_in_batch = p % (dst_w * dst_h * dst_d);
+             const int64_t p_in_depth = p_in_batch % (dst_w * dst_h);
+             const int64_t batch_idx  = p / (dst_w * dst_h * dst_d);
+             const int64_t dst_z      = p_in_batch / (dst_w * dst_h);
+             const int64_t dst_y      = p_in_depth / dst_w;
+             const int64_t dst_x      = p_in_depth % dst_w;
+
+             char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n_total * traits->type_size;
+
+             for (int64_t ic = 0; ic < c; ++ic) {
+                 for (int64_t kz = 0; kz < knl_d; ++kz) {
+                     for (int64_t ky = 0; ky < knl_h; ++ky) {
+                         for (int64_t kx = 0; kx < knl_w; ++kx) {
+                             const int64_t sz = dst_z * s2 + kz * d2 - p2;
+                             const int64_t sy = dst_y * s1 + ky * d1 - p1;
+                             const int64_t sx = dst_x * s0 + kx * d0 - p0;
+
+                             int64_t dst_idx = ic * knl_n_per_channel + kz * (knl_h * knl_w) + ky * knl_w + kx;
+
+                             float src_val;
+                             if (sz < 0 || sz >= src_d || sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+                                 src_val = 0.0f;
+                             } else {
+                                 const int64_t cn_idx = batch_idx * c + ic;
+                                 const float * src_ptr = (const float *)((const char *)src_data + sx*src->nb[0] + sy*src->nb[1] + sz*src->nb[2] + cn_idx*src->nb[3]);
+                                 src_val = *src_ptr;
+                             }
+
+                             char * element_ptr = dst_row + dst_idx * traits->type_size;
+                             if (kernel_type == GGML_TYPE_F32) {
+                                 *(float *)element_ptr = src_val;
+                             } else if (kernel_type == GGML_TYPE_F16) {
+                                 *(ggml_fp16_t *)element_ptr = GGML_CPU_FP32_TO_FP16(src_val);
+                             }
+                         }
+                     }
+                 }
+             }
+         }
+
+         ggml_barrier(params->threadpool);
+
+         float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n_total * traits->type_size);
+         ggml_call_mul_mat(kernel_type, params, patch_n_in_batch, oc, knl_n_total, tmp, knl_data, gemm_output);
+
+         ggml_barrier(params->threadpool);
+
+         const int64_t permute_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth;
+         const int64_t permute_start      = params->ith * permute_per_thread;
+         const int64_t permute_end        = std::min(permute_start + permute_per_thread, patch_n_in_batch);
+
+         for (int64_t i = permute_start; i < permute_end; ++i) {
+             const int64_t p          = patch_start_batch + i;
+             const int64_t p_in_batch = p % (dst_w * dst_h * dst_d);
+             const int64_t p_in_depth = p_in_batch % (dst_w * dst_h);
+             const int64_t batch_idx  = p / (dst_w * dst_h * dst_d);
+             const int64_t dst_z      = p_in_batch / (dst_w * dst_h);
+             const int64_t dst_y      = p_in_depth / dst_w;
+             const int64_t dst_x      = p_in_depth % dst_w;
+
+             for (int64_t ioc = 0; ioc < oc; ++ioc) {
+                 const float value     = gemm_output[i * oc + ioc];
+                 const int64_t ocn_idx = batch_idx * oc + ioc;
+                 float * dst_ptr = (float *)((char *)dst_data + dst_x*dst->nb[0] + dst_y*dst->nb[1] + dst_z*dst->nb[2] + ocn_idx*dst->nb[3]);
+                 *dst_ptr = value;
+             }
+         }
+     }
+ }
+
+ void ggml_compute_forward_conv_3d(
+         const ggml_compute_params * params,
+         ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const ggml_tensor * src1 = dst->src[1];
+     ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type);
+ }
+
  // ggml_compute_forward_conv_transpose_2d
 
  void ggml_compute_forward_conv_transpose_2d(
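The new GGML_OP_CONV_3D path follows the same im2col + GEMM strategy as the existing CONV_2D code above it: each output voxel's receptive field (c * knl_d * knl_h * knl_w values) is gathered into one row of the scratch buffer, a batch of rows is multiplied against the flattened kernel with ggml_call_mul_mat, and the GEMM result is scattered back into dst. The gather index sx = dst_x*s0 + kx*d0 - p0 (and likewise for y and z) implies the usual convolution output-extent formula; the helper below is a generic reference, not code from the package (the kernel reads the output sizes from dst->ne rather than computing them):

    #include <stdint.h>

    // Reference only: output extent of one spatial axis for input size `in`,
    // kernel size `k`, stride `s`, padding `p`, dilation `d`.
    static inline int64_t conv_out_size(int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
        return (in + 2*p - d*(k - 1) - 1) / s + 1;
    }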
@@ -70,6 +70,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
  void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -64,8 +64,6 @@ extern "C" {
 
      typedef struct llama_memory_i * llama_memory_t;
 
-     struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
-
      typedef int32_t llama_pos;
      typedef int32_t llama_token;
      typedef int32_t llama_seq_id;
@@ -314,7 +312,7 @@
      float yarn_beta_fast;    // YaRN low correction dim
      float yarn_beta_slow;    // YaRN high correction dim
      uint32_t yarn_orig_ctx;  // YaRN original context size
-     float defrag_thold;      // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+     float defrag_thold;      // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
      ggml_backend_sched_eval_callback cb_eval;
      void * cb_eval_user_data;
@@ -469,8 +467,6 @@
      LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
      LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
-     DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-
      LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
      LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
 
@@ -667,111 +663,6 @@
      // Check if the memory supports shifting
      LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
 
-     //
-     // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-     //
-
-     // Returns the number of tokens in the KV cache (slow, use only for debug)
-     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-     DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-         "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-     DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-         "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-     // Clear the KV cache - both cell info is erased and KV data is zeroed
-     DEPRECATED(LLAMA_API void llama_kv_self_clear(
-         struct llama_context * ctx),
-         "Use llama_memory_clear() instead");
-
-     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-     // seq_id < 0 : match any sequence
-     // p0 < 0 : [0, p1]
-     // p1 < 0 : [p0, inf)
-     DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-         struct llama_context * ctx,
-         llama_seq_id seq_id,
-         llama_pos p0,
-         llama_pos p1),
-         "Use llama_memory_seq_rm() instead");
-
-     // Copy all tokens that belong to the specified sequence to another sequence
-     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-     // p0 < 0 : [0, p1]
-     // p1 < 0 : [p0, inf)
-     DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-         struct llama_context * ctx,
-         llama_seq_id seq_id_src,
-         llama_seq_id seq_id_dst,
-         llama_pos p0,
-         llama_pos p1),
-         "Use llama_memory_seq_cp() instead");
-
-     // Removes all tokens that do not belong to the specified sequence
-     DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-         struct llama_context * ctx,
-         llama_seq_id seq_id),
-         "Use llama_memory_seq_keep() instead");
-
-     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-     // If the KV cache is RoPEd, the KV data is updated accordingly:
-     //   - lazily on next llama_decode()
-     // p0 < 0 : [0, p1]
-     // p1 < 0 : [p0, inf)
-     DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-         struct llama_context * ctx,
-         llama_seq_id seq_id,
-         llama_pos p0,
-         llama_pos p1,
-         llama_pos delta),
-         "Use llama_memory_seq_add() instead");
-
-     // Integer division of the positions by factor of `d > 1`
-     // If the KV cache is RoPEd, the KV data is updated accordingly:
-     //   - lazily on next llama_decode()
-     // p0 < 0 : [0, p1]
-     // p1 < 0 : [p0, inf)
-     DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-         struct llama_context * ctx,
-         llama_seq_id seq_id,
-         llama_pos p0,
-         llama_pos p1,
-         int d),
-         "Use llama_memory_seq_div() instead");
-
-     // Returns the smallest position present in the KV cache for the specified sequence
-     // This is typically non-zero only for SWA caches
-     // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-     // Return -1 if the sequence is empty
-     DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-         struct llama_context * ctx,
-         llama_seq_id seq_id),
-         "Use llama_memory_seq_pos_min() instead");
-
-     // Returns the largest position present in the KV cache for the specified sequence
-     // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-     // Return -1 if the sequence is empty
-     DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-         struct llama_context * ctx,
-         llama_seq_id seq_id),
-         "Use llama_memory_seq_pos_max() instead");
-
-     // Defragment the KV cache
-     // This will be applied:
-     //   - lazily on next llama_decode()
-     DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-         "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-     // Check if the context supports KV cache shifting
-     DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-         "use llama_memory_can_shift() instead");
-
-     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-     DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-         "simply remove this call, updates are applied lazily on the next llama_decode()");
-
      //
      // State / sessions
      //
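This hunk removes the long-deprecated llama_kv_self_* wrappers from llama.h; the deprecation messages above name their replacements in the llama_memory_* family, reached through llama_get_memory(). A migration sketch follows; the exact prototypes of the llama_memory_* functions are not shown in this diff, so the signatures below (in particular the second argument of llama_memory_clear) are assumptions to be checked against the shipped llama.h:

    #include "llama.h"

    // Hypothetical helper, not part of the package: drop positions [p0, p1) of one
    // sequence using the llama_memory_* API instead of the removed llama_kv_self_* calls.
    static void prune_sequence(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
        llama_memory_t mem = llama_get_memory(ctx);

        // before: llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
        llama_memory_seq_rm(mem, seq_id, p0, p1);

        // before: llama_kv_self_seq_pos_max(ctx, seq_id);
        const llama_pos pos_max = llama_memory_seq_pos_max(mem, seq_id);
        (void) pos_max;

        // before: llama_kv_self_clear(ctx);  (second argument is an assumption)
        llama_memory_clear(mem, /*data =*/ true);

        // llama_kv_self_defrag()/llama_kv_self_update() have no replacement:
        // the context applies such updates lazily on the next llama_decode().
    }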