@fugood/llama.node 1.1.10 → 1.2.0-rc.0

Files changed (77)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +20 -2
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +174 -388
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +67 -37
  27. package/src/llama.cpp/common/chat.cpp +263 -2
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.cpp +10 -3
  30. package/src/llama.cpp/common/common.h +5 -2
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  39. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  40. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  45. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  46. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
  53. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
  54. package/src/llama.cpp/include/llama.h +32 -7
  55. package/src/llama.cpp/src/llama-adapter.cpp +101 -4
  56. package/src/llama.cpp/src/llama-adapter.h +6 -0
  57. package/src/llama.cpp/src/llama-arch.cpp +69 -2
  58. package/src/llama.cpp/src/llama-arch.h +6 -0
  59. package/src/llama.cpp/src/llama-context.cpp +92 -45
  60. package/src/llama.cpp/src/llama-context.h +1 -5
  61. package/src/llama.cpp/src/llama-graph.cpp +74 -19
  62. package/src/llama.cpp/src/llama-graph.h +10 -1
  63. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  64. package/src/llama.cpp/src/llama-hparams.h +9 -3
  65. package/src/llama.cpp/src/llama-impl.h +2 -0
  66. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
  67. package/src/llama.cpp/src/llama-kv-cache.h +4 -13
  68. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  69. package/src/llama.cpp/src/llama-model.cpp +434 -21
  70. package/src/llama.cpp/src/llama-model.h +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  72. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  73. package/src/llama.cpp/src/llama.cpp +12 -0
  74. package/src/anyascii.c +0 -22223
  75. package/src/anyascii.h +0 -42
  76. package/src/tts_utils.cpp +0 -371
  77. package/src/tts_utils.h +0 -103
package/src/llama.cpp/ggml/include/ggml.h
@@ -511,6 +511,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_IM2COL_3D,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
@@ -1870,6 +1871,41 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_im2col_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2, // dilation depth
+            enum ggml_type        dst_type);
+
+    // a: [OC*IC, KD, KH, KW]
+    // b: [N*IC, ID, IH, IW]
+    // result: [N*OC, OD, OH, OW]
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2  // dilation depth
+        );
+
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
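
Usage sketch (annotation, not part of the diff): the call below assumes a live ggml_context and reads the shape comments above with ne[0] as the contiguous width axis; the channel counts, volume size, and helper name are illustrative, not taken from the package.

    // hypothetical: IC = 2 input channels, OC = 4 output channels, one 16x16x16 volume
    static struct ggml_tensor * conv3d_sketch(struct ggml_context * ctx) {
        const int64_t IC = 2, OC = 4, N = 1;
        // kernel ne = {KW, KH, KD, OC*IC}, input ne = {IW, IH, ID, N*IC} (assumed axis order)
        struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3,  3, OC * IC);
        struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, N * IC);
        // unit stride/dilation with padding 1 on width, height, and depth keeps the 16x16x16 extent
        return ggml_conv_3d(ctx, kernel, input, IC,
                            1, 1, 1,  // s0, s1, s2
                            1, 1, 1,  // p0, p1, p2
                            1, 1, 1); // d0, d1, d2
    }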
@@ -1941,7 +1977,7 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
-    GGML_API struct ggml_tensor * ggml_conv_3d(
+    GGML_API struct ggml_tensor * ggml_conv_3d_direct(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // kernel [KW, KH, KD, IC * OC]
             struct ggml_tensor  * b,   // input [W, H, D, C * N]
@@ -2048,6 +2084,19 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    GGML_API struct ggml_tensor * ggml_pad_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   lp0,
+            int                   rp0,
+            int                   lp1,
+            int                   rp1,
+            int                   lp2,
+            int                   rp2,
+            int                   lp3,
+            int                   rp3
+        );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
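
Usage sketch (annotation, not part of the diff): ggml_pad appends zeros only at the high end of each dimension, whereas ggml_pad_ext takes separate leading (lp*) and trailing (rp*) counts per dimension; the left/right reading of lp/rp is inferred from the parameter names, and the tensor below is illustrative.

    // 8x8 -> 11x8: one zero column before and two after along dim 0; other dims untouched
    struct ggml_tensor * t      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    struct ggml_tensor * padded = ggml_pad_ext(ctx, t, 1, 2, 0, 0, 0, 0, 0, 0);
    // expected: padded->ne[0] == 11, padded->ne[1] == 8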
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -433,15 +433,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 ggml-cpu/arch/riscv/quants.c
                 ggml-cpu/arch/riscv/repack.cpp
             )
-            if (GGML_RVV)
-                if (GGML_XTHEADVECTOR)
-                    list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
-                elseif (GGML_RV_ZFH)
-                    list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
-                else()
-                    list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            set(MARCH_STR "rv64gc")
+            if (GGML_RV_ZFH)
+                string(APPEND MARCH_STR "_zfh")
+            endif()
+            if (GGML_XTHEADVECTOR)
+                string(APPEND MARCH_STR "_xtheadvector")
+            elseif (GGML_RVV)
+                string(APPEND MARCH_STR "_v")
+                if (GGML_RV_ZVFH)
+                    string(APPEND MARCH_STR "_zvfh")
                 endif()
             endif()
+            if (GGML_RV_ZICBOP)
+                string(APPEND MARCH_STR "_zicbop")
+            endif()
+            list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
             message(STATUS "s390x detected")
             list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
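
Annotation: the -march string is now composed incrementally, so extension options combine instead of being mutually exclusive. For example, GGML_RVV=ON with GGML_RV_ZVFH=ON and GGML_RV_ZICBOP=ON should produce -march=rv64gc_v_zvfh_zicbop -mabi=lp64d; GGML_XTHEADVECTOR still takes precedence over standard RVV, and GGML_RV_ZFH now appends _zfh rather than selecting the old _zfhmin variant.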
@@ -450,7 +457,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
             # TODO: Separation to determine activation of VX/VXE/VXE2
             if (${S390X_M} MATCHES "8561|8562")
-                set(GGML_NNPA OFF)
                 message(STATUS "z15 target")
                 list(APPEND ARCH_FLAGS -march=z15)
             elseif (${S390X_M} MATCHES "3931")
@@ -472,11 +478,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 list(APPEND ARCH_FLAGS -mvx -mzvector)
                 list(APPEND ARCH_DEFINITIONS GGML_VXE)
             endif()
-
-            if (GGML_NNPA)
-                message(STATUS "NNPA enabled")
-                list(APPEND ARCH_DEFINITIONS GGML_NNPA)
-            endif()
         elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
             message(STATUS "Wasm detected")
             list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
@@ -497,9 +498,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.11.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.13.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "3fe9e5ab964c375c53839296eb71eaa2")
+        set(KLEIDIAI_ARCHIVE_MD5  "d82a8de939d9814621a5ba23907bdac1")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
@@ -555,6 +556,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         list(APPEND GGML_KLEIDIAI_SOURCES
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
@@ -576,7 +578,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
                 ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
                 ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
+                ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
             set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
         endif()
 
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -1270,29 +1270,40 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
         const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
-        int tmp, tmp2, sumi;
+        float ftmp, ft2;
+        const uint8_t * restrict q40;
+        const uint8_t * restrict q41;
+        const uint8_t * restrict q42;
+        const uint8_t * restrict q43;
+        const int8_t * restrict q80;
+        const int8_t * restrict q81;
+        const int8_t * restrict q82;
+        const int8_t * restrict q83;
+        int s0, s1, s2, s3;
+
         __asm__ __volatile__(
-            "vsetivli zero, 12, e8, m1\n\t"
-            "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
-            "vsetivli zero, 4, e32, m1\n\t"
+            "li %[s1], 8\n\t"
+            "vsetivli zero, 4, e32, m1, ta, ma\n\t"
+            "vle32.v v1, (%[s6b])\n\t"
+            "vslide1down.vx v1, v1, zero\n\t"
+            "vmv.v.x v16, zero\n\t"
             "vslidedown.vi v2, v1, 2\n\t"
             "vmv1r.v v3, v2\n\t"
             "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
-            "vsetivli zero, 2, e32, m1\n\t"
+            "vsetivli zero, 2, e32, m1, ta, ma\n\t"
             "vmv.v.i v4, 4\n\t"
             "vand.vx v8, v1, %[kmask1]\n\t"
             "vslide1up.vx v5, v4, zero\n\t" // {0, 4}
             "vsrl.vi v6, v1, 6\n\t"
             "vsrl.vv v7, v2, v5\n\t"
+            "vsse32.v v8, (%[utmp]), %[s1]\n\t"
             "vand.vx v0, v6, %[kmask3]\n\t"
             "vand.vx v2, v7, %[kmask2]\n\t"
            "vsll.vi v6, v0, 4\n\t"
-            "li %[t2], 8\n\t"
-            "addi %[t1], %[utmp], 4\n\t"
+            "addi %[s0], %[utmp], 4\n\t"
            "vor.vv v1, v6, v2\n\t"
-            "vsse32.v v8, (%[utmp]), %[t2]\n\t"
-            "vsse32.v v1, (%[t1]), %[t2]\n\t"
-            "vsetivli zero, 8, e16, m1\n\t"
+            "vsse32.v v1, (%[s0]), %[s1]\n\t"
+            "vsetivli zero, 8, e16, m1, ta, ma\n\t"
            "vle32.v v2, (%[bsums])\n\t"
            "vnsrl.wi v0, v2, 0\n\t"
            "vnsrl.wi v1, v2, 16\n\t"
@@ -1300,13 +1311,131 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            "vle8.v v3, (%[mins])\n\t"
            "vzext.vf2 v4, v3\n\t"
            "vwmul.vv v6, v4, v2\n\t"
+            "vsetivli zero, 4, e32, m1, ta, ma\n\t"
+            "vredsum.vs v0, v6, v16\n\t"
+            "vredsum.vs v0, v7, v0\n\t"
+            "vfcvt.f.x.v v0, v0\n\t"
+            "vfmv.f.s %[ftmp], v0\n\t"
+            "vsetivli zero, 16, e8, m1, ta, ma\n\t"
+            "vle8.v v0, (%[xs])\n\t"
+            "fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
+            "addi %[q40], %[xs], 64\n\t"
+            "addi %[q41], %[xs], 16\n\t"
+            "addi %[q42], %[xs], 32\n\t"
+            "addi %[q43], %[xs], 48\n\t"
+            "addi %[q80], %[ys], 64\n\t"
+            "vle8.v v1, (%[q41])\n\t"
+            "vle8.v v2, (%[q42])\n\t"
+            "addi %[q81], %[ys], 16\n\t"
+            "addi %[q41], %[q41], 64\n\t"
+            "addi %[q82], %[ys], 32\n\t"
+            "vle8.v v3, (%[q43])\n\t"
+            "vle8.v v8, (%[ys])\n\t"
+            "addi %[q42], %[q42], 64\n\t"
+            "addi %[q83], %[ys], 48\n\t"
+            "addi %[q43], %[q43], 64\n\t"
+            "vsrl.vi v4, v0, 4\n\t"
+            "vle8.v v9, (%[q81])\n\t"
+            "vle8.v v10, (%[q82])\n\t"
+            "vand.vi v0, v0, 0xF\n\t"
+            "addi %[q81], %[q81], 64\n\t"
+            "vsrl.vi v5, v1, 4\n\t"
+            "addi %[q82], %[q82], 64\n\t"
+            "vle8.v v11, (%[q83])\n\t"
+            "vle8.v v12, (%[q80])\n\t"
+            "vand.vi v1, v1, 0xF\n\t"
+            "addi %[q83], %[q83], 64\n\t"
+            "vsrl.vi v6, v2, 4\n\t"
+            "addi %[q80], %[q80], 64\n\t"
+            "vle8.v v13, (%[q81])\n\t"
+            "vle8.v v14, (%[q82])\n\t"
+            "vand.vi v2, v2, 0xF\n\t"
+            "addi %[q81], %[q81], 64\n\t"
+            "vsrl.vi v7, v3, 4\n\t"
+            "addi %[q82], %[q82], 64\n\t"
+            "vwmul.vv v16, v0, v8\n\t"
+            "vle8.v v15, (%[q83])\n\t"
+            "vle8.v v0, (%[q40])\n\t"
+            "vand.vi v3, v3, 0xF\n\t"
+            "addi %[q83], %[q83], 64\n\t"
+            "vwmul.vv v24, v2, v12\n\t"
+            "vwmul.vv v20, v4, v10\n\t"
+            "vwmul.vv v28, v6, v14\n\t"
+            "vwmacc.vv v16, v1, v9\n\t"
+            "vle8.v v1, (%[q41])\n\t"
+            "vle8.v v2, (%[q42])\n\t"
+            "vwmacc.vv v24, v3, v13\n\t"
+            "vwmacc.vv v20, v5, v11\n\t"
+            "vwmacc.vv v28, v7, v15\n\t"
+            "addi %[q40], %[q80], 64\n\t"
+            "addi %[q41], %[q81], 64\n\t"
+            "vle8.v v3, (%[q43])\n\t"
+            "vle8.v v8, (%[q80])\n\t"
+            "addi %[q42], %[q82], 64\n\t"
+            "addi %[q43], %[q83], 64\n\t"
+            "vsrl.vi v4, v0, 4\n\t"
+            "vle8.v v9, (%[q81])\n\t"
+            "vle8.v v10, (%[q82])\n\t"
+            "vand.vi v0, v0, 0xF\n\t"
+            "vsrl.vi v5, v1, 4\n\t"
+            "vsrl.vi v7, v3, 4\n\t"
+            "vand.vi v3, v3, 0xF\n\t"
+            "vle8.v v11, (%[q83])\n\t"
+            "vle8.v v12, (%[q40])\n\t"
+            "vand.vi v1, v1, 0xF\n\t"
+            "vsrl.vi v6, v2, 4\n\t"
+            "vand.vi v2, v2, 0xF\n\t"
+            "vwmul.vv v18, v0, v8\n\t"
+            "vle8.v v13, (%[q41])\n\t"
+            "vle8.v v14, (%[q42])\n\t"
+            "vwmul.vv v26, v2, v12\n\t"
+            "vwmul.vv v22, v4, v10\n\t"
+            "vwmul.vv v30, v6, v14\n\t"
+            "vwmacc.vv v18, v1, v9\n\t"
+            "vle8.v v15, (%[q43])\n\t"
+            "vwmacc.vv v26, v3, v13\n\t"
+            "vwmacc.vv v22, v5, v11\n\t"
+            "vwmacc.vv v30, v7, v15\n\t"
            "vmv.v.x v0, zero\n\t"
-            "vsetivli zero, 8, e32, m2\n\t"
-            "vredsum.vs v0, v6, v0\n\t"
-            "vmv.x.s %[sumi], v0"
-            : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
-            : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
-            , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
+            "vsetivli zero, 16, e16, m2, ta, ma\n\t"
+            "vwredsum.vs v4, v16, v0\n\t"
+            "lbu %[s0], 0(%[scale])\n\t"
+            "vwredsum.vs v5, v20, v0\n\t"
+            "lbu %[s1], 1(%[scale])\n\t"
+            "vwredsum.vs v6, v24, v0\n\t"
+            "lbu %[s2], 2(%[scale])\n\t"
+            "vwredsum.vs v7, v28, v0\n\t"
+            "lbu %[s3], 3(%[scale])\n\t"
+            "vwredsum.vs v8, v18, v0\n\t"
+            "lbu %[q40], 4(%[scale])\n\t"
+            "vwredsum.vs v9, v22, v0\n\t"
+            "lbu %[q41], 5(%[scale])\n\t"
+            "vwredsum.vs v10, v26, v0\n\t"
+            "lbu %[q42], 6(%[scale])\n\t"
+            "vwredsum.vs v11, v30, v0\n\t"
+            "lbu %[q43], 7(%[scale])\n\t"
+            "vsetivli zero, 4, e32, m1, ta, ma\n\t"
+            "vmul.vx v0, v4, %[s0]\n\t"
+            "vmul.vx v1, v8, %[q40]\n\t"
+            "vmacc.vx v0, %[s1], v5\n\t"
+            "vmacc.vx v1, %[q41], v9\n\t"
+            "vmacc.vx v0, %[s2], v6\n\t"
+            "vmacc.vx v1, %[q42], v10\n\t"
+            "vmacc.vx v0, %[s3], v7\n\t"
+            "vmacc.vx v1, %[q43], v11\n\t"
+            "vfcvt.f.x.v v0, v0\n\t"
+            "vfcvt.f.x.v v1, v1\n\t"
+            "vfmv.f.s %[ft2], v0\n\t"
+            "vfmv.f.s %[ftmp], v1\n\t"
+            "fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
+            "fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
+            : [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
+            , [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
+            , [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
+            , [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
+            : [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
+            , [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
+            , [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
            , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
            : "memory"
            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
@@ -1314,59 +1443,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
        );
-        sumf -= dmin * sumi;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
-
-        sumi = 0;
-        const uint8_t * scale = scales;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            int vl128 = 128, vl64 = 64, vl32 = 32;
-            __asm__ __volatile__(
-                "vsetvli zero, %[vl128], e8, m8\n\t"
-                "vle8.v v8, (%[q8])\n\t"
-                "vsetvli zero, %[vl64], e8, m4\n\t"
-                "vle8.v v0, (%[q4])\n\t"
-                "vsrl.vi v4, v0, 4\n\t"
-                "vand.vi v0, v0, 0xF\n\t"
-                "vsetvli zero, %[vl32], e8, m2\n\t"
-                "vwmul.vv v28, v6, v14\n\t"
-                "vwmul.vv v20, v4, v10\n\t"
-                "vwmul.vv v24, v2, v12\n\t"
-                "vwmul.vv v16, v0, v8\n\t"
-                "vsetivli zero, 4, e32, m1\n\t"
-                "vle8.v v2, (%[scale])\n\t"
-                "vmv.v.x v0, zero\n\t"
-                "vzext.vf4 v1, v2\n\t"
-                "vsetvli zero, %[vl32], e16, m4\n\t"
-                "vwredsum.vs v6, v24, v0\n\t"
-                "vwredsum.vs v7, v28, v0\n\t"
-                "vwredsum.vs v4, v16, v0\n\t"
-                "vwredsum.vs v5, v20, v0\n\t"
-                "vsetivli zero, 4, e32, m1\n\t"
-                "vslideup.vi v6, v7, 1\n\t"
-                "vslideup.vi v4, v5, 1\n\t"
-                "vslideup.vi v4, v6, 2\n\t"
-                "vmul.vv v8, v4, v1\n\t"
-                "vredsum.vs v0, v8, v0\n\t"
-                "vmv.x.s %[tmp], v0\n\t"
-                "add %[sumi], %[sumi], %[tmp]"
-                : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
-                : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
-                , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
-                : "memory"
-                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
-                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
-                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
-            );
-
-            q4 += 64; q8 += 128; scale += 4;
-        }
-
-        sumf += d * sumi;
    }
    break;
 default:
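
Annotation: taken together, the two q4_K hunks above replace the old two-stage flow (one asm block for the mins correction, then a per-128-element C loop with its own asm block) with a single fully unrolled asm block per super-block; the mins correction is folded in with fnmsub.s, the scale-weighted dot products with fmadd.s, and the integer sumi round-trip through C disappears, with sumf updated entirely in-register.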
@@ -1693,6 +1769,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 case 128:
    for (int i = 0; i < nb; ++i) {
 
+        __builtin_prefetch(&x[i + 1].d, 0, 1);
+
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 
        const uint8_t * restrict q6 = x[i].ql;
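
Annotation: __builtin_prefetch(&x[i + 1].d, 0, 1) requests a read prefetch (second argument 0) of the next super-block's fp16 scale with low temporal locality (third argument 1), so the next iteration's GGML_CPU_FP16_TO_FP32 conversion is less likely to stall on a cache miss.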
@@ -1701,23 +1779,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
        const int8_t * restrict scale = x[i].scales;
 
-        int sum_t = 0;
-        int t0;
+        int q6h;
+        float ftmp;
 
        for (int j = 0; j < QK_K/128; ++j) {
            __asm__ __volatile__(
+                "addi %[q6h], %[q6], 32\n\t"
+                "ld t0, 0(%[scale])\n\t"
+                "addi %[scale], %[scale], 8\n\t"
+                "slli t6, t0, 1 * 8\n\t"
+                "lb zero, 0(%[q6])\n\t"
+                "slli t5, t0, 2 * 8\n\t"
+                "slli t4, t0, 3 * 8\n\t"
+                "lb zero, 0(%[q6h])\n\t"
+                "slli t3, t0, 4 * 8\n\t"
+                "slli t2, t0, 5 * 8\n\t"
+                "lb zero, 0(%[qh])\n\t"
+                "lb zero, 31(%[q6h])\n\t"
+                "slli t1, t0, 6 * 8\n\t"
+                "srai a7, t0, 56\n\t"
                "vsetvli zero, %[vl32], e8, m2\n\t"
+                "vle8.v v8, (%[q6])\n\t"
+                "srai t6, t6, 56\n\t"
+                "srai t5, t5, 56\n\t"
+                "srai t4, t4, 56\n\t"
+                "srai t3, t3, 56\n\t"
+                "vle8.v v10, (%[q6h])\n\t"
+                "addi %[q6], %[q6], 64\n\t"
+                "slli t0, t0, 7 * 8\n\t"
+                "srai t2, t2, 56\n\t"
+                "srai t1, t1, 56\n\t"
+                "srai t0, t0, 56\n\t"
                "vle8.v v4, (%[qh])\n\t"
+                "vsrl.vi v12, v8, 4\n\t"
+                "vsrl.vi v14, v10, 4\n\t"
+                "lb zero, 0(%[q8])\n\t"
+                "vand.vi v8, v8, 0xF\n\t"
+                "vand.vi v10, v10, 0xF\n\t"
+                "lb zero, 32(%[q8])\n\t"
                "vsll.vi v0, v4, 4\n\t"
                "vsll.vi v2, v4, 2\n\t"
+                "lb zero, 64(%[q8])\n\t"
                "vsrl.vi v6, v4, 2\n\t"
-                "vsetvli zero, %[vl64], e8, m4\n\t"
-                "vle8.v v8, (%[q6])\n\t"
-                "vsrl.vi v12, v8, 4\n\t"
-                "vand.vi v8, v8, 0xF\n\t"
-                "vsetvli zero, %[vl128], e8, m8\n\t"
                "vand.vx v0, v0, %[mask]\n\t"
+                "lb zero, 96(%[q8])\n\t"
+                "vand.vx v2, v2, %[mask]\n\t"
+                "vand.vx v4, v4, %[mask]\n\t"
+                "vand.vx v6, v6, %[mask]\n\t"
                "vor.vv v8, v8, v0\n\t"
+                "lb zero, 127(%[q8])\n\t"
+                "vor.vv v10, v10, v2\n\t"
+                "vor.vv v12, v12, v4\n\t"
+                "vor.vv v14, v14, v6\n\t"
+                "vsetvli zero, %[vl128], e8, m8\n\t"
                "vle8.v v0, (%[q8])\n\t"
                "vsub.vx v8, v8, %[vl32]\n\t"
                "vsetvli zero, %[vl64], e8, m4\n\t"
@@ -1734,34 +1848,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                "vwredsum.vs v13, v28, v0\n\t"
                "vwredsum.vs v14, v30, v0\n\t"
                "vsetivli zero, 4, e32, m1\n\t"
-                "vslideup.vi v10, v9, 1\n\t"
-                "vslideup.vi v8, v7, 1\n\t"
-                "vslideup.vi v11, v12, 1\n\t"
-                "vslideup.vi v13, v14, 1\n\t"
-                "vslideup.vi v10, v8, 2\n\t"
-                "vslideup.vi v11, v13, 2\n\t"
-                "vsetivli zero, 8, e32, m2\n\t"
-                "vle8.v v2, (%[scale])\n\t"
-                "vsext.vf4 v4, v2\n\t"
-                "vmul.vv v2, v4, v10\n\t"
-                "vredsum.vs v0, v2, v0\n\t"
-                "vmv.x.s %[t0], v0\n\t"
-                "add %[sumi], %[sumi], %[t0]"
-                : [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
-                : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
+                "vmul.vx v0, v10, t0\n\t"
+                "vmul.vx v1, v9, t1\n\t"
+                "vmacc.vx v0, t2, v8\n\t"
+                "vmacc.vx v1, t3, v7\n\t"
+                "vmacc.vx v0, t4, v11\n\t"
+                "vmacc.vx v1, t5, v12\n\t"
+                "vmacc.vx v0, t6, v13\n\t"
+                "vmacc.vx v1, a7, v14\n\t"
+                "vadd.vv v0, v0, v1\n\t"
+                "vfcvt.f.x.v v0, v0\n\t"
+                "vfmv.f.s %[ftmp], v0\n\t"
+                "fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
+                : [q6] "+&r" (q6), [q6h] "=&r" (q6h)
+                , [scale] "+&r" (scale)
+                , [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
+                : [qh] "r" (qh), [q8] "r" (q8)
                , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
-                , [mask] "r" (0x30)
+                , [mask] "r" (0x30), [d] "f" (d)
                : "memory"
                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+                , "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
+                , "a6", "a5", "a4", "a3"
            );
-            q6 += 64; qh += 32; q8 += 128; scale += 8;
+            qh += 32; q8 += 128;
        }
-
-        sumf += d * sum_t;
-
    }
    break;
 default:
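
Annotation: the rewritten q6_K kernel loads all eight int8 scales with a single ld and sign-extends each byte via slli/srai shift pairs into scalar registers, uses the "lb zero, <offset>(<reg>)" loads purely as cache-warming touches (a load into the zero register has no architectural effect), and accumulates with fmadd.s directly into sumf, dropping the intermediate sum_t integer accumulator and the post-loop d * sum_t multiply.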
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -68,12 +68,6 @@ struct ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__
 
-#if defined(__s390x__) && defined(GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && GGML_NNPA
-
 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
 #endif
@@ -489,7 +483,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
 /**
  * @see https://github.com/ggml-org/llama.cpp/pull/14037
  */
-inline float vec_hsum(float32x4_t v) {
+inline static float vec_hsum(float32x4_t v) {
     float32x4_t v_temp = v + vec_reve(v);
     return v_temp[0] + v_temp[1];
 }
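
Annotation: adding static here matters for a header-defined function; in C99/C11, a plain inline definition in a header provides no external definition in any translation unit, so a call the compiler chooses not to inline (e.g. at -O0) can fail to link with an undefined vec_hsum symbol, while inline static gives each translation unit its own local definition.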
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1876,6 +1876,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_im2col_back_f32(params, tensor);
            } break;
+        case GGML_OP_IM2COL_3D:
+            {
+                ggml_compute_forward_im2col_3d(params, tensor);
+            } break;
        case GGML_OP_CONV_2D:
            {
                ggml_compute_forward_conv_2d(params, tensor);
@@ -2255,6 +2259,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_IM2COL:
        case GGML_OP_IM2COL_BACK:
+        case GGML_OP_IM2COL_3D:
        case GGML_OP_CONV_2D:
        case GGML_OP_CONV_3D:
        case GGML_OP_CONV_2D_DW:
@@ -3206,20 +3211,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm_storel_epi64((__m128i *)(y + i), y_vec);
    }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
-        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
-        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
+        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
    }
 #endif
    for (; i < n; ++i) {
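
Annotation: the new __riscv_zvfh path strip-mines the conversion; __riscv_vsetvl_e32m2(n - i) returns how many 32-bit elements this pass will handle (vl), __riscv_vfncvt_f_f_w_f16m1 narrows f32 to f16 in one instruction, and because vsetvl also covers the final partial pass, the scalar loop below does no work on builds where this branch is taken.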
@@ -3247,21 +3244,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
        __m128 y_vec = _mm_cvtph_ps(x_vec);
        _mm_storeu_ps(y + i, y_vec);
    }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif
 
    for (; i < n; ++i) {
@@ -3465,14 +3447,6 @@ int ggml_cpu_has_vxe(void) {
 #endif
 }
 
-int ggml_cpu_has_nnpa(void) {
-#if defined(GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;