@fugood/llama.node 1.1.11 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +250 -1
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  39. package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
  40. package/src/llama.cpp/ggml/include/ggml.h +56 -2
  41. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  43. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  53. package/src/llama.cpp/include/llama.h +5 -6
  54. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  55. package/src/llama.cpp/src/llama-adapter.h +3 -0
  56. package/src/llama.cpp/src/llama-arch.cpp +28 -4
  57. package/src/llama.cpp/src/llama-arch.h +3 -0
  58. package/src/llama.cpp/src/llama-context.cpp +65 -57
  59. package/src/llama.cpp/src/llama-context.h +1 -1
  60. package/src/llama.cpp/src/llama-graph.cpp +57 -11
  61. package/src/llama.cpp/src/llama-graph.h +8 -0
  62. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  63. package/src/llama.cpp/src/llama-hparams.h +10 -3
  64. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
  65. package/src/llama.cpp/src/llama-kv-cache.h +9 -0
  66. package/src/llama.cpp/src/llama-model.cpp +217 -97
  67. package/src/llama.cpp/src/llama-model.h +0 -1
  68. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  69. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  70. package/src/llama.cpp/src/llama.cpp +53 -10
  71. package/src/anyascii.c +0 -22223
  72. package/src/anyascii.h +0 -42
  73. package/src/tts_utils.cpp +0 -371
  74. package/src/tts_utils.h +0 -103
@@ -101,7 +101,6 @@ extern "C" {
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
- GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
@@ -135,6 +134,7 @@ extern "C" {
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

  GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
+ GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
  GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
  GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
  GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
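
A minimal sketch of calling the new f32-to-i32 conversion helper directly; the expected output assumes a plain per-element C cast, matching the "discard the fractional part" note on ggml_cast further down (function name and signature are from the hunk above, the demo function is illustrative):

    #include <stdint.h>
    #include "ggml-cpu.h"

    void fp32_to_i32_demo(void) {
        const float src[4] = {1.9f, -2.5f, 3.0f, 0.1f};
        int32_t dst[4];
        // assumed truncation toward zero: dst = {1, -2, 3, 0}
        ggml_cpu_fp32_to_i32(src, dst, 4);
    }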
@@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);

  GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

- GGML_DEPRECATED(
- GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
- "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
-
  GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

- GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
  // helper to check if the device supports a specific family
  // ideally, the user code should be doing these checks
  // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
@@ -511,6 +511,7 @@ extern "C" {
  GGML_OP_CONV_TRANSPOSE_1D,
  GGML_OP_IM2COL,
  GGML_OP_IM2COL_BACK,
+ GGML_OP_IM2COL_3D,
  GGML_OP_CONV_2D,
  GGML_OP_CONV_3D,
  GGML_OP_CONV_2D_DW,
@@ -1403,6 +1404,7 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ // note: casting from f32 to i32 will discard the fractional part
  GGML_API struct ggml_tensor * ggml_cast(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
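
The new note documents I32 as a cast target. A minimal graph-node sketch (the helper name is illustrative; ggml_cast and GGML_TYPE_I32 are existing ggml API):

    #include "ggml.h"

    // build a node that casts an f32 tensor to i32;
    // per the note above, values are truncated (e.g. 1.9f -> 1)
    struct ggml_tensor * cast_to_i32(struct ggml_context * ctx, struct ggml_tensor * a_f32) {
        return ggml_cast(ctx, a_f32, GGML_TYPE_I32);
    }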
@@ -1527,7 +1529,11 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // supports 3D: a->ne[2] == b->ne[1]
+ // supports 4D a:
+ // a [n_embd, ne1, ne2, ne3]
+ // b I32 [n_rows, ne2, ne3, 1]
+ //
+ // return [n_embd, n_rows, ne2, ne3]
  GGML_API struct ggml_tensor * ggml_get_rows(
  struct ggml_context * ctx,
  struct ggml_tensor * a, // data
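
ggml_get_rows previously required a->ne[2] == b->ne[1]; it now gathers per (ne2, ne3) slice of a full 4D tensor. A shape-only sketch of the widened contract (the helper name is illustrative; shapes follow the new comment):

    #include "ggml.h"

    struct ggml_tensor * gather_rows_4d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,   // [n_embd, ne1, ne2, ne3], data
            struct ggml_tensor  * b) { // [n_rows, ne2, ne3, 1], GGML_TYPE_I32 indices
        return ggml_get_rows(ctx, a, b); // -> [n_embd, n_rows, ne2, ne3]
    }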
@@ -1870,6 +1876,41 @@ extern "C" {
  int d0, // dilation dimension 0
  int d1); // dilation dimension 1

+ GGML_API struct ggml_tensor * ggml_im2col_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int64_t IC,
+ int s0, // stride width
+ int s1, // stride height
+ int s2, // stride depth
+ int p0, // padding width
+ int p1, // padding height
+ int p2, // padding depth
+ int d0, // dilation width
+ int d1, // dilation height
+ int d2, // dilation depth
+ enum ggml_type dst_type);
+
+ // a: [OC*IC, KD, KH, KW]
+ // b: [N*IC, ID, IH, IW]
+ // result: [N*OC, OD, OH, OW]
+ GGML_API struct ggml_tensor * ggml_conv_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int64_t IC,
+ int s0, // stride width
+ int s1, // stride height
+ int s2, // stride depth
+ int p0, // padding width
+ int p1, // padding height
+ int p2, // padding depth
+ int d0, // dilation width
+ int d1, // dilation height
+ int d2 // dilation depth
+ );
+
  // kernel size is a->ne[0] x a->ne[1]
  // stride is equal to kernel size
  // padding is zero
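
A hedged usage sketch of the new im2col-based 3D convolution (the helper name and parameter choices are illustrative; the kernel/input shapes must follow the header comments above):

    #include "ggml.h"

    struct ggml_tensor * conv3d_node(struct ggml_context * ctx,
                                     struct ggml_tensor  * kernel, // a: [OC*IC, KD, KH, KW]
                                     struct ggml_tensor  * input,  // b: [N*IC, ID, IH, IW]
                                     int64_t IC) {
        return ggml_conv_3d(ctx, kernel, input, IC,
                            /* s0, s1, s2 */ 1, 1, 1,  // unit stride in W/H/D
                            /* p0, p1, p2 */ 0, 0, 0,  // no padding
                            /* d0, d1, d2 */ 1, 1, 1); // no dilation
    }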
@@ -1941,7 +1982,7 @@ extern "C" {
  int d0, // dilation dimension 0
  int d1); // dilation dimension 1

- GGML_API struct ggml_tensor * ggml_conv_3d(
+ GGML_API struct ggml_tensor * ggml_conv_3d_direct(
  struct ggml_context * ctx,
  struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
  struct ggml_tensor * b, // input [W, H, D, C * N]
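
Note this is a rename, not a removal: the direct-evaluation kernel keeps its signature as ggml_conv_3d_direct, freeing the ggml_conv_3d name for the im2col-based path added above. Downstream callers of the old ggml_conv_3d need only the one-line rename.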
@@ -2048,6 +2089,19 @@ extern "C" {
  int p2,
  int p3);

+ GGML_API struct ggml_tensor * ggml_pad_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int lp0,
+ int rp0,
+ int lp1,
+ int rp1,
+ int lp2,
+ int rp2,
+ int lp3,
+ int rp3
+ );
+
  // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
  GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
  struct ggml_context * ctx,
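
ggml_pad_ext generalizes ggml_pad, which only appends zeros at the upper end of each dimension, by taking separate leading (lp*) and trailing (rp*) pad counts. A minimal sketch (the helper name is illustrative; the argument roles are inferred from the lp*/rp* names):

    #include "ggml.h"

    // pad one element before and two after dim 0, leaving dims 1..3 untouched
    struct ggml_tensor * pad_dim0(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_pad_ext(ctx, a,
                            /* lp0, rp0 */ 1, 2,
                            /* lp1, rp1 */ 0, 0,
                            /* lp2, rp2 */ 0, 0,
                            /* lp3, rp3 */ 0, 0);
    }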
@@ -224,7 +224,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
  string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
  if (NOT ${feature_pos} EQUAL -1)
- message(STATUS "ARM feature ${feature} enabled")
+ # Special handling for MATMUL_INT8 when machine doesn't support i8mm
+ if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
+ message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
+ list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
+ else()
+ message(STATUS "ARM feature ${feature} enabled")
+ endif()
  endif()
  endforeach()
  endif()
@@ -433,15 +439,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  ggml-cpu/arch/riscv/quants.c
  ggml-cpu/arch/riscv/repack.cpp
  )
- if (GGML_RVV)
- if (GGML_XTHEADVECTOR)
- list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
- elseif (GGML_RV_ZFH)
- list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
- else()
- list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+ set(MARCH_STR "rv64gc")
+ if (GGML_RV_ZFH)
+ string(APPEND MARCH_STR "_zfh")
+ endif()
+ if (GGML_XTHEADVECTOR)
+ string(APPEND MARCH_STR "_xtheadvector")
+ elseif (GGML_RVV)
+ string(APPEND MARCH_STR "_v")
+ if (GGML_RV_ZVFH)
+ string(APPEND MARCH_STR "_zvfh")
  endif()
  endif()
+ if (GGML_RV_ZICBOP)
+ string(APPEND MARCH_STR "_zicbop")
+ endif()
+ list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
  elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
  message(STATUS "s390x detected")
  list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
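
The RISC-V -march string is now composed incrementally instead of being picked from three fixed variants: for example, enabling GGML_RV_ZFH, GGML_RVV, GGML_RV_ZVFH, and GGML_RV_ZICBOP together yields -march=rv64gc_zfh_v_zvfh_zicbop -mabi=lp64d. Note that the old zfhmin extension is replaced by full zfh, and zvfh and zicbop gain dedicated toggles.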
@@ -450,7 +463,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  # TODO: Separation to determine activation of VX/VXE/VXE2
  if (${S390X_M} MATCHES "8561|8562")
- set(GGML_NNPA OFF)
  message(STATUS "z15 target")
  list(APPEND ARCH_FLAGS -march=z15)
  elseif (${S390X_M} MATCHES "3931")
@@ -472,11 +484,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  list(APPEND ARCH_FLAGS -mvx -mzvector)
  list(APPEND ARCH_DEFINITIONS GGML_VXE)
  endif()
-
- if (GGML_NNPA)
- message(STATUS "NNPA enabled")
- list(APPEND ARCH_DEFINITIONS GGML_NNPA)
- endif()
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
  message(STATUS "Wasm detected")
  list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
@@ -1270,29 +1270,40 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
  const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

- int tmp, tmp2, sumi;
+ float ftmp, ft2;
+ const uint8_t * restrict q40;
+ const uint8_t * restrict q41;
+ const uint8_t * restrict q42;
+ const uint8_t * restrict q43;
+ const int8_t * restrict q80;
+ const int8_t * restrict q81;
+ const int8_t * restrict q82;
+ const int8_t * restrict q83;
+ int s0, s1, s2, s3;
+
  __asm__ __volatile__(
- "vsetivli zero, 12, e8, m1\n\t"
- "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
- "vsetivli zero, 4, e32, m1\n\t"
+ "li %[s1], 8\n\t"
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
+ "vle32.v v1, (%[s6b])\n\t"
+ "vslide1down.vx v1, v1, zero\n\t"
+ "vmv.v.x v16, zero\n\t"
  "vslidedown.vi v2, v1, 2\n\t"
  "vmv1r.v v3, v2\n\t"
  "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
- "vsetivli zero, 2, e32, m1\n\t"
+ "vsetivli zero, 2, e32, m1, ta, ma\n\t"
  "vmv.v.i v4, 4\n\t"
  "vand.vx v8, v1, %[kmask1]\n\t"
  "vslide1up.vx v5, v4, zero\n\t" // {0, 4}
  "vsrl.vi v6, v1, 6\n\t"
  "vsrl.vv v7, v2, v5\n\t"
+ "vsse32.v v8, (%[utmp]), %[s1]\n\t"
  "vand.vx v0, v6, %[kmask3]\n\t"
  "vand.vx v2, v7, %[kmask2]\n\t"
  "vsll.vi v6, v0, 4\n\t"
- "li %[t2], 8\n\t"
- "addi %[t1], %[utmp], 4\n\t"
+ "addi %[s0], %[utmp], 4\n\t"
  "vor.vv v1, v6, v2\n\t"
- "vsse32.v v8, (%[utmp]), %[t2]\n\t"
- "vsse32.v v1, (%[t1]), %[t2]\n\t"
- "vsetivli zero, 8, e16, m1\n\t"
+ "vsse32.v v1, (%[s0]), %[s1]\n\t"
+ "vsetivli zero, 8, e16, m1, ta, ma\n\t"
  "vle32.v v2, (%[bsums])\n\t"
  "vnsrl.wi v0, v2, 0\n\t"
  "vnsrl.wi v1, v2, 16\n\t"
@@ -1300,13 +1311,131 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  "vle8.v v3, (%[mins])\n\t"
  "vzext.vf2 v4, v3\n\t"
  "vwmul.vv v6, v4, v2\n\t"
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
+ "vredsum.vs v0, v6, v16\n\t"
+ "vredsum.vs v0, v7, v0\n\t"
+ "vfcvt.f.x.v v0, v0\n\t"
+ "vfmv.f.s %[ftmp], v0\n\t"
+ "vsetivli zero, 16, e8, m1, ta, ma\n\t"
+ "vle8.v v0, (%[xs])\n\t"
+ "fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
+ "addi %[q40], %[xs], 64\n\t"
+ "addi %[q41], %[xs], 16\n\t"
+ "addi %[q42], %[xs], 32\n\t"
+ "addi %[q43], %[xs], 48\n\t"
+ "addi %[q80], %[ys], 64\n\t"
+ "vle8.v v1, (%[q41])\n\t"
+ "vle8.v v2, (%[q42])\n\t"
+ "addi %[q81], %[ys], 16\n\t"
+ "addi %[q41], %[q41], 64\n\t"
+ "addi %[q82], %[ys], 32\n\t"
+ "vle8.v v3, (%[q43])\n\t"
+ "vle8.v v8, (%[ys])\n\t"
+ "addi %[q42], %[q42], 64\n\t"
+ "addi %[q83], %[ys], 48\n\t"
+ "addi %[q43], %[q43], 64\n\t"
+ "vsrl.vi v4, v0, 4\n\t"
+ "vle8.v v9, (%[q81])\n\t"
+ "vle8.v v10, (%[q82])\n\t"
+ "vand.vi v0, v0, 0xF\n\t"
+ "addi %[q81], %[q81], 64\n\t"
+ "vsrl.vi v5, v1, 4\n\t"
+ "addi %[q82], %[q82], 64\n\t"
+ "vle8.v v11, (%[q83])\n\t"
+ "vle8.v v12, (%[q80])\n\t"
+ "vand.vi v1, v1, 0xF\n\t"
+ "addi %[q83], %[q83], 64\n\t"
+ "vsrl.vi v6, v2, 4\n\t"
+ "addi %[q80], %[q80], 64\n\t"
+ "vle8.v v13, (%[q81])\n\t"
+ "vle8.v v14, (%[q82])\n\t"
+ "vand.vi v2, v2, 0xF\n\t"
+ "addi %[q81], %[q81], 64\n\t"
+ "vsrl.vi v7, v3, 4\n\t"
+ "addi %[q82], %[q82], 64\n\t"
+ "vwmul.vv v16, v0, v8\n\t"
+ "vle8.v v15, (%[q83])\n\t"
+ "vle8.v v0, (%[q40])\n\t"
+ "vand.vi v3, v3, 0xF\n\t"
+ "addi %[q83], %[q83], 64\n\t"
+ "vwmul.vv v24, v2, v12\n\t"
+ "vwmul.vv v20, v4, v10\n\t"
+ "vwmul.vv v28, v6, v14\n\t"
+ "vwmacc.vv v16, v1, v9\n\t"
+ "vle8.v v1, (%[q41])\n\t"
+ "vle8.v v2, (%[q42])\n\t"
+ "vwmacc.vv v24, v3, v13\n\t"
+ "vwmacc.vv v20, v5, v11\n\t"
+ "vwmacc.vv v28, v7, v15\n\t"
+ "addi %[q40], %[q80], 64\n\t"
+ "addi %[q41], %[q81], 64\n\t"
+ "vle8.v v3, (%[q43])\n\t"
+ "vle8.v v8, (%[q80])\n\t"
+ "addi %[q42], %[q82], 64\n\t"
+ "addi %[q43], %[q83], 64\n\t"
+ "vsrl.vi v4, v0, 4\n\t"
+ "vle8.v v9, (%[q81])\n\t"
+ "vle8.v v10, (%[q82])\n\t"
+ "vand.vi v0, v0, 0xF\n\t"
+ "vsrl.vi v5, v1, 4\n\t"
+ "vsrl.vi v7, v3, 4\n\t"
+ "vand.vi v3, v3, 0xF\n\t"
+ "vle8.v v11, (%[q83])\n\t"
+ "vle8.v v12, (%[q40])\n\t"
+ "vand.vi v1, v1, 0xF\n\t"
+ "vsrl.vi v6, v2, 4\n\t"
+ "vand.vi v2, v2, 0xF\n\t"
+ "vwmul.vv v18, v0, v8\n\t"
+ "vle8.v v13, (%[q41])\n\t"
+ "vle8.v v14, (%[q42])\n\t"
+ "vwmul.vv v26, v2, v12\n\t"
+ "vwmul.vv v22, v4, v10\n\t"
+ "vwmul.vv v30, v6, v14\n\t"
+ "vwmacc.vv v18, v1, v9\n\t"
+ "vle8.v v15, (%[q43])\n\t"
+ "vwmacc.vv v26, v3, v13\n\t"
+ "vwmacc.vv v22, v5, v11\n\t"
+ "vwmacc.vv v30, v7, v15\n\t"
  "vmv.v.x v0, zero\n\t"
- "vsetivli zero, 8, e32, m2\n\t"
- "vredsum.vs v0, v6, v0\n\t"
- "vmv.x.s %[sumi], v0"
- : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
- : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
- , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
+ "vsetivli zero, 16, e16, m2, ta, ma\n\t"
+ "vwredsum.vs v4, v16, v0\n\t"
+ "lbu %[s0], 0(%[scale])\n\t"
+ "vwredsum.vs v5, v20, v0\n\t"
+ "lbu %[s1], 1(%[scale])\n\t"
+ "vwredsum.vs v6, v24, v0\n\t"
+ "lbu %[s2], 2(%[scale])\n\t"
+ "vwredsum.vs v7, v28, v0\n\t"
+ "lbu %[s3], 3(%[scale])\n\t"
+ "vwredsum.vs v8, v18, v0\n\t"
+ "lbu %[q40], 4(%[scale])\n\t"
+ "vwredsum.vs v9, v22, v0\n\t"
+ "lbu %[q41], 5(%[scale])\n\t"
+ "vwredsum.vs v10, v26, v0\n\t"
+ "lbu %[q42], 6(%[scale])\n\t"
+ "vwredsum.vs v11, v30, v0\n\t"
+ "lbu %[q43], 7(%[scale])\n\t"
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
+ "vmul.vx v0, v4, %[s0]\n\t"
+ "vmul.vx v1, v8, %[q40]\n\t"
+ "vmacc.vx v0, %[s1], v5\n\t"
+ "vmacc.vx v1, %[q41], v9\n\t"
+ "vmacc.vx v0, %[s2], v6\n\t"
+ "vmacc.vx v1, %[q42], v10\n\t"
+ "vmacc.vx v0, %[s3], v7\n\t"
+ "vmacc.vx v1, %[q43], v11\n\t"
+ "vfcvt.f.x.v v0, v0\n\t"
+ "vfcvt.f.x.v v1, v1\n\t"
+ "vfmv.f.s %[ft2], v0\n\t"
+ "vfmv.f.s %[ftmp], v1\n\t"
+ "fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
+ "fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
+ : [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
+ , [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
+ , [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
+ , [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
+ : [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
+ , [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
+ , [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
  , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
  : "memory"
  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
@@ -1314,59 +1443,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
  , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
  );
- sumf -= dmin * sumi;
-
- const uint8_t * restrict q4 = x[i].qs;
- const int8_t * restrict q8 = y[i].qs;
-
- sumi = 0;
- const uint8_t * scale = scales;
-
- for (int j = 0; j < QK_K/128; ++j) {
- int vl128 = 128, vl64 = 64, vl32 = 32;
- __asm__ __volatile__(
- "vsetvli zero, %[vl128], e8, m8\n\t"
- "vle8.v v8, (%[q8])\n\t"
- "vsetvli zero, %[vl64], e8, m4\n\t"
- "vle8.v v0, (%[q4])\n\t"
- "vsrl.vi v4, v0, 4\n\t"
- "vand.vi v0, v0, 0xF\n\t"
- "vsetvli zero, %[vl32], e8, m2\n\t"
- "vwmul.vv v28, v6, v14\n\t"
- "vwmul.vv v20, v4, v10\n\t"
- "vwmul.vv v24, v2, v12\n\t"
- "vwmul.vv v16, v0, v8\n\t"
- "vsetivli zero, 4, e32, m1\n\t"
- "vle8.v v2, (%[scale])\n\t"
- "vmv.v.x v0, zero\n\t"
- "vzext.vf4 v1, v2\n\t"
- "vsetvli zero, %[vl32], e16, m4\n\t"
- "vwredsum.vs v6, v24, v0\n\t"
- "vwredsum.vs v7, v28, v0\n\t"
- "vwredsum.vs v4, v16, v0\n\t"
- "vwredsum.vs v5, v20, v0\n\t"
- "vsetivli zero, 4, e32, m1\n\t"
- "vslideup.vi v6, v7, 1\n\t"
- "vslideup.vi v4, v5, 1\n\t"
- "vslideup.vi v4, v6, 2\n\t"
- "vmul.vv v8, v4, v1\n\t"
- "vredsum.vs v0, v8, v0\n\t"
- "vmv.x.s %[tmp], v0\n\t"
- "add %[sumi], %[sumi], %[tmp]"
- : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
- : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
- , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
- : "memory"
- , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
- , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
- , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
- );
-
- q4 += 64; q8 += 128; scale += 4;
- }
-
- sumf += d * sumi;
  }
  break;
  default:
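
The rewrite above fuses the two q4_K passes, the mins-times-bsums correction and the per-128-byte quant loop, into a single unrolled asm block: partial sums stay in vector registers, are converted to float with vfcvt.f.x.v, and are folded into sumf via fnmsub.s/fmadd.s, eliminating the scalar sumi round trips between the old blocks.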
@@ -1693,6 +1769,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  case 128:
  for (int i = 0; i < nb; ++i) {

+ __builtin_prefetch(&x[i + 1].d, 0, 1);
+
  const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;

  const uint8_t * restrict q6 = x[i].ql;
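
The added __builtin_prefetch(&x[i + 1].d, 0, 1) requests a read prefetch (rw = 0) with low temporal locality (locality = 1) of the next superblock's scales, so they are likely in cache by the time the following iteration's scalar loads need them.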
@@ -1701,23 +1779,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

  const int8_t * restrict scale = x[i].scales;

- int sum_t = 0;
- int t0;
+ int q6h;
+ float ftmp;

  for (int j = 0; j < QK_K/128; ++j) {
  __asm__ __volatile__(
+ "addi %[q6h], %[q6], 32\n\t"
+ "ld t0, 0(%[scale])\n\t"
+ "addi %[scale], %[scale], 8\n\t"
+ "slli t6, t0, 1 * 8\n\t"
+ "lb zero, 0(%[q6])\n\t"
+ "slli t5, t0, 2 * 8\n\t"
+ "slli t4, t0, 3 * 8\n\t"
+ "lb zero, 0(%[q6h])\n\t"
+ "slli t3, t0, 4 * 8\n\t"
+ "slli t2, t0, 5 * 8\n\t"
+ "lb zero, 0(%[qh])\n\t"
+ "lb zero, 31(%[q6h])\n\t"
+ "slli t1, t0, 6 * 8\n\t"
+ "srai a7, t0, 56\n\t"
  "vsetvli zero, %[vl32], e8, m2\n\t"
+ "vle8.v v8, (%[q6])\n\t"
+ "srai t6, t6, 56\n\t"
+ "srai t5, t5, 56\n\t"
+ "srai t4, t4, 56\n\t"
+ "srai t3, t3, 56\n\t"
+ "vle8.v v10, (%[q6h])\n\t"
+ "addi %[q6], %[q6], 64\n\t"
+ "slli t0, t0, 7 * 8\n\t"
+ "srai t2, t2, 56\n\t"
+ "srai t1, t1, 56\n\t"
+ "srai t0, t0, 56\n\t"
  "vle8.v v4, (%[qh])\n\t"
+ "vsrl.vi v12, v8, 4\n\t"
+ "vsrl.vi v14, v10, 4\n\t"
+ "lb zero, 0(%[q8])\n\t"
+ "vand.vi v8, v8, 0xF\n\t"
+ "vand.vi v10, v10, 0xF\n\t"
+ "lb zero, 32(%[q8])\n\t"
  "vsll.vi v0, v4, 4\n\t"
  "vsll.vi v2, v4, 2\n\t"
+ "lb zero, 64(%[q8])\n\t"
  "vsrl.vi v6, v4, 2\n\t"
- "vsetvli zero, %[vl64], e8, m4\n\t"
- "vle8.v v8, (%[q6])\n\t"
- "vsrl.vi v12, v8, 4\n\t"
- "vand.vi v8, v8, 0xF\n\t"
- "vsetvli zero, %[vl128], e8, m8\n\t"
  "vand.vx v0, v0, %[mask]\n\t"
+ "lb zero, 96(%[q8])\n\t"
+ "vand.vx v2, v2, %[mask]\n\t"
+ "vand.vx v4, v4, %[mask]\n\t"
+ "vand.vx v6, v6, %[mask]\n\t"
  "vor.vv v8, v8, v0\n\t"
+ "lb zero, 127(%[q8])\n\t"
+ "vor.vv v10, v10, v2\n\t"
+ "vor.vv v12, v12, v4\n\t"
+ "vor.vv v14, v14, v6\n\t"
+ "vsetvli zero, %[vl128], e8, m8\n\t"
  "vle8.v v0, (%[q8])\n\t"
  "vsub.vx v8, v8, %[vl32]\n\t"
  "vsetvli zero, %[vl64], e8, m4\n\t"
@@ -1734,34 +1848,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  "vwredsum.vs v13, v28, v0\n\t"
  "vwredsum.vs v14, v30, v0\n\t"
  "vsetivli zero, 4, e32, m1\n\t"
- "vslideup.vi v10, v9, 1\n\t"
- "vslideup.vi v8, v7, 1\n\t"
- "vslideup.vi v11, v12, 1\n\t"
- "vslideup.vi v13, v14, 1\n\t"
- "vslideup.vi v10, v8, 2\n\t"
- "vslideup.vi v11, v13, 2\n\t"
- "vsetivli zero, 8, e32, m2\n\t"
- "vle8.v v2, (%[scale])\n\t"
- "vsext.vf4 v4, v2\n\t"
- "vmul.vv v2, v4, v10\n\t"
- "vredsum.vs v0, v2, v0\n\t"
- "vmv.x.s %[t0], v0\n\t"
- "add %[sumi], %[sumi], %[t0]"
- : [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
- : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
+ "vmul.vx v0, v10, t0\n\t"
+ "vmul.vx v1, v9, t1\n\t"
+ "vmacc.vx v0, t2, v8\n\t"
+ "vmacc.vx v1, t3, v7\n\t"
+ "vmacc.vx v0, t4, v11\n\t"
+ "vmacc.vx v1, t5, v12\n\t"
+ "vmacc.vx v0, t6, v13\n\t"
+ "vmacc.vx v1, a7, v14\n\t"
+ "vadd.vv v0, v0, v1\n\t"
+ "vfcvt.f.x.v v0, v0\n\t"
+ "vfmv.f.s %[ftmp], v0\n\t"
+ "fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
+ : [q6] "+&r" (q6), [q6h] "=&r" (q6h)
+ , [scale] "+&r" (scale)
+ , [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
+ : [qh] "r" (qh), [q8] "r" (q8)
  , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
- , [mask] "r" (0x30)
+ , [mask] "r" (0x30), [d] "f" (d)
  : "memory"
  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
  , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
  , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ , "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
+ , "a6", "a5", "a4", "a3"
  );
- q6 += 64; qh += 32; q8 += 128; scale += 8;
+ qh += 32; q8 += 128;
  }
-
- sumf += d * sum_t;
-
  }
  break;
  default:
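
A note on the q6_K rewrite: the lb zero, ...(%[q6]) / ...(%[q8]) instructions load into the hard-wired zero register, so they act purely as software prefetches for the cache lines the later vector loads will touch, and the eight per-block int8 scales are now unpacked from one 64-bit ld via slli/srai shift pairs into scalar registers instead of a vector load, letting the reductions finish with vmul.vx/vmacc.vx and a single fmadd.s into sumf.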