@fugood/llama.node 1.1.11 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
|
@@ -101,7 +101,6 @@ extern "C" {
|
|
|
101
101
|
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
|
102
102
|
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
|
103
103
|
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
|
|
104
|
-
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
|
|
105
104
|
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
|
106
105
|
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
|
107
106
|
|
|
@@ -135,6 +134,7 @@ extern "C" {
|
|
|
135
134
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
136
135
|
|
|
137
136
|
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
|
|
137
|
+
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
|
|
138
138
|
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
|
|
139
139
|
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
|
|
140
140
|
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
|
|
@@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
|
|
|
43
43
|
|
|
44
44
|
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
|
45
45
|
|
|
46
|
-
GGML_DEPRECATED(
|
|
47
|
-
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
|
48
|
-
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
|
|
49
|
-
|
|
50
46
|
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
|
51
47
|
|
|
52
|
-
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
|
53
|
-
|
|
54
48
|
// helper to check if the device supports a specific family
|
|
55
49
|
// ideally, the user code should be doing these checks
|
|
56
50
|
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
|
@@ -511,6 +511,7 @@ extern "C" {
|
|
|
511
511
|
GGML_OP_CONV_TRANSPOSE_1D,
|
|
512
512
|
GGML_OP_IM2COL,
|
|
513
513
|
GGML_OP_IM2COL_BACK,
|
|
514
|
+
GGML_OP_IM2COL_3D,
|
|
514
515
|
GGML_OP_CONV_2D,
|
|
515
516
|
GGML_OP_CONV_3D,
|
|
516
517
|
GGML_OP_CONV_2D_DW,
|
|
@@ -1403,6 +1404,7 @@ extern "C" {
|
|
|
1403
1404
|
struct ggml_tensor * a,
|
|
1404
1405
|
struct ggml_tensor * b);
|
|
1405
1406
|
|
|
1407
|
+
// note: casting from f32 to i32 will discard the fractional part
|
|
1406
1408
|
GGML_API struct ggml_tensor * ggml_cast(
|
|
1407
1409
|
struct ggml_context * ctx,
|
|
1408
1410
|
struct ggml_tensor * a,
|
|
@@ -1527,7 +1529,11 @@ extern "C" {
|
|
|
1527
1529
|
struct ggml_context * ctx,
|
|
1528
1530
|
struct ggml_tensor * a);
|
|
1529
1531
|
|
|
1530
|
-
// supports
|
|
1532
|
+
// supports 4D a:
|
|
1533
|
+
// a [n_embd, ne1, ne2, ne3]
|
|
1534
|
+
// b I32 [n_rows, ne2, ne3, 1]
|
|
1535
|
+
//
|
|
1536
|
+
// return [n_embd, n_rows, ne2, ne3]
|
|
1531
1537
|
GGML_API struct ggml_tensor * ggml_get_rows(
|
|
1532
1538
|
struct ggml_context * ctx,
|
|
1533
1539
|
struct ggml_tensor * a, // data
|
|
@@ -1870,6 +1876,41 @@ extern "C" {
|
|
|
1870
1876
|
int d0, // dilation dimension 0
|
|
1871
1877
|
int d1); // dilation dimension 1
|
|
1872
1878
|
|
|
1879
|
+
GGML_API struct ggml_tensor * ggml_im2col_3d(
|
|
1880
|
+
struct ggml_context * ctx,
|
|
1881
|
+
struct ggml_tensor * a,
|
|
1882
|
+
struct ggml_tensor * b,
|
|
1883
|
+
int64_t IC,
|
|
1884
|
+
int s0, // stride width
|
|
1885
|
+
int s1, // stride height
|
|
1886
|
+
int s2, // stride depth
|
|
1887
|
+
int p0, // padding width
|
|
1888
|
+
int p1, // padding height
|
|
1889
|
+
int p2, // padding depth
|
|
1890
|
+
int d0, // dilation width
|
|
1891
|
+
int d1, // dilation height
|
|
1892
|
+
int d2, // dilation depth
|
|
1893
|
+
enum ggml_type dst_type);
|
|
1894
|
+
|
|
1895
|
+
// a: [OC*IC, KD, KH, KW]
|
|
1896
|
+
// b: [N*IC, ID, IH, IW]
|
|
1897
|
+
// result: [N*OC, OD, OH, OW]
|
|
1898
|
+
GGML_API struct ggml_tensor * ggml_conv_3d(
|
|
1899
|
+
struct ggml_context * ctx,
|
|
1900
|
+
struct ggml_tensor * a,
|
|
1901
|
+
struct ggml_tensor * b,
|
|
1902
|
+
int64_t IC,
|
|
1903
|
+
int s0, // stride width
|
|
1904
|
+
int s1, // stride height
|
|
1905
|
+
int s2, // stride depth
|
|
1906
|
+
int p0, // padding width
|
|
1907
|
+
int p1, // padding height
|
|
1908
|
+
int p2, // padding depth
|
|
1909
|
+
int d0, // dilation width
|
|
1910
|
+
int d1, // dilation height
|
|
1911
|
+
int d2 // dilation depth
|
|
1912
|
+
);
|
|
1913
|
+
|
|
1873
1914
|
// kernel size is a->ne[0] x a->ne[1]
|
|
1874
1915
|
// stride is equal to kernel size
|
|
1875
1916
|
// padding is zero
|
|
@@ -1941,7 +1982,7 @@ extern "C" {
|
|
|
1941
1982
|
int d0, // dilation dimension 0
|
|
1942
1983
|
int d1); // dilation dimension 1
|
|
1943
1984
|
|
|
1944
|
-
GGML_API struct ggml_tensor *
|
|
1985
|
+
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
|
|
1945
1986
|
struct ggml_context * ctx,
|
|
1946
1987
|
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
|
|
1947
1988
|
struct ggml_tensor * b, // input [W, H, D, C * N]
|
|
@@ -2048,6 +2089,19 @@ extern "C" {
|
|
|
2048
2089
|
int p2,
|
|
2049
2090
|
int p3);
|
|
2050
2091
|
|
|
2092
|
+
GGML_API struct ggml_tensor * ggml_pad_ext(
|
|
2093
|
+
struct ggml_context * ctx,
|
|
2094
|
+
struct ggml_tensor * a,
|
|
2095
|
+
int lp0,
|
|
2096
|
+
int rp0,
|
|
2097
|
+
int lp1,
|
|
2098
|
+
int rp1,
|
|
2099
|
+
int lp2,
|
|
2100
|
+
int rp2,
|
|
2101
|
+
int lp3,
|
|
2102
|
+
int rp3
|
|
2103
|
+
);
|
|
2104
|
+
|
|
2051
2105
|
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
|
|
2052
2106
|
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
|
|
2053
2107
|
struct ggml_context * ctx,
|
|
@@ -224,7 +224,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
224
224
|
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
|
|
225
225
|
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
|
226
226
|
if (NOT ${feature_pos} EQUAL -1)
|
|
227
|
-
|
|
227
|
+
# Special handling for MATMUL_INT8 when machine doesn't support i8mm
|
|
228
|
+
if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
|
|
229
|
+
message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
|
|
230
|
+
list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
|
|
231
|
+
else()
|
|
232
|
+
message(STATUS "ARM feature ${feature} enabled")
|
|
233
|
+
endif()
|
|
228
234
|
endif()
|
|
229
235
|
endforeach()
|
|
230
236
|
endif()
|
|
@@ -433,15 +439,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
433
439
|
ggml-cpu/arch/riscv/quants.c
|
|
434
440
|
ggml-cpu/arch/riscv/repack.cpp
|
|
435
441
|
)
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
442
|
+
set(MARCH_STR "rv64gc")
|
|
443
|
+
if (GGML_RV_ZFH)
|
|
444
|
+
string(APPEND MARCH_STR "_zfh")
|
|
445
|
+
endif()
|
|
446
|
+
if (GGML_XTHEADVECTOR)
|
|
447
|
+
string(APPEND MARCH_STR "_xtheadvector")
|
|
448
|
+
elseif (GGML_RVV)
|
|
449
|
+
string(APPEND MARCH_STR "_v")
|
|
450
|
+
if (GGML_RV_ZVFH)
|
|
451
|
+
string(APPEND MARCH_STR "_zvfh")
|
|
443
452
|
endif()
|
|
444
453
|
endif()
|
|
454
|
+
if (GGML_RV_ZICBOP)
|
|
455
|
+
string(APPEND MARCH_STR "_zicbop")
|
|
456
|
+
endif()
|
|
457
|
+
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
|
|
445
458
|
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
|
446
459
|
message(STATUS "s390x detected")
|
|
447
460
|
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
|
|
@@ -450,7 +463,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
450
463
|
|
|
451
464
|
# TODO: Separation to determine activation of VX/VXE/VXE2
|
|
452
465
|
if (${S390X_M} MATCHES "8561|8562")
|
|
453
|
-
set(GGML_NNPA OFF)
|
|
454
466
|
message(STATUS "z15 target")
|
|
455
467
|
list(APPEND ARCH_FLAGS -march=z15)
|
|
456
468
|
elseif (${S390X_M} MATCHES "3931")
|
|
@@ -472,11 +484,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
472
484
|
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
|
473
485
|
list(APPEND ARCH_DEFINITIONS GGML_VXE)
|
|
474
486
|
endif()
|
|
475
|
-
|
|
476
|
-
if (GGML_NNPA)
|
|
477
|
-
message(STATUS "NNPA enabled")
|
|
478
|
-
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
|
|
479
|
-
endif()
|
|
480
487
|
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
|
481
488
|
message(STATUS "Wasm detected")
|
|
482
489
|
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
|
|
@@ -1270,29 +1270,40 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1270
1270
|
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1271
1271
|
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1272
1272
|
|
|
1273
|
-
|
|
1273
|
+
float ftmp, ft2;
|
|
1274
|
+
const uint8_t * restrict q40;
|
|
1275
|
+
const uint8_t * restrict q41;
|
|
1276
|
+
const uint8_t * restrict q42;
|
|
1277
|
+
const uint8_t * restrict q43;
|
|
1278
|
+
const int8_t * restrict q80;
|
|
1279
|
+
const int8_t * restrict q81;
|
|
1280
|
+
const int8_t * restrict q82;
|
|
1281
|
+
const int8_t * restrict q83;
|
|
1282
|
+
int s0, s1, s2, s3;
|
|
1283
|
+
|
|
1274
1284
|
__asm__ __volatile__(
|
|
1275
|
-
"
|
|
1276
|
-
"
|
|
1277
|
-
"
|
|
1285
|
+
"li %[s1], 8\n\t"
|
|
1286
|
+
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
|
1287
|
+
"vle32.v v1, (%[s6b])\n\t"
|
|
1288
|
+
"vslide1down.vx v1, v1, zero\n\t"
|
|
1289
|
+
"vmv.v.x v16, zero\n\t"
|
|
1278
1290
|
"vslidedown.vi v2, v1, 2\n\t"
|
|
1279
1291
|
"vmv1r.v v3, v2\n\t"
|
|
1280
1292
|
"vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
|
|
1281
|
-
"vsetivli zero, 2, e32, m1\n\t"
|
|
1293
|
+
"vsetivli zero, 2, e32, m1, ta, ma\n\t"
|
|
1282
1294
|
"vmv.v.i v4, 4\n\t"
|
|
1283
1295
|
"vand.vx v8, v1, %[kmask1]\n\t"
|
|
1284
1296
|
"vslide1up.vx v5, v4, zero\n\t" // {0, 4}
|
|
1285
1297
|
"vsrl.vi v6, v1, 6\n\t"
|
|
1286
1298
|
"vsrl.vv v7, v2, v5\n\t"
|
|
1299
|
+
"vsse32.v v8, (%[utmp]), %[s1]\n\t"
|
|
1287
1300
|
"vand.vx v0, v6, %[kmask3]\n\t"
|
|
1288
1301
|
"vand.vx v2, v7, %[kmask2]\n\t"
|
|
1289
1302
|
"vsll.vi v6, v0, 4\n\t"
|
|
1290
|
-
"
|
|
1291
|
-
"addi %[t1], %[utmp], 4\n\t"
|
|
1303
|
+
"addi %[s0], %[utmp], 4\n\t"
|
|
1292
1304
|
"vor.vv v1, v6, v2\n\t"
|
|
1293
|
-
"vsse32.v
|
|
1294
|
-
"
|
|
1295
|
-
"vsetivli zero, 8, e16, m1\n\t"
|
|
1305
|
+
"vsse32.v v1, (%[s0]), %[s1]\n\t"
|
|
1306
|
+
"vsetivli zero, 8, e16, m1, ta, ma\n\t"
|
|
1296
1307
|
"vle32.v v2, (%[bsums])\n\t"
|
|
1297
1308
|
"vnsrl.wi v0, v2, 0\n\t"
|
|
1298
1309
|
"vnsrl.wi v1, v2, 16\n\t"
|
|
@@ -1300,13 +1311,131 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1300
1311
|
"vle8.v v3, (%[mins])\n\t"
|
|
1301
1312
|
"vzext.vf2 v4, v3\n\t"
|
|
1302
1313
|
"vwmul.vv v6, v4, v2\n\t"
|
|
1314
|
+
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
|
1315
|
+
"vredsum.vs v0, v6, v16\n\t"
|
|
1316
|
+
"vredsum.vs v0, v7, v0\n\t"
|
|
1317
|
+
"vfcvt.f.x.v v0, v0\n\t"
|
|
1318
|
+
"vfmv.f.s %[ftmp], v0\n\t"
|
|
1319
|
+
"vsetivli zero, 16, e8, m1, ta, ma\n\t"
|
|
1320
|
+
"vle8.v v0, (%[xs])\n\t"
|
|
1321
|
+
"fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
|
|
1322
|
+
"addi %[q40], %[xs], 64\n\t"
|
|
1323
|
+
"addi %[q41], %[xs], 16\n\t"
|
|
1324
|
+
"addi %[q42], %[xs], 32\n\t"
|
|
1325
|
+
"addi %[q43], %[xs], 48\n\t"
|
|
1326
|
+
"addi %[q80], %[ys], 64\n\t"
|
|
1327
|
+
"vle8.v v1, (%[q41])\n\t"
|
|
1328
|
+
"vle8.v v2, (%[q42])\n\t"
|
|
1329
|
+
"addi %[q81], %[ys], 16\n\t"
|
|
1330
|
+
"addi %[q41], %[q41], 64\n\t"
|
|
1331
|
+
"addi %[q82], %[ys], 32\n\t"
|
|
1332
|
+
"vle8.v v3, (%[q43])\n\t"
|
|
1333
|
+
"vle8.v v8, (%[ys])\n\t"
|
|
1334
|
+
"addi %[q42], %[q42], 64\n\t"
|
|
1335
|
+
"addi %[q83], %[ys], 48\n\t"
|
|
1336
|
+
"addi %[q43], %[q43], 64\n\t"
|
|
1337
|
+
"vsrl.vi v4, v0, 4\n\t"
|
|
1338
|
+
"vle8.v v9, (%[q81])\n\t"
|
|
1339
|
+
"vle8.v v10, (%[q82])\n\t"
|
|
1340
|
+
"vand.vi v0, v0, 0xF\n\t"
|
|
1341
|
+
"addi %[q81], %[q81], 64\n\t"
|
|
1342
|
+
"vsrl.vi v5, v1, 4\n\t"
|
|
1343
|
+
"addi %[q82], %[q82], 64\n\t"
|
|
1344
|
+
"vle8.v v11, (%[q83])\n\t"
|
|
1345
|
+
"vle8.v v12, (%[q80])\n\t"
|
|
1346
|
+
"vand.vi v1, v1, 0xF\n\t"
|
|
1347
|
+
"addi %[q83], %[q83], 64\n\t"
|
|
1348
|
+
"vsrl.vi v6, v2, 4\n\t"
|
|
1349
|
+
"addi %[q80], %[q80], 64\n\t"
|
|
1350
|
+
"vle8.v v13, (%[q81])\n\t"
|
|
1351
|
+
"vle8.v v14, (%[q82])\n\t"
|
|
1352
|
+
"vand.vi v2, v2, 0xF\n\t"
|
|
1353
|
+
"addi %[q81], %[q81], 64\n\t"
|
|
1354
|
+
"vsrl.vi v7, v3, 4\n\t"
|
|
1355
|
+
"addi %[q82], %[q82], 64\n\t"
|
|
1356
|
+
"vwmul.vv v16, v0, v8\n\t"
|
|
1357
|
+
"vle8.v v15, (%[q83])\n\t"
|
|
1358
|
+
"vle8.v v0, (%[q40])\n\t"
|
|
1359
|
+
"vand.vi v3, v3, 0xF\n\t"
|
|
1360
|
+
"addi %[q83], %[q83], 64\n\t"
|
|
1361
|
+
"vwmul.vv v24, v2, v12\n\t"
|
|
1362
|
+
"vwmul.vv v20, v4, v10\n\t"
|
|
1363
|
+
"vwmul.vv v28, v6, v14\n\t"
|
|
1364
|
+
"vwmacc.vv v16, v1, v9\n\t"
|
|
1365
|
+
"vle8.v v1, (%[q41])\n\t"
|
|
1366
|
+
"vle8.v v2, (%[q42])\n\t"
|
|
1367
|
+
"vwmacc.vv v24, v3, v13\n\t"
|
|
1368
|
+
"vwmacc.vv v20, v5, v11\n\t"
|
|
1369
|
+
"vwmacc.vv v28, v7, v15\n\t"
|
|
1370
|
+
"addi %[q40], %[q80], 64\n\t"
|
|
1371
|
+
"addi %[q41], %[q81], 64\n\t"
|
|
1372
|
+
"vle8.v v3, (%[q43])\n\t"
|
|
1373
|
+
"vle8.v v8, (%[q80])\n\t"
|
|
1374
|
+
"addi %[q42], %[q82], 64\n\t"
|
|
1375
|
+
"addi %[q43], %[q83], 64\n\t"
|
|
1376
|
+
"vsrl.vi v4, v0, 4\n\t"
|
|
1377
|
+
"vle8.v v9, (%[q81])\n\t"
|
|
1378
|
+
"vle8.v v10, (%[q82])\n\t"
|
|
1379
|
+
"vand.vi v0, v0, 0xF\n\t"
|
|
1380
|
+
"vsrl.vi v5, v1, 4\n\t"
|
|
1381
|
+
"vsrl.vi v7, v3, 4\n\t"
|
|
1382
|
+
"vand.vi v3, v3, 0xF\n\t"
|
|
1383
|
+
"vle8.v v11, (%[q83])\n\t"
|
|
1384
|
+
"vle8.v v12, (%[q40])\n\t"
|
|
1385
|
+
"vand.vi v1, v1, 0xF\n\t"
|
|
1386
|
+
"vsrl.vi v6, v2, 4\n\t"
|
|
1387
|
+
"vand.vi v2, v2, 0xF\n\t"
|
|
1388
|
+
"vwmul.vv v18, v0, v8\n\t"
|
|
1389
|
+
"vle8.v v13, (%[q41])\n\t"
|
|
1390
|
+
"vle8.v v14, (%[q42])\n\t"
|
|
1391
|
+
"vwmul.vv v26, v2, v12\n\t"
|
|
1392
|
+
"vwmul.vv v22, v4, v10\n\t"
|
|
1393
|
+
"vwmul.vv v30, v6, v14\n\t"
|
|
1394
|
+
"vwmacc.vv v18, v1, v9\n\t"
|
|
1395
|
+
"vle8.v v15, (%[q43])\n\t"
|
|
1396
|
+
"vwmacc.vv v26, v3, v13\n\t"
|
|
1397
|
+
"vwmacc.vv v22, v5, v11\n\t"
|
|
1398
|
+
"vwmacc.vv v30, v7, v15\n\t"
|
|
1303
1399
|
"vmv.v.x v0, zero\n\t"
|
|
1304
|
-
"vsetivli zero,
|
|
1305
|
-
"
|
|
1306
|
-
"
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1400
|
+
"vsetivli zero, 16, e16, m2, ta, ma\n\t"
|
|
1401
|
+
"vwredsum.vs v4, v16, v0\n\t"
|
|
1402
|
+
"lbu %[s0], 0(%[scale])\n\t"
|
|
1403
|
+
"vwredsum.vs v5, v20, v0\n\t"
|
|
1404
|
+
"lbu %[s1], 1(%[scale])\n\t"
|
|
1405
|
+
"vwredsum.vs v6, v24, v0\n\t"
|
|
1406
|
+
"lbu %[s2], 2(%[scale])\n\t"
|
|
1407
|
+
"vwredsum.vs v7, v28, v0\n\t"
|
|
1408
|
+
"lbu %[s3], 3(%[scale])\n\t"
|
|
1409
|
+
"vwredsum.vs v8, v18, v0\n\t"
|
|
1410
|
+
"lbu %[q40], 4(%[scale])\n\t"
|
|
1411
|
+
"vwredsum.vs v9, v22, v0\n\t"
|
|
1412
|
+
"lbu %[q41], 5(%[scale])\n\t"
|
|
1413
|
+
"vwredsum.vs v10, v26, v0\n\t"
|
|
1414
|
+
"lbu %[q42], 6(%[scale])\n\t"
|
|
1415
|
+
"vwredsum.vs v11, v30, v0\n\t"
|
|
1416
|
+
"lbu %[q43], 7(%[scale])\n\t"
|
|
1417
|
+
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
|
1418
|
+
"vmul.vx v0, v4, %[s0]\n\t"
|
|
1419
|
+
"vmul.vx v1, v8, %[q40]\n\t"
|
|
1420
|
+
"vmacc.vx v0, %[s1], v5\n\t"
|
|
1421
|
+
"vmacc.vx v1, %[q41], v9\n\t"
|
|
1422
|
+
"vmacc.vx v0, %[s2], v6\n\t"
|
|
1423
|
+
"vmacc.vx v1, %[q42], v10\n\t"
|
|
1424
|
+
"vmacc.vx v0, %[s3], v7\n\t"
|
|
1425
|
+
"vmacc.vx v1, %[q43], v11\n\t"
|
|
1426
|
+
"vfcvt.f.x.v v0, v0\n\t"
|
|
1427
|
+
"vfcvt.f.x.v v1, v1\n\t"
|
|
1428
|
+
"vfmv.f.s %[ft2], v0\n\t"
|
|
1429
|
+
"vfmv.f.s %[ftmp], v1\n\t"
|
|
1430
|
+
"fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
|
|
1431
|
+
"fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
|
|
1432
|
+
: [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
|
|
1433
|
+
, [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
|
|
1434
|
+
, [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
|
|
1435
|
+
, [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
|
|
1436
|
+
: [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
|
|
1437
|
+
, [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
|
|
1438
|
+
, [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
|
|
1310
1439
|
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
|
|
1311
1440
|
: "memory"
|
|
1312
1441
|
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
|
@@ -1314,59 +1443,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1314
1443
|
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
|
1315
1444
|
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
|
1316
1445
|
);
|
|
1317
|
-
sumf -= dmin * sumi;
|
|
1318
|
-
|
|
1319
|
-
const uint8_t * restrict q4 = x[i].qs;
|
|
1320
|
-
const int8_t * restrict q8 = y[i].qs;
|
|
1321
|
-
|
|
1322
|
-
sumi = 0;
|
|
1323
|
-
const uint8_t * scale = scales;
|
|
1324
|
-
|
|
1325
|
-
for (int j = 0; j < QK_K/128; ++j) {
|
|
1326
|
-
int vl128 = 128, vl64 = 64, vl32 = 32;
|
|
1327
|
-
__asm__ __volatile__(
|
|
1328
|
-
"vsetvli zero, %[vl128], e8, m8\n\t"
|
|
1329
|
-
"vle8.v v8, (%[q8])\n\t"
|
|
1330
|
-
"vsetvli zero, %[vl64], e8, m4\n\t"
|
|
1331
|
-
"vle8.v v0, (%[q4])\n\t"
|
|
1332
|
-
"vsrl.vi v4, v0, 4\n\t"
|
|
1333
|
-
"vand.vi v0, v0, 0xF\n\t"
|
|
1334
|
-
"vsetvli zero, %[vl32], e8, m2\n\t"
|
|
1335
|
-
"vwmul.vv v28, v6, v14\n\t"
|
|
1336
|
-
"vwmul.vv v20, v4, v10\n\t"
|
|
1337
|
-
"vwmul.vv v24, v2, v12\n\t"
|
|
1338
|
-
"vwmul.vv v16, v0, v8\n\t"
|
|
1339
|
-
"vsetivli zero, 4, e32, m1\n\t"
|
|
1340
|
-
"vle8.v v2, (%[scale])\n\t"
|
|
1341
|
-
"vmv.v.x v0, zero\n\t"
|
|
1342
|
-
"vzext.vf4 v1, v2\n\t"
|
|
1343
|
-
"vsetvli zero, %[vl32], e16, m4\n\t"
|
|
1344
|
-
"vwredsum.vs v6, v24, v0\n\t"
|
|
1345
|
-
"vwredsum.vs v7, v28, v0\n\t"
|
|
1346
|
-
"vwredsum.vs v4, v16, v0\n\t"
|
|
1347
|
-
"vwredsum.vs v5, v20, v0\n\t"
|
|
1348
|
-
"vsetivli zero, 4, e32, m1\n\t"
|
|
1349
|
-
"vslideup.vi v6, v7, 1\n\t"
|
|
1350
|
-
"vslideup.vi v4, v5, 1\n\t"
|
|
1351
|
-
"vslideup.vi v4, v6, 2\n\t"
|
|
1352
|
-
"vmul.vv v8, v4, v1\n\t"
|
|
1353
|
-
"vredsum.vs v0, v8, v0\n\t"
|
|
1354
|
-
"vmv.x.s %[tmp], v0\n\t"
|
|
1355
|
-
"add %[sumi], %[sumi], %[tmp]"
|
|
1356
|
-
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
|
|
1357
|
-
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
|
|
1358
|
-
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
|
|
1359
|
-
: "memory"
|
|
1360
|
-
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
|
1361
|
-
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
|
1362
|
-
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
|
1363
|
-
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
|
1364
|
-
);
|
|
1365
|
-
|
|
1366
|
-
q4 += 64; q8 += 128; scale += 4;
|
|
1367
|
-
}
|
|
1368
|
-
|
|
1369
|
-
sumf += d * sumi;
|
|
1370
1446
|
}
|
|
1371
1447
|
break;
|
|
1372
1448
|
default:
|
|
@@ -1693,6 +1769,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1693
1769
|
case 128:
|
|
1694
1770
|
for (int i = 0; i < nb; ++i) {
|
|
1695
1771
|
|
|
1772
|
+
__builtin_prefetch(&x[i + 1].d, 0, 1);
|
|
1773
|
+
|
|
1696
1774
|
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1697
1775
|
|
|
1698
1776
|
const uint8_t * restrict q6 = x[i].ql;
|
|
@@ -1701,23 +1779,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1701
1779
|
|
|
1702
1780
|
const int8_t * restrict scale = x[i].scales;
|
|
1703
1781
|
|
|
1704
|
-
int
|
|
1705
|
-
|
|
1782
|
+
int q6h;
|
|
1783
|
+
float ftmp;
|
|
1706
1784
|
|
|
1707
1785
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
1708
1786
|
__asm__ __volatile__(
|
|
1787
|
+
"addi %[q6h], %[q6], 32\n\t"
|
|
1788
|
+
"ld t0, 0(%[scale])\n\t"
|
|
1789
|
+
"addi %[scale], %[scale], 8\n\t"
|
|
1790
|
+
"slli t6, t0, 1 * 8\n\t"
|
|
1791
|
+
"lb zero, 0(%[q6])\n\t"
|
|
1792
|
+
"slli t5, t0, 2 * 8\n\t"
|
|
1793
|
+
"slli t4, t0, 3 * 8\n\t"
|
|
1794
|
+
"lb zero, 0(%[q6h])\n\t"
|
|
1795
|
+
"slli t3, t0, 4 * 8\n\t"
|
|
1796
|
+
"slli t2, t0, 5 * 8\n\t"
|
|
1797
|
+
"lb zero, 0(%[qh])\n\t"
|
|
1798
|
+
"lb zero, 31(%[q6h])\n\t"
|
|
1799
|
+
"slli t1, t0, 6 * 8\n\t"
|
|
1800
|
+
"srai a7, t0, 56\n\t"
|
|
1709
1801
|
"vsetvli zero, %[vl32], e8, m2\n\t"
|
|
1802
|
+
"vle8.v v8, (%[q6])\n\t"
|
|
1803
|
+
"srai t6, t6, 56\n\t"
|
|
1804
|
+
"srai t5, t5, 56\n\t"
|
|
1805
|
+
"srai t4, t4, 56\n\t"
|
|
1806
|
+
"srai t3, t3, 56\n\t"
|
|
1807
|
+
"vle8.v v10, (%[q6h])\n\t"
|
|
1808
|
+
"addi %[q6], %[q6], 64\n\t"
|
|
1809
|
+
"slli t0, t0, 7 * 8\n\t"
|
|
1810
|
+
"srai t2, t2, 56\n\t"
|
|
1811
|
+
"srai t1, t1, 56\n\t"
|
|
1812
|
+
"srai t0, t0, 56\n\t"
|
|
1710
1813
|
"vle8.v v4, (%[qh])\n\t"
|
|
1814
|
+
"vsrl.vi v12, v8, 4\n\t"
|
|
1815
|
+
"vsrl.vi v14, v10, 4\n\t"
|
|
1816
|
+
"lb zero, 0(%[q8])\n\t"
|
|
1817
|
+
"vand.vi v8, v8, 0xF\n\t"
|
|
1818
|
+
"vand.vi v10, v10, 0xF\n\t"
|
|
1819
|
+
"lb zero, 32(%[q8])\n\t"
|
|
1711
1820
|
"vsll.vi v0, v4, 4\n\t"
|
|
1712
1821
|
"vsll.vi v2, v4, 2\n\t"
|
|
1822
|
+
"lb zero, 64(%[q8])\n\t"
|
|
1713
1823
|
"vsrl.vi v6, v4, 2\n\t"
|
|
1714
|
-
"vsetvli zero, %[vl64], e8, m4\n\t"
|
|
1715
|
-
"vle8.v v8, (%[q6])\n\t"
|
|
1716
|
-
"vsrl.vi v12, v8, 4\n\t"
|
|
1717
|
-
"vand.vi v8, v8, 0xF\n\t"
|
|
1718
|
-
"vsetvli zero, %[vl128], e8, m8\n\t"
|
|
1719
1824
|
"vand.vx v0, v0, %[mask]\n\t"
|
|
1825
|
+
"lb zero, 96(%[q8])\n\t"
|
|
1826
|
+
"vand.vx v2, v2, %[mask]\n\t"
|
|
1827
|
+
"vand.vx v4, v4, %[mask]\n\t"
|
|
1828
|
+
"vand.vx v6, v6, %[mask]\n\t"
|
|
1720
1829
|
"vor.vv v8, v8, v0\n\t"
|
|
1830
|
+
"lb zero, 127(%[q8])\n\t"
|
|
1831
|
+
"vor.vv v10, v10, v2\n\t"
|
|
1832
|
+
"vor.vv v12, v12, v4\n\t"
|
|
1833
|
+
"vor.vv v14, v14, v6\n\t"
|
|
1834
|
+
"vsetvli zero, %[vl128], e8, m8\n\t"
|
|
1721
1835
|
"vle8.v v0, (%[q8])\n\t"
|
|
1722
1836
|
"vsub.vx v8, v8, %[vl32]\n\t"
|
|
1723
1837
|
"vsetvli zero, %[vl64], e8, m4\n\t"
|
|
@@ -1734,34 +1848,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1734
1848
|
"vwredsum.vs v13, v28, v0\n\t"
|
|
1735
1849
|
"vwredsum.vs v14, v30, v0\n\t"
|
|
1736
1850
|
"vsetivli zero, 4, e32, m1\n\t"
|
|
1737
|
-
"
|
|
1738
|
-
"
|
|
1739
|
-
"
|
|
1740
|
-
"
|
|
1741
|
-
"
|
|
1742
|
-
"
|
|
1743
|
-
"
|
|
1744
|
-
"
|
|
1745
|
-
"
|
|
1746
|
-
"
|
|
1747
|
-
"
|
|
1748
|
-
"
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1851
|
+
"vmul.vx v0, v10, t0\n\t"
|
|
1852
|
+
"vmul.vx v1, v9, t1\n\t"
|
|
1853
|
+
"vmacc.vx v0, t2, v8\n\t"
|
|
1854
|
+
"vmacc.vx v1, t3, v7\n\t"
|
|
1855
|
+
"vmacc.vx v0, t4, v11\n\t"
|
|
1856
|
+
"vmacc.vx v1, t5, v12\n\t"
|
|
1857
|
+
"vmacc.vx v0, t6, v13\n\t"
|
|
1858
|
+
"vmacc.vx v1, a7, v14\n\t"
|
|
1859
|
+
"vadd.vv v0, v0, v1\n\t"
|
|
1860
|
+
"vfcvt.f.x.v v0, v0\n\t"
|
|
1861
|
+
"vfmv.f.s %[ftmp], v0\n\t"
|
|
1862
|
+
"fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
|
|
1863
|
+
: [q6] "+&r" (q6), [q6h] "=&r" (q6h)
|
|
1864
|
+
, [scale] "+&r" (scale)
|
|
1865
|
+
, [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
|
|
1866
|
+
: [qh] "r" (qh), [q8] "r" (q8)
|
|
1752
1867
|
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
|
1753
|
-
, [mask] "r" (0x30)
|
|
1868
|
+
, [mask] "r" (0x30), [d] "f" (d)
|
|
1754
1869
|
: "memory"
|
|
1755
1870
|
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
|
1756
1871
|
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
|
1757
1872
|
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
|
1758
1873
|
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
|
1874
|
+
, "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
|
|
1875
|
+
, "a6", "a5", "a4", "a3"
|
|
1759
1876
|
);
|
|
1760
|
-
|
|
1877
|
+
qh += 32; q8 += 128;
|
|
1761
1878
|
}
|
|
1762
|
-
|
|
1763
|
-
sumf += d * sum_t;
|
|
1764
|
-
|
|
1765
1879
|
}
|
|
1766
1880
|
break;
|
|
1767
1881
|
default:
|