@fugood/llama.node 1.1.10 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +20 -2
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +174 -388
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +67 -37
- package/src/llama.cpp/common/chat.cpp +263 -2
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +5 -2
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
- package/src/llama.cpp/include/llama.h +32 -7
- package/src/llama.cpp/src/llama-adapter.cpp +101 -4
- package/src/llama.cpp/src/llama-adapter.h +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +69 -2
- package/src/llama.cpp/src/llama-arch.h +6 -0
- package/src/llama.cpp/src/llama-context.cpp +92 -45
- package/src/llama.cpp/src/llama-context.h +1 -5
- package/src/llama.cpp/src/llama-graph.cpp +74 -19
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
- package/src/llama.cpp/src/llama-kv-cache.h +4 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +434 -21
- package/src/llama.cpp/src/llama-model.h +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
|
@@ -511,6 +511,7 @@ extern "C" {
|
|
|
511
511
|
GGML_OP_CONV_TRANSPOSE_1D,
|
|
512
512
|
GGML_OP_IM2COL,
|
|
513
513
|
GGML_OP_IM2COL_BACK,
|
|
514
|
+
GGML_OP_IM2COL_3D,
|
|
514
515
|
GGML_OP_CONV_2D,
|
|
515
516
|
GGML_OP_CONV_3D,
|
|
516
517
|
GGML_OP_CONV_2D_DW,
|
|
@@ -1870,6 +1871,41 @@ extern "C" {
|
|
|
1870
1871
|
int d0, // dilation dimension 0
|
|
1871
1872
|
int d1); // dilation dimension 1
|
|
1872
1873
|
|
|
1874
|
+
GGML_API struct ggml_tensor * ggml_im2col_3d(
|
|
1875
|
+
struct ggml_context * ctx,
|
|
1876
|
+
struct ggml_tensor * a,
|
|
1877
|
+
struct ggml_tensor * b,
|
|
1878
|
+
int64_t IC,
|
|
1879
|
+
int s0, // stride width
|
|
1880
|
+
int s1, // stride height
|
|
1881
|
+
int s2, // stride depth
|
|
1882
|
+
int p0, // padding width
|
|
1883
|
+
int p1, // padding height
|
|
1884
|
+
int p2, // padding depth
|
|
1885
|
+
int d0, // dilation width
|
|
1886
|
+
int d1, // dilation height
|
|
1887
|
+
int d2, // dilation depth
|
|
1888
|
+
enum ggml_type dst_type);
|
|
1889
|
+
|
|
1890
|
+
// a: [OC*IC, KD, KH, KW]
|
|
1891
|
+
// b: [N*IC, ID, IH, IW]
|
|
1892
|
+
// result: [N*OC, OD, OH, OW]
|
|
1893
|
+
GGML_API struct ggml_tensor * ggml_conv_3d(
|
|
1894
|
+
struct ggml_context * ctx,
|
|
1895
|
+
struct ggml_tensor * a,
|
|
1896
|
+
struct ggml_tensor * b,
|
|
1897
|
+
int64_t IC,
|
|
1898
|
+
int s0, // stride width
|
|
1899
|
+
int s1, // stride height
|
|
1900
|
+
int s2, // stride depth
|
|
1901
|
+
int p0, // padding width
|
|
1902
|
+
int p1, // padding height
|
|
1903
|
+
int p2, // padding depth
|
|
1904
|
+
int d0, // dilation width
|
|
1905
|
+
int d1, // dilation height
|
|
1906
|
+
int d2 // dilation depth
|
|
1907
|
+
);
|
|
1908
|
+
|
|
1873
1909
|
// kernel size is a->ne[0] x a->ne[1]
|
|
1874
1910
|
// stride is equal to kernel size
|
|
1875
1911
|
// padding is zero
|
|
@@ -1941,7 +1977,7 @@ extern "C" {
|
|
|
1941
1977
|
int d0, // dilation dimension 0
|
|
1942
1978
|
int d1); // dilation dimension 1
|
|
1943
1979
|
|
|
1944
|
-
GGML_API struct ggml_tensor *
|
|
1980
|
+
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
|
|
1945
1981
|
struct ggml_context * ctx,
|
|
1946
1982
|
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
|
|
1947
1983
|
struct ggml_tensor * b, // input [W, H, D, C * N]
|
|
@@ -2048,6 +2084,19 @@ extern "C" {
|
|
|
2048
2084
|
int p2,
|
|
2049
2085
|
int p3);
|
|
2050
2086
|
|
|
2087
|
+
GGML_API struct ggml_tensor * ggml_pad_ext(
|
|
2088
|
+
struct ggml_context * ctx,
|
|
2089
|
+
struct ggml_tensor * a,
|
|
2090
|
+
int lp0,
|
|
2091
|
+
int rp0,
|
|
2092
|
+
int lp1,
|
|
2093
|
+
int rp1,
|
|
2094
|
+
int lp2,
|
|
2095
|
+
int rp2,
|
|
2096
|
+
int lp3,
|
|
2097
|
+
int rp3
|
|
2098
|
+
);
|
|
2099
|
+
|
|
2051
2100
|
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
|
|
2052
2101
|
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
|
|
2053
2102
|
struct ggml_context * ctx,
|
|
@@ -433,15 +433,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
433
433
|
ggml-cpu/arch/riscv/quants.c
|
|
434
434
|
ggml-cpu/arch/riscv/repack.cpp
|
|
435
435
|
)
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
436
|
+
set(MARCH_STR "rv64gc")
|
|
437
|
+
if (GGML_RV_ZFH)
|
|
438
|
+
string(APPEND MARCH_STR "_zfh")
|
|
439
|
+
endif()
|
|
440
|
+
if (GGML_XTHEADVECTOR)
|
|
441
|
+
string(APPEND MARCH_STR "_xtheadvector")
|
|
442
|
+
elseif (GGML_RVV)
|
|
443
|
+
string(APPEND MARCH_STR "_v")
|
|
444
|
+
if (GGML_RV_ZVFH)
|
|
445
|
+
string(APPEND MARCH_STR "_zvfh")
|
|
443
446
|
endif()
|
|
444
447
|
endif()
|
|
448
|
+
if (GGML_RV_ZICBOP)
|
|
449
|
+
string(APPEND MARCH_STR "_zicbop")
|
|
450
|
+
endif()
|
|
451
|
+
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
|
|
445
452
|
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
|
446
453
|
message(STATUS "s390x detected")
|
|
447
454
|
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
|
|
@@ -450,7 +457,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
450
457
|
|
|
451
458
|
# TODO: Separation to determine activation of VX/VXE/VXE2
|
|
452
459
|
if (${S390X_M} MATCHES "8561|8562")
|
|
453
|
-
set(GGML_NNPA OFF)
|
|
454
460
|
message(STATUS "z15 target")
|
|
455
461
|
list(APPEND ARCH_FLAGS -march=z15)
|
|
456
462
|
elseif (${S390X_M} MATCHES "3931")
|
|
@@ -472,11 +478,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
472
478
|
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
|
473
479
|
list(APPEND ARCH_DEFINITIONS GGML_VXE)
|
|
474
480
|
endif()
|
|
475
|
-
|
|
476
|
-
if (GGML_NNPA)
|
|
477
|
-
message(STATUS "NNPA enabled")
|
|
478
|
-
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
|
|
479
|
-
endif()
|
|
480
481
|
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
|
481
482
|
message(STATUS "Wasm detected")
|
|
482
483
|
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
|
|
@@ -497,9 +498,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
497
498
|
|
|
498
499
|
# Fetch KleidiAI sources:
|
|
499
500
|
include(FetchContent)
|
|
500
|
-
set(KLEIDIAI_COMMIT_TAG "v1.
|
|
501
|
+
set(KLEIDIAI_COMMIT_TAG "v1.13.0")
|
|
501
502
|
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
|
502
|
-
set(KLEIDIAI_ARCHIVE_MD5 "
|
|
503
|
+
set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1")
|
|
503
504
|
|
|
504
505
|
if (POLICY CMP0135)
|
|
505
506
|
cmake_policy(SET CMP0135 NEW)
|
|
@@ -555,6 +556,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
555
556
|
|
|
556
557
|
list(APPEND GGML_KLEIDIAI_SOURCES
|
|
557
558
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
|
|
559
|
+
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
|
|
558
560
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
|
|
559
561
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
|
|
560
562
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
|
|
@@ -576,7 +578,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
576
578
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
|
|
577
579
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
|
|
578
580
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
|
|
579
|
-
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
|
|
581
|
+
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
|
|
582
|
+
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
|
|
580
583
|
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
|
581
584
|
endif()
|
|
582
585
|
|
|
@@ -1270,29 +1270,40 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1270
1270
|
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1271
1271
|
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1272
1272
|
|
|
1273
|
-
|
|
1273
|
+
float ftmp, ft2;
|
|
1274
|
+
const uint8_t * restrict q40;
|
|
1275
|
+
const uint8_t * restrict q41;
|
|
1276
|
+
const uint8_t * restrict q42;
|
|
1277
|
+
const uint8_t * restrict q43;
|
|
1278
|
+
const int8_t * restrict q80;
|
|
1279
|
+
const int8_t * restrict q81;
|
|
1280
|
+
const int8_t * restrict q82;
|
|
1281
|
+
const int8_t * restrict q83;
|
|
1282
|
+
int s0, s1, s2, s3;
|
|
1283
|
+
|
|
1274
1284
|
__asm__ __volatile__(
|
|
1275
|
-
"
|
|
1276
|
-
"
|
|
1277
|
-
"
|
|
1285
|
+
"li %[s1], 8\n\t"
|
|
1286
|
+
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
|
1287
|
+
"vle32.v v1, (%[s6b])\n\t"
|
|
1288
|
+
"vslide1down.vx v1, v1, zero\n\t"
|
|
1289
|
+
"vmv.v.x v16, zero\n\t"
|
|
1278
1290
|
"vslidedown.vi v2, v1, 2\n\t"
|
|
1279
1291
|
"vmv1r.v v3, v2\n\t"
|
|
1280
1292
|
"vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
|
|
1281
|
-
"vsetivli zero, 2, e32, m1\n\t"
|
|
1293
|
+
"vsetivli zero, 2, e32, m1, ta, ma\n\t"
|
|
1282
1294
|
"vmv.v.i v4, 4\n\t"
|
|
1283
1295
|
"vand.vx v8, v1, %[kmask1]\n\t"
|
|
1284
1296
|
"vslide1up.vx v5, v4, zero\n\t" // {0, 4}
|
|
1285
1297
|
"vsrl.vi v6, v1, 6\n\t"
|
|
1286
1298
|
"vsrl.vv v7, v2, v5\n\t"
|
|
1299
|
+
"vsse32.v v8, (%[utmp]), %[s1]\n\t"
|
|
1287
1300
|
"vand.vx v0, v6, %[kmask3]\n\t"
|
|
1288
1301
|
"vand.vx v2, v7, %[kmask2]\n\t"
|
|
1289
1302
|
"vsll.vi v6, v0, 4\n\t"
|
|
1290
|
-
"
|
|
1291
|
-
"addi %[t1], %[utmp], 4\n\t"
|
|
1303
|
+
"addi %[s0], %[utmp], 4\n\t"
|
|
1292
1304
|
"vor.vv v1, v6, v2\n\t"
|
|
1293
|
-
"vsse32.v
|
|
1294
|
-
"
|
|
1295
|
-
"vsetivli zero, 8, e16, m1\n\t"
|
|
1305
|
+
"vsse32.v v1, (%[s0]), %[s1]\n\t"
|
|
1306
|
+
"vsetivli zero, 8, e16, m1, ta, ma\n\t"
|
|
1296
1307
|
"vle32.v v2, (%[bsums])\n\t"
|
|
1297
1308
|
"vnsrl.wi v0, v2, 0\n\t"
|
|
1298
1309
|
"vnsrl.wi v1, v2, 16\n\t"
|
|
@@ -1300,13 +1311,131 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1300
1311
|
"vle8.v v3, (%[mins])\n\t"
|
|
1301
1312
|
"vzext.vf2 v4, v3\n\t"
|
|
1302
1313
|
"vwmul.vv v6, v4, v2\n\t"
|
|
1314
|
+
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
|
1315
|
+
"vredsum.vs v0, v6, v16\n\t"
|
|
1316
|
+
"vredsum.vs v0, v7, v0\n\t"
|
|
1317
|
+
"vfcvt.f.x.v v0, v0\n\t"
|
|
1318
|
+
"vfmv.f.s %[ftmp], v0\n\t"
|
|
1319
|
+
"vsetivli zero, 16, e8, m1, ta, ma\n\t"
|
|
1320
|
+
"vle8.v v0, (%[xs])\n\t"
|
|
1321
|
+
"fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
|
|
1322
|
+
"addi %[q40], %[xs], 64\n\t"
|
|
1323
|
+
"addi %[q41], %[xs], 16\n\t"
|
|
1324
|
+
"addi %[q42], %[xs], 32\n\t"
|
|
1325
|
+
"addi %[q43], %[xs], 48\n\t"
|
|
1326
|
+
"addi %[q80], %[ys], 64\n\t"
|
|
1327
|
+
"vle8.v v1, (%[q41])\n\t"
|
|
1328
|
+
"vle8.v v2, (%[q42])\n\t"
|
|
1329
|
+
"addi %[q81], %[ys], 16\n\t"
|
|
1330
|
+
"addi %[q41], %[q41], 64\n\t"
|
|
1331
|
+
"addi %[q82], %[ys], 32\n\t"
|
|
1332
|
+
"vle8.v v3, (%[q43])\n\t"
|
|
1333
|
+
"vle8.v v8, (%[ys])\n\t"
|
|
1334
|
+
"addi %[q42], %[q42], 64\n\t"
|
|
1335
|
+
"addi %[q83], %[ys], 48\n\t"
|
|
1336
|
+
"addi %[q43], %[q43], 64\n\t"
|
|
1337
|
+
"vsrl.vi v4, v0, 4\n\t"
|
|
1338
|
+
"vle8.v v9, (%[q81])\n\t"
|
|
1339
|
+
"vle8.v v10, (%[q82])\n\t"
|
|
1340
|
+
"vand.vi v0, v0, 0xF\n\t"
|
|
1341
|
+
"addi %[q81], %[q81], 64\n\t"
|
|
1342
|
+
"vsrl.vi v5, v1, 4\n\t"
|
|
1343
|
+
"addi %[q82], %[q82], 64\n\t"
|
|
1344
|
+
"vle8.v v11, (%[q83])\n\t"
|
|
1345
|
+
"vle8.v v12, (%[q80])\n\t"
|
|
1346
|
+
"vand.vi v1, v1, 0xF\n\t"
|
|
1347
|
+
"addi %[q83], %[q83], 64\n\t"
|
|
1348
|
+
"vsrl.vi v6, v2, 4\n\t"
|
|
1349
|
+
"addi %[q80], %[q80], 64\n\t"
|
|
1350
|
+
"vle8.v v13, (%[q81])\n\t"
|
|
1351
|
+
"vle8.v v14, (%[q82])\n\t"
|
|
1352
|
+
"vand.vi v2, v2, 0xF\n\t"
|
|
1353
|
+
"addi %[q81], %[q81], 64\n\t"
|
|
1354
|
+
"vsrl.vi v7, v3, 4\n\t"
|
|
1355
|
+
"addi %[q82], %[q82], 64\n\t"
|
|
1356
|
+
"vwmul.vv v16, v0, v8\n\t"
|
|
1357
|
+
"vle8.v v15, (%[q83])\n\t"
|
|
1358
|
+
"vle8.v v0, (%[q40])\n\t"
|
|
1359
|
+
"vand.vi v3, v3, 0xF\n\t"
|
|
1360
|
+
"addi %[q83], %[q83], 64\n\t"
|
|
1361
|
+
"vwmul.vv v24, v2, v12\n\t"
|
|
1362
|
+
"vwmul.vv v20, v4, v10\n\t"
|
|
1363
|
+
"vwmul.vv v28, v6, v14\n\t"
|
|
1364
|
+
"vwmacc.vv v16, v1, v9\n\t"
|
|
1365
|
+
"vle8.v v1, (%[q41])\n\t"
|
|
1366
|
+
"vle8.v v2, (%[q42])\n\t"
|
|
1367
|
+
"vwmacc.vv v24, v3, v13\n\t"
|
|
1368
|
+
"vwmacc.vv v20, v5, v11\n\t"
|
|
1369
|
+
"vwmacc.vv v28, v7, v15\n\t"
|
|
1370
|
+
"addi %[q40], %[q80], 64\n\t"
|
|
1371
|
+
"addi %[q41], %[q81], 64\n\t"
|
|
1372
|
+
"vle8.v v3, (%[q43])\n\t"
|
|
1373
|
+
"vle8.v v8, (%[q80])\n\t"
|
|
1374
|
+
"addi %[q42], %[q82], 64\n\t"
|
|
1375
|
+
"addi %[q43], %[q83], 64\n\t"
|
|
1376
|
+
"vsrl.vi v4, v0, 4\n\t"
|
|
1377
|
+
"vle8.v v9, (%[q81])\n\t"
|
|
1378
|
+
"vle8.v v10, (%[q82])\n\t"
|
|
1379
|
+
"vand.vi v0, v0, 0xF\n\t"
|
|
1380
|
+
"vsrl.vi v5, v1, 4\n\t"
|
|
1381
|
+
"vsrl.vi v7, v3, 4\n\t"
|
|
1382
|
+
"vand.vi v3, v3, 0xF\n\t"
|
|
1383
|
+
"vle8.v v11, (%[q83])\n\t"
|
|
1384
|
+
"vle8.v v12, (%[q40])\n\t"
|
|
1385
|
+
"vand.vi v1, v1, 0xF\n\t"
|
|
1386
|
+
"vsrl.vi v6, v2, 4\n\t"
|
|
1387
|
+
"vand.vi v2, v2, 0xF\n\t"
|
|
1388
|
+
"vwmul.vv v18, v0, v8\n\t"
|
|
1389
|
+
"vle8.v v13, (%[q41])\n\t"
|
|
1390
|
+
"vle8.v v14, (%[q42])\n\t"
|
|
1391
|
+
"vwmul.vv v26, v2, v12\n\t"
|
|
1392
|
+
"vwmul.vv v22, v4, v10\n\t"
|
|
1393
|
+
"vwmul.vv v30, v6, v14\n\t"
|
|
1394
|
+
"vwmacc.vv v18, v1, v9\n\t"
|
|
1395
|
+
"vle8.v v15, (%[q43])\n\t"
|
|
1396
|
+
"vwmacc.vv v26, v3, v13\n\t"
|
|
1397
|
+
"vwmacc.vv v22, v5, v11\n\t"
|
|
1398
|
+
"vwmacc.vv v30, v7, v15\n\t"
|
|
1303
1399
|
"vmv.v.x v0, zero\n\t"
|
|
1304
|
-
"vsetivli zero,
|
|
1305
|
-
"
|
|
1306
|
-
"
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1400
|
+
"vsetivli zero, 16, e16, m2, ta, ma\n\t"
|
|
1401
|
+
"vwredsum.vs v4, v16, v0\n\t"
|
|
1402
|
+
"lbu %[s0], 0(%[scale])\n\t"
|
|
1403
|
+
"vwredsum.vs v5, v20, v0\n\t"
|
|
1404
|
+
"lbu %[s1], 1(%[scale])\n\t"
|
|
1405
|
+
"vwredsum.vs v6, v24, v0\n\t"
|
|
1406
|
+
"lbu %[s2], 2(%[scale])\n\t"
|
|
1407
|
+
"vwredsum.vs v7, v28, v0\n\t"
|
|
1408
|
+
"lbu %[s3], 3(%[scale])\n\t"
|
|
1409
|
+
"vwredsum.vs v8, v18, v0\n\t"
|
|
1410
|
+
"lbu %[q40], 4(%[scale])\n\t"
|
|
1411
|
+
"vwredsum.vs v9, v22, v0\n\t"
|
|
1412
|
+
"lbu %[q41], 5(%[scale])\n\t"
|
|
1413
|
+
"vwredsum.vs v10, v26, v0\n\t"
|
|
1414
|
+
"lbu %[q42], 6(%[scale])\n\t"
|
|
1415
|
+
"vwredsum.vs v11, v30, v0\n\t"
|
|
1416
|
+
"lbu %[q43], 7(%[scale])\n\t"
|
|
1417
|
+
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
|
|
1418
|
+
"vmul.vx v0, v4, %[s0]\n\t"
|
|
1419
|
+
"vmul.vx v1, v8, %[q40]\n\t"
|
|
1420
|
+
"vmacc.vx v0, %[s1], v5\n\t"
|
|
1421
|
+
"vmacc.vx v1, %[q41], v9\n\t"
|
|
1422
|
+
"vmacc.vx v0, %[s2], v6\n\t"
|
|
1423
|
+
"vmacc.vx v1, %[q42], v10\n\t"
|
|
1424
|
+
"vmacc.vx v0, %[s3], v7\n\t"
|
|
1425
|
+
"vmacc.vx v1, %[q43], v11\n\t"
|
|
1426
|
+
"vfcvt.f.x.v v0, v0\n\t"
|
|
1427
|
+
"vfcvt.f.x.v v1, v1\n\t"
|
|
1428
|
+
"vfmv.f.s %[ft2], v0\n\t"
|
|
1429
|
+
"vfmv.f.s %[ftmp], v1\n\t"
|
|
1430
|
+
"fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
|
|
1431
|
+
"fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
|
|
1432
|
+
: [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
|
|
1433
|
+
, [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
|
|
1434
|
+
, [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
|
|
1435
|
+
, [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
|
|
1436
|
+
: [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
|
|
1437
|
+
, [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
|
|
1438
|
+
, [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
|
|
1310
1439
|
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
|
|
1311
1440
|
: "memory"
|
|
1312
1441
|
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
|
@@ -1314,59 +1443,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1314
1443
|
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
|
1315
1444
|
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
|
1316
1445
|
);
|
|
1317
|
-
sumf -= dmin * sumi;
|
|
1318
|
-
|
|
1319
|
-
const uint8_t * restrict q4 = x[i].qs;
|
|
1320
|
-
const int8_t * restrict q8 = y[i].qs;
|
|
1321
|
-
|
|
1322
|
-
sumi = 0;
|
|
1323
|
-
const uint8_t * scale = scales;
|
|
1324
|
-
|
|
1325
|
-
for (int j = 0; j < QK_K/128; ++j) {
|
|
1326
|
-
int vl128 = 128, vl64 = 64, vl32 = 32;
|
|
1327
|
-
__asm__ __volatile__(
|
|
1328
|
-
"vsetvli zero, %[vl128], e8, m8\n\t"
|
|
1329
|
-
"vle8.v v8, (%[q8])\n\t"
|
|
1330
|
-
"vsetvli zero, %[vl64], e8, m4\n\t"
|
|
1331
|
-
"vle8.v v0, (%[q4])\n\t"
|
|
1332
|
-
"vsrl.vi v4, v0, 4\n\t"
|
|
1333
|
-
"vand.vi v0, v0, 0xF\n\t"
|
|
1334
|
-
"vsetvli zero, %[vl32], e8, m2\n\t"
|
|
1335
|
-
"vwmul.vv v28, v6, v14\n\t"
|
|
1336
|
-
"vwmul.vv v20, v4, v10\n\t"
|
|
1337
|
-
"vwmul.vv v24, v2, v12\n\t"
|
|
1338
|
-
"vwmul.vv v16, v0, v8\n\t"
|
|
1339
|
-
"vsetivli zero, 4, e32, m1\n\t"
|
|
1340
|
-
"vle8.v v2, (%[scale])\n\t"
|
|
1341
|
-
"vmv.v.x v0, zero\n\t"
|
|
1342
|
-
"vzext.vf4 v1, v2\n\t"
|
|
1343
|
-
"vsetvli zero, %[vl32], e16, m4\n\t"
|
|
1344
|
-
"vwredsum.vs v6, v24, v0\n\t"
|
|
1345
|
-
"vwredsum.vs v7, v28, v0\n\t"
|
|
1346
|
-
"vwredsum.vs v4, v16, v0\n\t"
|
|
1347
|
-
"vwredsum.vs v5, v20, v0\n\t"
|
|
1348
|
-
"vsetivli zero, 4, e32, m1\n\t"
|
|
1349
|
-
"vslideup.vi v6, v7, 1\n\t"
|
|
1350
|
-
"vslideup.vi v4, v5, 1\n\t"
|
|
1351
|
-
"vslideup.vi v4, v6, 2\n\t"
|
|
1352
|
-
"vmul.vv v8, v4, v1\n\t"
|
|
1353
|
-
"vredsum.vs v0, v8, v0\n\t"
|
|
1354
|
-
"vmv.x.s %[tmp], v0\n\t"
|
|
1355
|
-
"add %[sumi], %[sumi], %[tmp]"
|
|
1356
|
-
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
|
|
1357
|
-
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
|
|
1358
|
-
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
|
|
1359
|
-
: "memory"
|
|
1360
|
-
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
|
1361
|
-
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
|
1362
|
-
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
|
1363
|
-
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
|
1364
|
-
);
|
|
1365
|
-
|
|
1366
|
-
q4 += 64; q8 += 128; scale += 4;
|
|
1367
|
-
}
|
|
1368
|
-
|
|
1369
|
-
sumf += d * sumi;
|
|
1370
1446
|
}
|
|
1371
1447
|
break;
|
|
1372
1448
|
default:
|
|
@@ -1693,6 +1769,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1693
1769
|
case 128:
|
|
1694
1770
|
for (int i = 0; i < nb; ++i) {
|
|
1695
1771
|
|
|
1772
|
+
__builtin_prefetch(&x[i + 1].d, 0, 1);
|
|
1773
|
+
|
|
1696
1774
|
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1697
1775
|
|
|
1698
1776
|
const uint8_t * restrict q6 = x[i].ql;
|
|
@@ -1701,23 +1779,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1701
1779
|
|
|
1702
1780
|
const int8_t * restrict scale = x[i].scales;
|
|
1703
1781
|
|
|
1704
|
-
int
|
|
1705
|
-
|
|
1782
|
+
int q6h;
|
|
1783
|
+
float ftmp;
|
|
1706
1784
|
|
|
1707
1785
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
1708
1786
|
__asm__ __volatile__(
|
|
1787
|
+
"addi %[q6h], %[q6], 32\n\t"
|
|
1788
|
+
"ld t0, 0(%[scale])\n\t"
|
|
1789
|
+
"addi %[scale], %[scale], 8\n\t"
|
|
1790
|
+
"slli t6, t0, 1 * 8\n\t"
|
|
1791
|
+
"lb zero, 0(%[q6])\n\t"
|
|
1792
|
+
"slli t5, t0, 2 * 8\n\t"
|
|
1793
|
+
"slli t4, t0, 3 * 8\n\t"
|
|
1794
|
+
"lb zero, 0(%[q6h])\n\t"
|
|
1795
|
+
"slli t3, t0, 4 * 8\n\t"
|
|
1796
|
+
"slli t2, t0, 5 * 8\n\t"
|
|
1797
|
+
"lb zero, 0(%[qh])\n\t"
|
|
1798
|
+
"lb zero, 31(%[q6h])\n\t"
|
|
1799
|
+
"slli t1, t0, 6 * 8\n\t"
|
|
1800
|
+
"srai a7, t0, 56\n\t"
|
|
1709
1801
|
"vsetvli zero, %[vl32], e8, m2\n\t"
|
|
1802
|
+
"vle8.v v8, (%[q6])\n\t"
|
|
1803
|
+
"srai t6, t6, 56\n\t"
|
|
1804
|
+
"srai t5, t5, 56\n\t"
|
|
1805
|
+
"srai t4, t4, 56\n\t"
|
|
1806
|
+
"srai t3, t3, 56\n\t"
|
|
1807
|
+
"vle8.v v10, (%[q6h])\n\t"
|
|
1808
|
+
"addi %[q6], %[q6], 64\n\t"
|
|
1809
|
+
"slli t0, t0, 7 * 8\n\t"
|
|
1810
|
+
"srai t2, t2, 56\n\t"
|
|
1811
|
+
"srai t1, t1, 56\n\t"
|
|
1812
|
+
"srai t0, t0, 56\n\t"
|
|
1710
1813
|
"vle8.v v4, (%[qh])\n\t"
|
|
1814
|
+
"vsrl.vi v12, v8, 4\n\t"
|
|
1815
|
+
"vsrl.vi v14, v10, 4\n\t"
|
|
1816
|
+
"lb zero, 0(%[q8])\n\t"
|
|
1817
|
+
"vand.vi v8, v8, 0xF\n\t"
|
|
1818
|
+
"vand.vi v10, v10, 0xF\n\t"
|
|
1819
|
+
"lb zero, 32(%[q8])\n\t"
|
|
1711
1820
|
"vsll.vi v0, v4, 4\n\t"
|
|
1712
1821
|
"vsll.vi v2, v4, 2\n\t"
|
|
1822
|
+
"lb zero, 64(%[q8])\n\t"
|
|
1713
1823
|
"vsrl.vi v6, v4, 2\n\t"
|
|
1714
|
-
"vsetvli zero, %[vl64], e8, m4\n\t"
|
|
1715
|
-
"vle8.v v8, (%[q6])\n\t"
|
|
1716
|
-
"vsrl.vi v12, v8, 4\n\t"
|
|
1717
|
-
"vand.vi v8, v8, 0xF\n\t"
|
|
1718
|
-
"vsetvli zero, %[vl128], e8, m8\n\t"
|
|
1719
1824
|
"vand.vx v0, v0, %[mask]\n\t"
|
|
1825
|
+
"lb zero, 96(%[q8])\n\t"
|
|
1826
|
+
"vand.vx v2, v2, %[mask]\n\t"
|
|
1827
|
+
"vand.vx v4, v4, %[mask]\n\t"
|
|
1828
|
+
"vand.vx v6, v6, %[mask]\n\t"
|
|
1720
1829
|
"vor.vv v8, v8, v0\n\t"
|
|
1830
|
+
"lb zero, 127(%[q8])\n\t"
|
|
1831
|
+
"vor.vv v10, v10, v2\n\t"
|
|
1832
|
+
"vor.vv v12, v12, v4\n\t"
|
|
1833
|
+
"vor.vv v14, v14, v6\n\t"
|
|
1834
|
+
"vsetvli zero, %[vl128], e8, m8\n\t"
|
|
1721
1835
|
"vle8.v v0, (%[q8])\n\t"
|
|
1722
1836
|
"vsub.vx v8, v8, %[vl32]\n\t"
|
|
1723
1837
|
"vsetvli zero, %[vl64], e8, m4\n\t"
|
|
@@ -1734,34 +1848,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1734
1848
|
"vwredsum.vs v13, v28, v0\n\t"
|
|
1735
1849
|
"vwredsum.vs v14, v30, v0\n\t"
|
|
1736
1850
|
"vsetivli zero, 4, e32, m1\n\t"
|
|
1737
|
-
"
|
|
1738
|
-
"
|
|
1739
|
-
"
|
|
1740
|
-
"
|
|
1741
|
-
"
|
|
1742
|
-
"
|
|
1743
|
-
"
|
|
1744
|
-
"
|
|
1745
|
-
"
|
|
1746
|
-
"
|
|
1747
|
-
"
|
|
1748
|
-
"
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1851
|
+
"vmul.vx v0, v10, t0\n\t"
|
|
1852
|
+
"vmul.vx v1, v9, t1\n\t"
|
|
1853
|
+
"vmacc.vx v0, t2, v8\n\t"
|
|
1854
|
+
"vmacc.vx v1, t3, v7\n\t"
|
|
1855
|
+
"vmacc.vx v0, t4, v11\n\t"
|
|
1856
|
+
"vmacc.vx v1, t5, v12\n\t"
|
|
1857
|
+
"vmacc.vx v0, t6, v13\n\t"
|
|
1858
|
+
"vmacc.vx v1, a7, v14\n\t"
|
|
1859
|
+
"vadd.vv v0, v0, v1\n\t"
|
|
1860
|
+
"vfcvt.f.x.v v0, v0\n\t"
|
|
1861
|
+
"vfmv.f.s %[ftmp], v0\n\t"
|
|
1862
|
+
"fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
|
|
1863
|
+
: [q6] "+&r" (q6), [q6h] "=&r" (q6h)
|
|
1864
|
+
, [scale] "+&r" (scale)
|
|
1865
|
+
, [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
|
|
1866
|
+
: [qh] "r" (qh), [q8] "r" (q8)
|
|
1752
1867
|
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
|
1753
|
-
, [mask] "r" (0x30)
|
|
1868
|
+
, [mask] "r" (0x30), [d] "f" (d)
|
|
1754
1869
|
: "memory"
|
|
1755
1870
|
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
|
1756
1871
|
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
|
1757
1872
|
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
|
1758
1873
|
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
|
1874
|
+
, "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
|
|
1875
|
+
, "a6", "a5", "a4", "a3"
|
|
1759
1876
|
);
|
|
1760
|
-
|
|
1877
|
+
qh += 32; q8 += 128;
|
|
1761
1878
|
}
|
|
1762
|
-
|
|
1763
|
-
sumf += d * sum_t;
|
|
1764
|
-
|
|
1765
1879
|
}
|
|
1766
1880
|
break;
|
|
1767
1881
|
default:
|
|
@@ -68,12 +68,6 @@ struct ggml_compute_params {
|
|
|
68
68
|
#endif // __VXE2__
|
|
69
69
|
#endif // __s390x__ && __VEC__
|
|
70
70
|
|
|
71
|
-
#if defined(__s390x__) && defined(GGML_NNPA)
|
|
72
|
-
#ifndef __NNPA__
|
|
73
|
-
#define __NNPA__
|
|
74
|
-
#endif // __NNPA__
|
|
75
|
-
#endif // __s390x__ && GGML_NNPA
|
|
76
|
-
|
|
77
71
|
#if defined(__ARM_FEATURE_SVE)
|
|
78
72
|
#include <sys/prctl.h>
|
|
79
73
|
#endif
|
|
@@ -489,7 +483,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
|
|
|
489
483
|
/**
|
|
490
484
|
* @see https://github.com/ggml-org/llama.cpp/pull/14037
|
|
491
485
|
*/
|
|
492
|
-
inline float vec_hsum(float32x4_t v) {
|
|
486
|
+
inline static float vec_hsum(float32x4_t v) {
|
|
493
487
|
float32x4_t v_temp = v + vec_reve(v);
|
|
494
488
|
return v_temp[0] + v_temp[1];
|
|
495
489
|
}
|
|
@@ -1876,6 +1876,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
|
1876
1876
|
{
|
|
1877
1877
|
ggml_compute_forward_im2col_back_f32(params, tensor);
|
|
1878
1878
|
} break;
|
|
1879
|
+
case GGML_OP_IM2COL_3D:
|
|
1880
|
+
{
|
|
1881
|
+
ggml_compute_forward_im2col_3d(params, tensor);
|
|
1882
|
+
} break;
|
|
1879
1883
|
case GGML_OP_CONV_2D:
|
|
1880
1884
|
{
|
|
1881
1885
|
ggml_compute_forward_conv_2d(params, tensor);
|
|
@@ -2255,6 +2259,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
|
2255
2259
|
} break;
|
|
2256
2260
|
case GGML_OP_IM2COL:
|
|
2257
2261
|
case GGML_OP_IM2COL_BACK:
|
|
2262
|
+
case GGML_OP_IM2COL_3D:
|
|
2258
2263
|
case GGML_OP_CONV_2D:
|
|
2259
2264
|
case GGML_OP_CONV_3D:
|
|
2260
2265
|
case GGML_OP_CONV_2D_DW:
|
|
@@ -3206,20 +3211,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
|
|
|
3206
3211
|
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
|
3207
3212
|
_mm_storel_epi64((__m128i *)(y + i), y_vec);
|
|
3208
3213
|
}
|
|
3209
|
-
#elif defined(
|
|
3210
|
-
for (; i
|
|
3211
|
-
|
|
3212
|
-
|
|
3213
|
-
|
|
3214
|
-
|
|
3215
|
-
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
|
|
3216
|
-
}
|
|
3217
|
-
for (; i + 3 < n; i += 4) {
|
|
3218
|
-
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
|
|
3219
|
-
float32x4_t v_zero = vec_splats(0.0f);
|
|
3220
|
-
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
|
|
3221
|
-
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
|
|
3222
|
-
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
|
|
3214
|
+
#elif defined(__riscv_zvfh)
|
|
3215
|
+
for (int vl; i < n; i += vl) {
|
|
3216
|
+
vl = __riscv_vsetvl_e32m2(n - i);
|
|
3217
|
+
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
|
|
3218
|
+
vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
|
|
3219
|
+
__riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
|
|
3223
3220
|
}
|
|
3224
3221
|
#endif
|
|
3225
3222
|
for (; i < n; ++i) {
|
|
@@ -3247,21 +3244,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
|
|
|
3247
3244
|
__m128 y_vec = _mm_cvtph_ps(x_vec);
|
|
3248
3245
|
_mm_storeu_ps(y + i, y_vec);
|
|
3249
3246
|
}
|
|
3250
|
-
#elif defined(__NNPA__)
|
|
3251
|
-
for (; i + 7 < n; i += 8) {
|
|
3252
|
-
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
|
|
3253
|
-
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
|
|
3254
|
-
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
|
|
3255
|
-
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
|
|
3256
|
-
vec_xst(v_yh, 0, (float *)(y + i + 0));
|
|
3257
|
-
vec_xst(v_yl, 0, (float *)(y + i + 4));
|
|
3258
|
-
}
|
|
3259
|
-
for (; i + 3 < n; i += 4) {
|
|
3260
|
-
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
|
|
3261
|
-
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
|
|
3262
|
-
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
|
|
3263
|
-
vec_xst(v_yh, 0, (float *)(y + i));
|
|
3264
|
-
}
|
|
3265
3247
|
#endif
|
|
3266
3248
|
|
|
3267
3249
|
for (; i < n; ++i) {
|
|
@@ -3465,14 +3447,6 @@ int ggml_cpu_has_vxe(void) {
|
|
|
3465
3447
|
#endif
|
|
3466
3448
|
}
|
|
3467
3449
|
|
|
3468
|
-
int ggml_cpu_has_nnpa(void) {
|
|
3469
|
-
#if defined(GGML_NNPA)
|
|
3470
|
-
return 1;
|
|
3471
|
-
#else
|
|
3472
|
-
return 0;
|
|
3473
|
-
#endif
|
|
3474
|
-
}
|
|
3475
|
-
|
|
3476
3450
|
int ggml_cpu_has_neon(void) {
|
|
3477
3451
|
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
|
|
3478
3452
|
return 1;
|