cui-llama.rn 1.2.3 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +0 -3
- package/android/src/main/jni.cpp +9 -11
- package/cpp/common.cpp +85 -75
- package/cpp/common.h +127 -91
- package/cpp/ggml-aarch64.c +269 -0
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend-impl.h +4 -15
- package/cpp/ggml-backend.cpp +1697 -1626
- package/cpp/ggml-backend.h +13 -25
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu.c +13720 -0
- package/cpp/ggml-cpu.h +150 -0
- package/cpp/ggml-impl.h +95 -0
- package/cpp/ggml-metal.m +185 -71
- package/cpp/ggml-quants.c +38 -51
- package/cpp/ggml.c +4468 -19500
- package/cpp/ggml.h +26 -146
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +742 -249
- package/cpp/llama-sampling.h +21 -2
- package/cpp/llama-vocab.cpp +49 -9
- package/cpp/llama-vocab.h +35 -11
- package/cpp/llama.cpp +2468 -2307
- package/cpp/llama.h +65 -32
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +117 -118
- package/cpp/sampling.h +20 -20
- package/cpp/sgemm.cpp +57 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +0 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +0 -1
package/cpp/ggml-aarch64.c
CHANGED
@@ -7,6 +7,7 @@
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
+#include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
 
 #include <math.h>
@@ -991,6 +992,73 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
         }
     }
     return;
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+            for (int l = 0; l < nb; l++) {
+                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+                const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+                const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+                const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
+
+                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
+                const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                // vector version needs Zvfhmin extension
+                const float a_scale = LM_GGML_FP16_TO_FP32(a_ptr[l].d);
+                const float b_scales[8] = {
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                    LM_GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                };
+                const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
+                sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
+            }
+            __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
+        }
+        return;
+    }
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
     {
         float sumf[8];
@@ -3171,6 +3239,207 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
                 }
             }
         }
+        return;
+    }
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+                vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                for (int l = 0; l < nb; l++) {
+                    const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                    const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                    const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                    const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                    const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                    const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                    const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                    // vector version needs Zvfhmin extension
+                    const float a_scales[4] = {
+                        LM_GGML_FP16_TO_FP32(a_ptr[l].d[0]),
+                        LM_GGML_FP16_TO_FP32(a_ptr[l].d[1]),
+                        LM_GGML_FP16_TO_FP32(a_ptr[l].d[2]),
+                        LM_GGML_FP16_TO_FP32(a_ptr[l].d[3])
+                    };
+                    const float b_scales[8] = {
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                        LM_GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                    };
+                    const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+
+                    const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
+                    const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
+                    const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
+                    const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l0;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l0 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
+                        sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
+                    const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
+                    const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
+                    const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l1;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l1 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
+                        sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
+                    const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
+                    const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
+                    const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l2;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l2 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
+                        sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
+                    const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
+                    const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
+                    const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l3;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l3 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
+                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
+                    }
+                }
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
+            }
+        }
+
         return;
     }
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
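
Editor's note: the new RISC-V path above is gated twice, at compile time by the __riscv_v_intrinsic feature macro and at run time by __riscv_vlenb(), which returns the hardware vector length in bytes; the block size QK4_0 is 32 in ggml, so the kernel requires VLEN >= 256 bits. A minimal stand-alone sketch of that gating pattern follows; the function name gemv_dispatch and the printed messages are illustrative only, not part of the package.

#include <stdio.h>

#if defined(__riscv_v_intrinsic)
#include <riscv_vector.h>          // provides __riscv_vlenb() and the RVV intrinsics
#endif

#define QK4_0 32                   // Q4_0 block size, as defined in ggml

// Hypothetical dispatcher mirroring the guard used in the diff: compile-time
// availability of the RVV intrinsics plus a run-time vector-length check.
static void gemv_dispatch(void) {
#if defined(__riscv_v_intrinsic)
    if (__riscv_vlenb() >= QK4_0) {   // __riscv_vlenb() = VLEN / 8
        puts("using RVV kernel");
        return;
    }
#endif
    puts("using scalar fallback");    // generic C path, kept below the #endif
}

int main(void) {
    gemv_dispatch();
    return 0;
}

On non-RISC-V builds the preprocessor removes the vector branch entirely, which is why the scalar fallback in ggml-aarch64.c stays outside the guarded block.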
package/cpp/ggml-alloc.c
CHANGED
@@ -14,7 +14,7 @@
 
 //#define LM_GGML_ALLOCATOR_DEBUG
 
-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) LM_GGML_LOG_DEBUG(__VA_ARGS__)
 #define AT_PRINTF(...)
 
 
@@ -89,7 +89,7 @@ void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * talloc, struct lm_ggml_tenso
     size = LM_GGML_PAD(size, talloc->alignment);
 
     if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) {
-        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         LM_GGML_ABORT("not enough space in the buffer");
     }
@@ -172,7 +172,7 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
            best_fit_block = alloc->n_free_blocks - 1;
        } else {
            // this should never happen
-           fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+           LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                    __func__, size, max_avail);
            LM_GGML_ABORT("not enough space in the buffer");
        }
@@ -209,16 +209,16 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
                }
            }
        }
-       fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+       LM_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
-               fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+               LM_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                    alloc->allocated_tensors[i].offset,
                    alloc->allocated_tensors[i].offset + lm_ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    lm_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
-       fprintf(stderr, "\n");
+       LM_GGML_LOG_DEBUG("\n");
    }
 #endif
 
@@ -348,7 +348,6 @@ struct tensor_alloc {
 };
 
 struct leaf_alloc {
-    int buffer_id;
     struct tensor_alloc leaf;
 };
 
@@ -740,7 +739,6 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
     for (int i = 0; i < graph->n_leafs; i++) {
         struct lm_ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -768,13 +766,13 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
         if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            LM_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
 
             lm_ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = lm_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
-                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                LM_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
             lm_ggml_backend_buffer_set_usage(galloc->buffers[i], LM_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +823,14 @@ static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct
 static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph) {
     if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+        LM_GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
 #endif
         return true;
     }
 
     if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+        LM_GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
 #endif
         return true;
     }
@@ -843,7 +841,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg
 
         if (!lm_ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+            LM_GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
 #endif
             return true;
         }
@@ -855,7 +853,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg
             }
             if (!lm_ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
-                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+                LM_GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
                 return true;
             }
@@ -869,14 +867,14 @@ bool lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph
     if (lm_ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+            LM_GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
             if (!lm_ggml_gallocr_reserve(galloc, graph)) {
                 return false;
             }
         } else {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+            LM_GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 #endif
             return false;
         }
@@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct lm_ggml_context * ctx,
     lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
+        LM_GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
             lm_ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +988,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
         }
 
         if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+            LM_GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                     __func__, t->name,
                     lm_ggml_backend_buft_name(buft),
                     this_size, max_size);
@@ -1022,7 +1020,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
 
     if (n_buffers == 0) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+        LM_GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
         return NULL;
     }
package/cpp/ggml-backend-impl.h
CHANGED
@@ -22,7 +22,7 @@ extern "C" {
         size_t (*get_max_size) (lm_ggml_backend_buffer_type_t buft);
         // (optional) data size needed to allocate the tensor, including padding (defaults to lm_ggml_nbytes)
         size_t (*get_alloc_size)(lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory (defaults to false)
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
         bool (*is_host) (lm_ggml_backend_buffer_type_t buft);
     };
 
@@ -37,7 +37,6 @@ extern "C" {
     //
 
     struct lm_ggml_backend_buffer_i {
-        const char * (*get_name) (lm_ggml_backend_buffer_t buffer);
         // (optional) free the buffer
         void (*free_buffer) (lm_ggml_backend_buffer_t buffer);
         // base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
 
         void (*free)(lm_ggml_backend_t backend);
 
-        // Will be moved to the device interface
-        // buffer allocation
-        lm_ggml_backend_buffer_type_t (*get_default_buffer_type)(lm_ggml_backend_t backend);
-
         // (optional) asynchronous tensor data access
         void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
        bool (*cpy_tensor_async)(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst);
 
-        // (optional) complete all pending operations
+        // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(lm_ggml_backend_t backend);
 
-        // (optional)
+        // (optional) graph plans (not used currently)
+        // compute graph with a plan
         lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph);
         void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
         // compute graph (always async if supported by the backend)
         enum lm_ggml_status (*graph_compute) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph);
 
-        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
-        // new backends should implement the device interface instead
-        // These functions are being moved to the device interface
-        bool (*supports_op) (lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);
-        bool (*supports_buft)(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft);
-        bool (*offload_op) (lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);
-
         // (optional) event synchronization
         // record an event on this stream
         void (*event_record)(lm_ggml_backend_t backend, lm_ggml_backend_event_t event);