llama_cpp 0.3.2 → 0.3.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +165 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +217 -76
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +16 -5
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1082 -774
- data/ext/llama_cpp/src/ggml.h +64 -18
- data/ext/llama_cpp/src/llama.cpp +179 -51
- data/ext/llama_cpp/src/llama.h +15 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +3 -1
- metadata +4 -2
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -59,8 +59,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16

 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
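Note: this hunk, like many of the signature-only hunks below, just adds __restrict__ qualifiers to pointer parameters. A minimal standalone sketch of why that helps (illustrative kernel, not code from the gem): with __restrict__ the compiler may assume the pointers never alias, so a load such as x[i] can stay in a register instead of being re-read after every store through dst.

    // Toy kernel for illustration only; dst is assumed to hold 2*k elements.
    // With __restrict__, x[i] can be cached in a register across both stores,
    // because dst and x are promised not to overlap.
    static __global__ void scale_twice(const float * __restrict__ x, float * __restrict__ dst, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[2*i + 0] = 2.0f*x[i]; // without __restrict__ this store...
        dst[2*i + 1] = 3.0f*x[i]; // ...would force x[i] to be reloaded here
    }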
@@ -131,7 +131,7 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

-typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);

 //================================= k-quants

@@ -208,9 +208,11 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -265,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }

+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

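Note: the new gelu_f32 kernel evaluates the common tanh approximation of GELU, 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))), element-wise. A host-side reference in the same form can be handy for spot-checking a kernel output (standalone sketch, not code from the gem):

    #include <math.h>
    #include <stdio.h>

    // CPU reference of the same tanh-based GELU approximation used by gelu_f32.
    static float gelu_ref(float x) {
        const float GELU_COEF_A    = 0.044715f;
        const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
        return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
    }

    int main() {
        printf("gelu(1.0f) = %f\n", gelu_ref(1.0f)); // ~0.8412
        return 0;
    }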
@@ -274,16 +289,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    const float eps = 1e-
+    const float eps = 1e-6f;

     float tmp = 0.0f; // partial sum for thread in warp

-    for (int
-    const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
@@ -295,10 +340,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }

     const float mean = tmp / ncols;
-    const float scale =
+    const float scale = rsqrtf(mean + eps);

-    for (int
-    const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
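Note: both norm_f32 and the reworked rms_norm_f32 accumulate a per-thread partial sum over the row and then reduce it across the 32 lanes of a warp with a butterfly of __shfl_xor_sync calls (5 steps for a 32-wide warp). The same pattern in isolation (illustrative device helper, not code from the gem):

    // After the loop every lane of the warp holds the sum of all 32 inputs:
    // XOR-ing the lane id with 16, 8, 4, 2, 1 pairs lanes up in a butterfly.
    static __device__ __forceinline__ float warp_reduce_sum(float val) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            val += __shfl_xor_sync(0xffffffff, val, mask, 32);
        }
        return val;
    }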
@@ -407,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

 //================================== k-quants

-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

 }

-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif

-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;

     const int i = blockIdx.x;
@@ -544,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
 #endif
 }

-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;

     const int i = blockIdx.x;
@@ -590,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 #endif
 }

-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = blockIdx.x;
@@ -634,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
 #endif
 }

-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -742,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -846,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -949,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -1171,7 +1215,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

-static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= k) {
@@ -1180,10 +1224,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {

     block_q8_1 * y = (block_q8_1 *) vy;

-    const int ib = i /
-    const int iqs = i %
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index

-    const float xi = x[i];
+    const float xi = i < ndata ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;

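Note: quantize_q8_1 now takes both ndata (the number of real elements) and k (the padded length); threads whose index falls at or beyond ndata quantize a zero, so the padded tail of the row contributes nothing to later dot products. A standalone sketch of the index math, assuming QK8_1 == 32 as in ggml (illustrative values, not code from the gem):

    #include <stdio.h>

    // Element i of the padded input maps to block i/32 and slot i%32; anything
    // at or past ndata is treated as zero padding.
    int main() {
        const int QK8_1 = 32;
        const int ndata = 68;                          // real elements in the row
        const int samples[] = {0, 31, 32, 67, 68, 95}; // 96 = row padded to a block multiple
        for (int i : samples) {
            const int ib  = i / QK8_1;                 // block index
            const int iqs = i % QK8_1;                 // quant index inside the block
            printf("i=%2d -> block %d, slot %2d, %s\n", i, ib, iqs,
                   i < ndata ? "data" : "zero padding");
        }
        return 0;
    }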
@@ -1207,7 +1251,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

     if (i >= k) {
@@ -1227,8 +1271,8 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }

-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

     int vi;
@@ -1249,11 +1293,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }

-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1274,11 +1318,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }

-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

     int qs;
@@ -1309,11 +1353,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }

-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1387,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }

-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

     int vi;
@@ -1362,11 +1406,11 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }

 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;

     if (row >= nrows) {
@@ -1404,7 +1448,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 }

-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;

     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 }

 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {

     const half * x = (const half *) vx;
@@ -1703,20 +1747,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }

+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }

-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
 }

 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
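Note: gelu_f32_cuda, silu_f32_cuda and the other launchers size the grid with the usual ceiling division (k + BLOCK_SIZE - 1) / BLOCK_SIZE, so the last, possibly partial block is still launched, and the i >= k guard inside each kernel discards the surplus threads. The same idiom in isolation (illustrative sketch, not code from the gem):

    #include <stdio.h>

    // Enough blocks of size `block` to cover k elements, with at most
    // block-1 idle threads in the final block.
    static int ceil_div(int k, int block) {
        return (k + block - 1) / block;
    }

    int main() {
        printf("%d\n", ceil_div(1000, 256)); // 4 blocks -> 1024 threads for 1000 elements
        printf("%d\n", ceil_div(1024, 256)); // exactly 4 blocks, no idle threads
        return 0;
    }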
@@ -2236,16 +2291,21 @@ inline void ggml_cuda_op_add(

     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);

-
+    // TODO: support broadcasting
+    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
+
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;

+    // const int64_t ne10 = src1->ne[0];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i,
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i,
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2264,10 +2324,9 @@ inline void ggml_cuda_op_mul(

     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);

     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];

@@ -2276,7 +2335,7 @@ inline void ggml_cuda_op_mul(

         float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
         float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01
+        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;

         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
@@ -2287,6 +2346,28 @@ inline void ggml_cuda_op_mul(
     (void) i02;
 }

+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2309,6 +2390,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }

+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2355,16 +2458,15 @@ inline void ggml_cuda_op_mul_mat_vec(
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;

-
-    // However, they have bad performance with Pascal cards.
-    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
 #endif

     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);

         switch (src0->type) {
             case GGML_TYPE_Q4_0:
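Note: padded_row_size rounds ne00 up to the next multiple of MATRIX_ROW_PADDING (256) with the add-then-truncate idiom, and the q8_1 scratch buffer is then sized for the padded row so the integer dot-product kernels can read whole blocks past the logical end of the row without going out of bounds. The rounding on its own (illustrative sketch, not code from the gem):

    #include <stdint.h>
    #include <stdio.h>

    // Round n up to the next multiple of `pad` (the MATRIX_ROW_PADDING idiom above):
    // add pad-1, then strip the remainder.
    static int64_t round_up(int64_t n, int64_t pad) {
        int64_t padded = n + pad - 1;
        return padded - padded % pad;
    }

    int main() {
        printf("%lld\n", (long long) round_up(4096, 256)); // 4096: already aligned
        printf("%lld\n", (long long) round_up(4097, 256)); // 4352: next multiple of 256
        return 0;
    }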
@@ -2925,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }

+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }

+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3108,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3253,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;

         const size_t offset_split = row_low*nb1;
-
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }

-
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }

-
+
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

         extra->data_device[id] = buf;

@@ -3183,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }

     // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->
-        const ggml_op src0_op = tensor->
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
-    if (tensor->op == GGML_OP_CPY && tensor->
-        ggml_cuda_assign_buffers_impl(tensor->
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
     }

     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));

-    const bool inplace = (tensor->
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
         force_inplace;
     const size_t size = ggml_nbytes(tensor);

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && (tensor->
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->
+            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
@@ -3283,8 +3412,8 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->
-        || (tensor->
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

     switch (tensor->op) {
         case GGML_OP_ADD:
@@ -3299,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_GELU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_gelu;
+            break;
         case GGML_OP_SILU:
             if (!any_on_device) {
                 return false;
             }
             func = ggml_cuda_silu;
             break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_norm;
+            break;
         case GGML_OP_RMS_NORM:
             if (!any_on_device) {
                 return false;
@@ -3312,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
             func = ggml_cuda_mul_mat;
@@ -3366,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return true;
     }
-    func(tensor->
+    func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }