llama_cpp 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +165 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +217 -76
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +16 -5
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1082 -774
- data/ext/llama_cpp/src/ggml.h +64 -18
- data/ext/llama_cpp/src/llama.cpp +179 -51
- data/ext/llama_cpp/src/llama.h +15 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +3 -1
- metadata +4 -2
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -59,8 +59,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
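The recurring change in this file is the addition of `__restrict__` to kernel pointer parameters. As a rough illustration (our own sketch, not code from the gem), the qualifier tells the compiler the pointers never alias, so loaded values can stay in registers:

```cpp
// Illustrative kernel, not part of llama_cpp/ggml. With __restrict__, the
// compiler may assume x, y and dst never overlap, so it can reorder and
// cache loads instead of conservatively re-reading memory after each store.
__global__ void scale_add(const float * __restrict__ x,
                          const float * __restrict__ y,
                          float * __restrict__ dst,
                          const float a, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < k) {
        dst[i] = a*x[i] + y[i];
    }
}
```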
@@ -131,7 +131,7 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
 
-typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
 
 //================================= k-quants
 
@@ -208,9 +208,11 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -265,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }
 
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
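The new `gelu_f32` kernel uses the usual tanh approximation of GELU, gelu(x) ≈ 0.5·x·(1 + tanh(√(2/π)·x·(1 + 0.044715·x²))). A host-side reference of the same formula (our own sketch, useful only for spot-checking the kernel):

```cpp
#include <cmath>
#include <cstdio>

// Host reference for the tanh-approximated GELU computed by gelu_f32.
static float gelu_ref(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

int main() {
    for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f}) {
        printf("gelu(% .2f) = % .6f\n", x, gelu_ref(x));
    }
    return 0;
}
```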
@@ -274,16 +289,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-
+    const float eps = 1e-6f;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
@@ -295,10 +340,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     const float mean = tmp / ncols;
-    const float scale =
+    const float scale = rsqrtf(mean + eps);
 
-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
@@ -407,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 
 //================================== k-quants
 
-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
 
 }
 
-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif
 
-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;
 
     const int i = blockIdx.x;
@@ -544,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;
 
     const int i = blockIdx.x;
@@ -590,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = blockIdx.x;
@@ -634,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -742,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -846,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -949,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
 
     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -1171,7 +1215,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -1180,10 +1224,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i /
-    const int iqs = i %
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
 
-    const float xi = x[i];
+    const float xi = i < ndata ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
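`quantize_q8_1` now receives both the real element count (`ndata`) and the padded length (`k`); indices in `[ndata, k)` quantize an implicit 0.0f, so the padded tail blocks are all zeros. A trivial standalone illustration of that split (our own numbers):

```cpp
#include <cstdio>

// The ndata/k split: only the first ndata inputs are real; the rest of the
// padded row is treated as zero so it cannot perturb later dot products.
int main() {
    const int ndata = 5;                          // real elements
    const int k     = 8;                          // padded length
    const float x[5] = {1.f, -2.f, 3.f, -4.f, 5.f};

    for (int i = 0; i < k; ++i) {
        const float xi = i < ndata ? x[i] : 0.0f; // same guard as the kernel
        printf("i=%d xi=%g\n", i, xi);
    }
    return 0;
}
```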
@@ -1207,7 +1251,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
     if (i >= k) {
@@ -1227,8 +1271,8 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1249,11 +1293,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1274,11 +1318,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;
@@ -1309,11 +1353,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1387,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >=
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1362,11 +1406,11 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= 610
 }
 
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
@@ -1404,7 +1448,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 }
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {
 
     const half * x = (const half *) vx;
@@ -1703,20 +1747,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2236,16 +2291,21 @@ inline void ggml_cuda_op_add(
 
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);
 
-
+    // TODO: support broadcasting
+    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
+
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    // const int64_t ne10 = src1->ne[0];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i,
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i,
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2264,10 +2324,9 @@ inline void ggml_cuda_op_mul(
 
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
@@ -2276,7 +2335,7 @@ inline void ggml_cuda_op_mul(
 
         float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
         float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01
+        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
@@ -2287,6 +2346,28 @@ inline void ggml_cuda_op_mul(
     (void) i02;
 }
 
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2309,6 +2390,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }
 
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2355,16 +2458,15 @@ inline void ggml_cuda_op_mul_mat_vec(
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
 
-
-    // However, they have bad performance with Pascal cards.
-    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -2925,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }
 
+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }
 
+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3108,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3253,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;
 
         const size_t offset_split = row_low*nb1;
-
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
 
-
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
 
-
+
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
 
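`ggml_cuda_transform_tensor` now over-allocates the device buffer so the last row of a quantized matrix is padded to a multiple of `MATRIX_ROW_PADDING` values, and zeroes that padding with `cudaMemset`. A worked example of the extra-byte computation (our numbers; the 18-byte/32-value block is a Q4_0-like assumption, the real values come from `ggml_type_size()`/`ggml_blck_size()`):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne0                = 4001; // hypothetical row length
    const int64_t MATRIX_ROW_PADDING = 256;
    const size_t  type_size          = 18;   // bytes per quant block (assumed)
    const int64_t blck_size          = 32;   // values per quant block (assumed)

    size_t extra = 0;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        extra = (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) * type_size / blck_size;
    }
    // 95 padding values -> 53 extra bytes, later cleared with cudaMemset
    printf("extra bytes = %zu\n", extra);
    return 0;
}
```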
@@ -3183,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->
-        const ggml_op src0_op = tensor->
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
-    if (tensor->op == GGML_OP_CPY && tensor->
-        ggml_cuda_assign_buffers_impl(tensor->
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
-    const bool inplace = (tensor->
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && (tensor->
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->
+            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
@@ -3283,8 +3412,8 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->
-        || (tensor->
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
         case GGML_OP_ADD:
@@ -3299,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_GELU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_gelu;
+            break;
         case GGML_OP_SILU:
             if (!any_on_device) {
                 return false;
             }
             func = ggml_cuda_silu;
             break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_norm;
+            break;
         case GGML_OP_RMS_NORM:
             if (!any_on_device) {
                 return false;
@@ -3312,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
             func = ggml_cuda_mul_mat;
@@ -3366,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return true;
     }
-    func(tensor->
+    func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }