llama_cpp 0.3.1 → 0.3.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +11 -2
- data/ext/llama_cpp/llama_cpp.cpp +284 -111
- data/ext/llama_cpp/src/ggml-cuda.cu +639 -148
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +19 -6
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +1734 -2248
- data/ext/llama_cpp/src/ggml.h +152 -80
- data/ext/llama_cpp/src/llama.cpp +282 -90
- data/ext/llama_cpp/src/llama.h +30 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -13
- data/sig/llama_cpp.rbs +22 -2
- metadata +5 -2
data/ext/llama_cpp/src/ggml-cuda.cu

```diff
@@ -59,8 +59,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
 
 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d;                 // delta
     half m;                 // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d;                 // delta
     uint8_t qh[4];          // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d;                 // delta
     half m;                 // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half    d;              // delta
     int8_t  qs[QK8_0];      // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half    d;              // delta
+    half    s;              // unquantized sum
+    int8_t  qs[QK8_0];      // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
+
 //================================= k-quants
 
 #ifdef GGML_QKK_64
```
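The new `block_q8_1` type stores, per block of 32 values, a scale `d` and the unquantized sum `s`, which the `*_1` dot products below use to fold in the per-block minimum. The host-side sketch below restates what the `quantize_q8_1` CUDA kernel later in this diff computes for one block; it is an illustration only, not code from the gem, and the names `BlockQ81` and `quantize_block_q8_1_ref` are hypothetical.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

constexpr int QK8_1 = 32;

struct BlockQ81 {
    float  d;          // delta (stored as half on the GPU)
    float  s;          // unquantized sum of the block's values
    int8_t qs[QK8_1];  // quants
};

// Reference quantization of one block of QK8_1 floats:
// d = max|x_i| / 127, q_i = round(x_i / d), s = sum of x_i.
BlockQ81 quantize_block_q8_1_ref(const float * x) {
    BlockQ81 y{};
    float amax = 0.0f;
    for (int i = 0; i < QK8_1; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
        y.s += x[i];
    }
    y.d = amax / 127.0f;
    for (int i = 0; i < QK8_1; ++i) {
        y.qs[i] = amax == 0.0f ? 0 : (int8_t) std::roundf(x[i] / y.d);
    }
    return y;
}
```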
```diff
@@ -190,22 +208,25 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef
-#define
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif
 
 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +235,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -241,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }
 
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
```
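The added `gelu_f32` kernel evaluates the usual tanh approximation of GELU; written out, this is exactly what the kernel body computes (the formula restates the code above, it is not additional material from the gem):

$$\mathrm{GELU}(x) \approx 0.5\,x\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\left(x + 0.044715\,x^{3}\right)\right)\right), \qquad \sqrt{\tfrac{2}{\pi}} \approx 0.7978845608.$$

`SQRT_2_OVER_PI` and `GELU_COEF_A` in the kernel are these two constants.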
```diff
@@ -250,32 +289,60 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-
+    const float eps = 1e-6f;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }
 
     const float mean = tmp / ncols;
-    const float scale =
+    const float scale = rsqrtf(mean + eps);
 
-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
```
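Both the new `norm_f32` kernel and the reworked `rms_norm_f32` reduce their per-thread partial sums entirely within one warp using `__shfl_xor_sync`, which is also why the `__syncthreads()` calls before the unrolled shuffle loops are dropped throughout this diff: the butterfly exchange needs no shared memory and no block-wide barrier. A minimal standalone sketch of that reduction, assuming a full 32-thread warp (this helper does not exist in the gem):

```cuda
// After log2(32) = 5 XOR-shuffle steps every lane holds the sum of all
// 32 lanes' values, so no shared memory or __syncthreads() is required.
__device__ float warp_reduce_sum(float v) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, mask, 32);
    }
    return v;
}
```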
```diff
@@ -384,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 
 //================================== k-quants
 
-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -417,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
 
 }
 
-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -481,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif
 
-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;
 
     const int i = blockIdx.x;
@@ -521,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;
 
     const int i = blockIdx.x;
@@ -567,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = blockIdx.x;
@@ -611,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -709,7 +776,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -720,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -814,7 +880,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -825,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -918,7 +983,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -929,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
 
     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1023,7 +1087,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1034,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -1134,7 +1197,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1153,8 +1215,43 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
+
+    const float xi = i < ndata ? x[i] : 0.0f;
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
     if (i >= k) {
```
```diff
@@ -1174,8 +1271,184 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
```
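The new `vec_dot_*_q8_1` helpers are guarded by `__CUDA_ARCH__ >= 610` because they rely on the `__dp4a` intrinsic, which treats each 32-bit operand as four packed 8-bit values, multiplies them pairwise, and adds the four products to an accumulator; `__vsub4` likewise subtracts per byte, which is how the 4- and 5-bit quants are re-centered before the dot product. A scalar sketch of the signed `__dp4a` semantics, for illustration only (the name `dp4a_ref` is hypothetical and not part of the gem):

```cpp
#include <cstdint>

// Scalar equivalent of __dp4a(a, b, c) with signed 8-bit lanes:
// c + a0*b0 + a1*b1 + a2*b2 + a3*b3, where ai/bi are the bytes of a/b.
int dp4a_ref(int a, int b, int c) {
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)(a >> (8 * i));
        const int8_t bi = (int8_t)(b >> (8 * i));
        c += (int)ai * (int)bi;
    }
    return c;
}
```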
```diff
@@ -1228,7 +1501,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1243,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1279,7 +1551,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1291,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 }
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {
 
     const half * x = (const half *) vx;
@@ -1325,7 +1596,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1435,7 +1705,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1478,17 +1747,33 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1557,45 +1842,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1642,6 +1927,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1649,9 +1979,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1817,6 +2147,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1834,9 +2165,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s\n", id, prop.name);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -1957,20 +2291,24 @@ inline void ggml_cuda_op_add(
 
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    // TODO: support broadcasting
+    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
 
-    const int64_t
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
+
+    // const int64_t ne10 = src1->ne[0];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i,
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i,
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -1986,10 +2324,9 @@ inline void ggml_cuda_op_mul(
 
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
@@ -1998,11 +2335,10 @@ inline void ggml_cuda_op_mul(
 
         float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
         float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01
+        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }
 
     (void) dst;
@@ -2010,6 +2346,28 @@ inline void ggml_cuda_op_mul(
     (void) i02;
 }
 
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2023,7 +2381,28 @@ inline void ggml_cuda_op_silu(
 
     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
 
     (void) src1;
     (void) dst;
@@ -2046,7 +2425,6 @@ inline void ggml_cuda_op_rms_norm(
 
     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2056,7 +2434,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
-inline void
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
```
```diff
@@ -2068,70 +2446,115 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
- … (removed lines 2071-2074; contents not shown in this view)
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
 
-    bool
-        src0->type ==
-        src0->type ==
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;
 
- … (removed lines 2080-2085; contents not shown in this view)
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                ne00, 1, sizeof(float), 0, 0,
+                ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
- … (removed line 2087; contents not shown in this view)
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16
 
- … (removed lines 2090-2127; contents not shown in this view)
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
 #ifdef GGML_CUDA_DMMV_F16
- … (removed lines 2131-2132; contents not shown in this view)
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }
 
     (void) src1;
     (void) dst;
```
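In the `use_mul_mat_vec_q` path above, `padded_row_size` rounds `ne00` up to the next multiple of `MATRIX_ROW_PADDING` (256) so that the q8_1 quantization of `src1` always works on whole blocks without running past the end of the row. This is the standard "add divisor minus one, drop the remainder" round-up idiom; the sketch below only illustrates that arithmetic, and the helper name is made up for the example.

```cpp
#include <cstdint>

// Round n up to the next multiple of p (p > 0); the same arithmetic as
// the padded_row_size computation in ggml_cuda_op_mul_mat_vec above.
int64_t round_up_to_multiple(int64_t n, int64_t p) {
    int64_t padded = n + p - 1;
    padded -= padded % p;
    return padded;
}
// e.g. round_up_to_multiple(4096, 256) == 4096, round_up_to_multiple(4097, 256) == 4352
```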
@@ -2202,7 +2625,6 @@ inline void ggml_cuda_op_rope(
|
|
2202
2625
|
|
2203
2626
|
// compute
|
2204
2627
|
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
|
2205
|
-
CUDA_CHECK(cudaGetLastError());
|
2206
2628
|
|
2207
2629
|
(void) dst;
|
2208
2630
|
(void) src0_ddq_i;
|
@@ -2226,7 +2648,6 @@ inline void ggml_cuda_op_diag_mask_inf(
|
|
2226
2648
|
|
2227
2649
|
// compute
|
2228
2650
|
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
|
2229
|
-
CUDA_CHECK(cudaGetLastError());
|
2230
2651
|
|
2231
2652
|
(void) dst;
|
2232
2653
|
(void) src0_ddq_i;
|
@@ -2248,7 +2669,6 @@ inline void ggml_cuda_op_soft_max(
|
|
2248
2669
|
|
2249
2670
|
// compute
|
2250
2671
|
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
2251
|
-
CUDA_CHECK(cudaGetLastError());
|
2252
2672
|
|
2253
2673
|
(void) src1;
|
2254
2674
|
(void) dst;
|
@@ -2344,10 +2764,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2344
2764
|
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
2345
2765
|
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
2346
2766
|
|
2347
|
-
// if multiple
|
2767
|
+
// if multiple devices are used they need to wait for the main device
|
2768
|
+
// here an event is recorded that signifies that the main device has finished calculating the input data
|
2348
2769
|
if (split && g_device_count > 1) {
|
2349
2770
|
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2350
|
-
CUDA_CHECK(
|
2771
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
|
2351
2772
|
}
|
2352
2773
|
|
2353
2774
|
for (int id = 0; id < g_device_count; ++id) {
|
@@ -2373,6 +2794,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2373
2794
|
int64_t row_diff = row_high - row_low;
|
2374
2795
|
|
2375
2796
|
cudaSetDevice(id);
|
2797
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
2798
|
+
|
2799
|
+
// wait for main GPU data if necessary
|
2800
|
+
if (split && id != g_main_device) {
|
2801
|
+
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
|
2802
|
+
}
|
2376
2803
|
|
2377
2804
|
if (src0_on_device && src0_is_contiguous) {
|
2378
2805
|
if (src0_is_f32) {
|
@@ -2448,8 +2875,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2448
2875
|
}
|
2449
2876
|
const int64_t i11 = i13*ne12 + i12;
|
2450
2877
|
|
2451
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
2452
|
-
|
2453
2878
|
// for split tensors the data begins at i0 == i0_offset_low
|
2454
2879
|
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
|
2455
2880
|
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
|
@@ -2509,6 +2934,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2509
2934
|
|
2510
2935
|
// do the computation
|
2511
2936
|
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
|
2937
|
+
CUDA_CHECK(cudaGetLastError());
|
2512
2938
|
|
2513
2939
|
// copy dst to host or other device if necessary
|
2514
2940
|
if (!dst_on_device) {
|
@@ -2538,6 +2964,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2538
2964
|
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
|
2539
2965
|
}
|
2540
2966
|
}
|
2967
|
+
|
2968
|
+
// signify to main device that other device is done
|
2969
|
+
if (split && g_device_count > 1 && id != g_main_device) {
|
2970
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
|
2971
|
+
}
|
2541
2972
|
}
|
2542
2973
|
}
|
2543
2974
|
}
|
@@ -2549,7 +2980,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2549
2980
|
}
|
2550
2981
|
|
2551
2982
|
CUDA_CHECK(cudaSetDevice(id));
|
2552
|
-
CUDA_CHECK(cudaDeviceSynchronize());
|
2553
2983
|
|
2554
2984
|
if (src0_asq[id] > 0) {
|
2555
2985
|
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
|
@@ -2564,6 +2994,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2564
2994
|
ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
|
2565
2995
|
}
|
2566
2996
|
}
|
2997
|
+
|
2998
|
+
// main device waits for all other devices to be finished
|
2999
|
+
if (split && g_device_count > 1) {
|
3000
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
3001
|
+
for (int id = 0; id < g_device_count; ++id) {
|
3002
|
+
if (id != g_main_device) {
|
3003
|
+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
|
3004
|
+
}
|
3005
|
+
}
|
3006
|
+
}
|
3007
|
+
|
3008
|
+
if (dst->backend == GGML_BACKEND_CPU) {
|
3009
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
3010
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
3011
|
+
}
|
2567
3012
|
}
|
2568
3013
|
|
2569
3014
|
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
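The last two hunks close the loop: after a secondary device has finished computing and copying out its slice, it records its own event, the main device's stream waits on every one of those events, and a blocking cudaDeviceSynchronize is only issued when the destination lives in host memory. Continuing the earlier sketch under the same assumed names, with events[] standing in for src0_extra->events:

    // Sketch: completion side of the same event pattern.
    void wait_for_secondary_devices(int main_id, int n_devices,
                                    cudaStream_t * streams, cudaEvent_t * events,
                                    bool dst_on_host) {
        // Each secondary device is assumed to have recorded events[id] on its own
        // stream once its partial result was produced (done in the loop above).
        CUDA_CHECK(cudaSetDevice(main_id));
        for (int id = 0; id < n_devices; ++id) {
            if (id != main_id) {
                CUDA_CHECK(cudaStreamWaitEvent(streams[main_id], events[id], 0));
            }
        }

        if (dst_on_host) {
            // Only block the host when the result must be visible in CPU memory.
            CUDA_CHECK(cudaDeviceSynchronize());
        }
    }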
@@ -2582,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
|
2582
3027
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
|
2583
3028
|
}
|
2584
3029
|
|
3030
|
+
void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3031
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
3032
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
|
3033
|
+
}
|
3034
|
+
|
2585
3035
|
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2586
3036
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2587
3037
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
|
2588
3038
|
}
|
2589
3039
|
|
3040
|
+
void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3041
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
3042
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
|
3043
|
+
}
|
3044
|
+
|
2590
3045
|
void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2591
3046
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2592
3047
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
|
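The new ggml_cuda_gelu and ggml_cuda_norm entry points follow the same wrapper shape as the existing SILU and RMS-norm ones: assert F32 input and output, then delegate to ggml_cuda_op, whose trailing boolean arguments (in this version of the file) control whether src0 is converted to F32 for the op and whether rows may be flattened. As a purely hypothetical example of the shape, not part of this diff, adding one more elementwise op would look like this (ggml_cuda_op_relu would be a per-slice wrapper analogous to ggml_cuda_op_gelu):

    // Hypothetical: registering another elementwise op in the same style.
    void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
        GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
        ggml_cuda_op(src0, src1, dst, ggml_cuda_op_relu, true, true);
    }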
@@ -2679,8 +3134,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
|
|
2679
3134
|
}else if (src0->type == GGML_TYPE_F32) {
|
2680
3135
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
2681
3136
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
2682
|
-
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0
|
2683
|
-
ggml_cuda_op(src0, src1, dst,
|
3137
|
+
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
3138
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
|
2684
3139
|
} else {
|
2685
3140
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
2686
3141
|
}
|
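For quantized or F16 src0 the rewritten branch takes the dequantize-mul-mat-vec path only when src1 is a single column (src1->ne[1] == 1) and the row length is a multiple of GGML_CUDA_DMMV_X; everything else falls back to the cuBLAS path. The same condition restated as a small predicate (use_dmmv is an illustrative helper, not a function in the diff):

    // Illustrative helper mirroring the dispatch condition above.
    static bool use_dmmv(const ggml_tensor * src0, const ggml_tensor * src1) {
        const bool quantized_or_f16 = ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16;
        const bool single_column    = src1->ne[1] == 1;                     // matrix x vector
        const bool aligned_rows     = src0->ne[0] % GGML_CUDA_DMMV_X == 0;  // kernel width requirement
        return quantized_or_f16 && single_column && aligned_rows;
    }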
@@ -2765,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
|
2765
3220
|
|
2766
3221
|
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
2767
3222
|
int nrows = ggml_nrows(tensor);
|
3223
|
+
|
3224
|
+
const int64_t ne0 = tensor->ne[0];
|
3225
|
+
|
2768
3226
|
const size_t nb1 = tensor->nb[1];
|
3227
|
+
|
2769
3228
|
ggml_backend backend = tensor->backend;
|
2770
3229
|
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
2771
3230
|
memset(extra, 0, sizeof(*extra));
|
@@ -2794,34 +3253,54 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
2794
3253
|
int64_t nrows_split = row_high - row_low;
|
2795
3254
|
|
2796
3255
|
const size_t offset_split = row_low*nb1;
|
2797
|
-
|
3256
|
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
3257
|
+
const size_t original_size = size;
|
3258
|
+
|
3259
|
+
// pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
|
3260
|
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
3261
|
+
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
3262
|
+
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
3263
|
+
}
|
2798
3264
|
|
2799
|
-
|
3265
|
+
char * buf;
|
2800
3266
|
CUDA_CHECK(cudaMalloc(&buf, size));
|
2801
|
-
|
3267
|
+
char * buf_host = (char*)data + offset_split;
|
3268
|
+
|
3269
|
+
// set padding to 0 to avoid possible NaN values
|
3270
|
+
if (size > original_size) {
|
3271
|
+
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
3272
|
+
}
|
3273
|
+
|
2802
3274
|
|
2803
|
-
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
|
3275
|
+
CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
|
2804
3276
|
|
2805
3277
|
extra->data_device[id] = buf;
|
3278
|
+
|
3279
|
+
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
3280
|
+
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
|
3281
|
+
}
|
2806
3282
|
}
|
2807
3283
|
|
2808
3284
|
tensor->extra = extra;
|
2809
3285
|
}
|
2810
3286
|
|
2811
3287
|
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
2812
|
-
if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
|
3288
|
+
if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
2813
3289
|
return;
|
2814
3290
|
}
|
2815
3291
|
|
2816
3292
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
2817
3293
|
|
2818
3294
|
for (int id = 0; id < g_device_count; ++id) {
|
2819
|
-
if (extra->data_device[id]
|
2820
|
-
|
3295
|
+
if (extra->data_device[id] != nullptr) {
|
3296
|
+
CUDA_CHECK(cudaSetDevice(id));
|
3297
|
+
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
2821
3298
|
}
|
2822
3299
|
|
2823
|
-
|
2824
|
-
|
3300
|
+
if (extra->events[id] != nullptr) {
|
3301
|
+
CUDA_CHECK(cudaSetDevice(id));
|
3302
|
+
CUDA_CHECK(cudaEventDestroy(extra->events[id]));
|
3303
|
+
}
|
2825
3304
|
}
|
2826
3305
|
|
2827
3306
|
delete extra;
|
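ggml_cuda_transform_tensor now over-allocates each device slice so that the last row is padded to a multiple of MATRIX_ROW_PADDING elements (256 in this version, per the comment above), and zero-fills the padding so the widened kernels cannot read uninitialized values and produce NaNs. The extra bytes are just the missing elements of the final row converted to bytes via the type's block size. A worked sketch of the size computation, standalone and with the 256-element padding assumed:

    // Sketch: bytes to allocate for a slice whose rows are ne0 elements wide.
    // type_size/block_size convert an element count into bytes for quantized types.
    size_t padded_size(size_t base_size, int64_t ne0, size_t type_size, int64_t block_size) {
        const int64_t padding = 256; // MATRIX_ROW_PADDING in this version
        size_t size = base_size;
        if (ne0 % padding != 0) {
            size += (padding - ne0 % padding) * type_size / block_size;
        }
        return size;
    }

    // Example: q4_0 rows with ne0 = 4000 elements (18-byte blocks of 32 elements).
    // 4000 % 256 = 160, so 96 extra elements are added: 96 * 18 / 32 = 54 bytes,
    // and those 54 bytes are cudaMemset to zero before the host data is copied in.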
@@ -2833,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|
2833
3312
|
}
|
2834
3313
|
|
2835
3314
|
// recursively assign CUDA buffers until a compute tensor is found
|
2836
|
-
if (tensor->
|
2837
|
-
const ggml_op src0_op = tensor->
|
3315
|
+
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
3316
|
+
const ggml_op src0_op = tensor->src[0]->op;
|
2838
3317
|
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
|
2839
|
-
ggml_cuda_assign_buffers_impl(tensor->
|
3318
|
+
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
|
2840
3319
|
}
|
2841
3320
|
}
|
2842
|
-
if (tensor->op == GGML_OP_CPY && tensor->
|
2843
|
-
ggml_cuda_assign_buffers_impl(tensor->
|
3321
|
+
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
3322
|
+
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
|
2844
3323
|
}
|
2845
3324
|
|
2846
3325
|
tensor->backend = GGML_BACKEND_GPU;
|
2847
3326
|
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
2848
3327
|
memset(extra, 0, sizeof(*extra));
|
2849
3328
|
|
2850
|
-
const bool inplace = (tensor->
|
3329
|
+
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
2851
3330
|
tensor->op == GGML_OP_VIEW ||
|
2852
3331
|
force_inplace;
|
2853
3332
|
const size_t size = ggml_nbytes(tensor);
|
2854
3333
|
|
2855
3334
|
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2856
|
-
if (inplace && (tensor->
|
2857
|
-
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
|
3335
|
+
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
3336
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
2858
3337
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
2859
3338
|
size_t offset = 0;
|
2860
3339
|
if (tensor->op == GGML_OP_VIEW) {
|
2861
|
-
memcpy(&offset, tensor->
|
3340
|
+
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
|
2862
3341
|
}
|
2863
3342
|
extra->data_device[g_main_device] = src0_ddc + offset;
|
2864
3343
|
} else if (tensor->op == GGML_OP_CPY) {
|
2865
|
-
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->
|
3344
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
2866
3345
|
void * src1_ddv = src1_extra->data_device[g_main_device];
|
2867
3346
|
extra->data_device[g_main_device] = src1_ddv;
|
2868
3347
|
} else if (scratch) {
|
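This hunk is part of the wider migration from the dedicated src0/src1 tensor fields to the tensor->src[] array; the buffer-assignment logic itself is unchanged. Views and in-place ops reuse the parent's device pointer (plus the byte offset a GGML_OP_VIEW stores in src[2]), while other tensors get their own allocation or a scratch slot. Because the removed lines are truncated in this listing, here is a condensed restatement (not a drop-in replacement) of the reuse rule the added lines implement:

    // Condensed restatement of the buffer-reuse rule above, in the new src[] style.
    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data)
                      || tensor->op == GGML_OP_VIEW
                      || force_inplace;

    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU ||
                    tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
        // Views carry their byte offset into the parent in src[2]->data.
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
        }
        char * src0_ddc = (char *) ((ggml_tensor_extra_gpu *) tensor->src[0]->extra)->data_device[g_main_device];
        extra->data_device[g_main_device] = src0_ddc + offset;
    }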
@@ -2933,8 +3412,8 @@ void ggml_cuda_free_scratch() {
|
|
2933
3412
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
2934
3413
|
ggml_cuda_func_t func;
|
2935
3414
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
2936
|
-
|| (tensor->
|
2937
|
-
|| (tensor->
|
3415
|
+
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
3416
|
+
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
2938
3417
|
|
2939
3418
|
switch (tensor->op) {
|
2940
3419
|
case GGML_OP_ADD:
|
@@ -2949,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
2949
3428
|
}
|
2950
3429
|
func = ggml_cuda_mul;
|
2951
3430
|
break;
|
3431
|
+
case GGML_OP_GELU:
|
3432
|
+
if (!any_on_device) {
|
3433
|
+
return false;
|
3434
|
+
}
|
3435
|
+
func = ggml_cuda_gelu;
|
3436
|
+
break;
|
2952
3437
|
case GGML_OP_SILU:
|
2953
3438
|
if (!any_on_device) {
|
2954
3439
|
return false;
|
2955
3440
|
}
|
2956
3441
|
func = ggml_cuda_silu;
|
2957
3442
|
break;
|
3443
|
+
case GGML_OP_NORM:
|
3444
|
+
if (!any_on_device) {
|
3445
|
+
return false;
|
3446
|
+
}
|
3447
|
+
func = ggml_cuda_norm;
|
3448
|
+
break;
|
2958
3449
|
case GGML_OP_RMS_NORM:
|
2959
3450
|
if (!any_on_device) {
|
2960
3451
|
return false;
|
@@ -2962,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
2962
3453
|
func = ggml_cuda_rms_norm;
|
2963
3454
|
break;
|
2964
3455
|
case GGML_OP_MUL_MAT:
|
2965
|
-
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->
|
3456
|
+
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
|
2966
3457
|
return false;
|
2967
3458
|
}
|
2968
3459
|
func = ggml_cuda_mul_mat;
|
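ggml_cuda_compute_forward only forwards an op to the GPU when the output tensor or one of its sources already lives there; with the old src0/src1 fields gone, that test now walks the src[] array (the removed lines are truncated in this listing). Restated as a standalone helper, purely illustrative and not a function in the diff:

    // Illustrative helper equivalent to the any_on_device expression above.
    static bool any_input_on_device(const ggml_tensor * t) {
        return t->backend == GGML_BACKEND_GPU
            || (t->src[0] != nullptr && (t->src[0]->backend == GGML_BACKEND_GPU ||
                                         t->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
            || (t->src[1] != nullptr && t->src[1]->backend == GGML_BACKEND_GPU);
    }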
@@ -3016,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
3016
3507
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
3017
3508
|
return true;
|
3018
3509
|
}
|
3019
|
-
func(tensor->
|
3510
|
+
func(tensor->src[0], tensor->src[1], tensor);
|
3020
3511
|
return true;
|
3021
3512
|
}
|