llama_cpp 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +11 -2
- data/ext/llama_cpp/llama_cpp.cpp +284 -111
- data/ext/llama_cpp/src/ggml-cuda.cu +639 -148
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +19 -6
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +1734 -2248
- data/ext/llama_cpp/src/ggml.h +152 -80
- data/ext/llama_cpp/src/llama.cpp +282 -90
- data/ext/llama_cpp/src/llama.h +30 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -13
- data/sig/llama_cpp.rbs +22 -2
- metadata +5 -2
@@ -59,8 +59,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16

 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(

 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization

 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d;                 // delta
     half m;                 // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d;                 // delta
     uint8_t qh[4];          // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d;                 // delta
     half m;                 // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half   d;               // delta
     int8_t qs[QK8_0];       // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half   d;               // delta
+    half   s;               // unquantized sum
+    int8_t qs[QK8_0];       // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
+
 //================================= k-quants

 #ifdef GGML_QKK_64
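For orientation, the block_q8_1 type added above is 36 bytes per 32 weights: two fp16 scalars (the scale `d` and the unquantized sum `s`) followed by 32 signed 8-bit quants. A minimal host-side sketch of the same layout, using `uint16_t` as a stand-in for `half` (illustrative names, not code from the gem):

```cuda
#include <cstdint>

constexpr int QK8_1_REF = 32;

// Host mirror of the device-side block_q8_1 layout.
struct block_q8_1_ref {
    uint16_t d;              // fp16 scale (delta)
    uint16_t s;              // fp16 sum of the 32 original values
    int8_t   qs[QK8_1_REF];  // one int8 quant per original float
};

static_assert(sizeof(block_q8_1_ref) == 2 * sizeof(uint16_t) + QK8_1_REF,
              "q8_1 block: 2 fp16 scalars + 32 int8 quants = 36 bytes");
```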
@@ -190,22 +208,25 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef GGML_CUDA_DMMV_Y
-#define GGML_CUDA_DMMV_Y 1
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif

 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +235,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -241,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }

+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -250,32 +289,60 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    const float eps = 1e-6;
+    const float eps = 1e-6f;

     float tmp = 0.0f; // partial sum for thread in warp

-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }

     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }

     const float mean = tmp / ncols;
-    const float scale = 1.0f / sqrtf(mean + eps);
+    const float scale = rsqrtf(mean + eps);

-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
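The new norm_f32 kernel above, and the later removal of the `__syncthreads()` calls before the reductions, both rest on the warp-level butterfly reduction: each lane XOR-swaps its partial sum with the lanes 16, 8, 4, 2, and 1 positions away, after which all 32 lanes hold the full warp sum, so no block-wide barrier is needed. A standalone sketch of that primitive (hypothetical helper name, not part of the diff):

```cuda
// After the loop every lane of the warp holds the sum of all 32 inputs,
// with no shared memory and no __syncthreads().
__device__ __forceinline__ float warp_reduce_sum(float v) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, mask, 32);
    }
    return v;
}
```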
@@ -384,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 
 //================================== k-quants
 
-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -417,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
 
 }
 
-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -481,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif
 
-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;
 
     const int i = blockIdx.x;
@@ -521,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;
 
     const int i = blockIdx.x;
@@ -567,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = blockIdx.x;
@@ -611,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -709,7 +776,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -720,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -814,7 +880,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -825,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -918,7 +983,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -929,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
 
     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1023,7 +1087,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1034,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -1134,7 +1197,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1153,8 +1215,43 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
+
+    const float xi = i < ndata ? x[i] : 0.0f;
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
     if (i >= k) {
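quantize_q8_1 assigns one thread per input value, so a full warp covers one 32-wide block: the lanes reduce the absolute maximum and the running sum across the warp, every lane writes its own quant, and only lane 0 (iqs == 0) stores the per-block `d` and `s`. A scalar host reference of the same arithmetic for a single block (illustrative code, assuming the d = amax/127 convention shown above):

```cuda
#include <cmath>
#include <cstdint>

// Reference quantization of one 32-value block with q8_1 semantics:
// d = amax / 127, q_i = round(x_i / d), s = sum of the original values.
static void quantize_block_q8_1_ref(const float x[32], int8_t qs[32], float & d, float & s) {
    float amax = 0.0f, sum = 0.0f;
    for (int i = 0; i < 32; ++i) {
        amax = fmaxf(amax, fabsf(x[i]));
        sum += x[i];
    }
    d = amax / 127.0f;
    s = sum;
    for (int i = 0; i < 32; ++i) {
        qs[i] = (amax == 0.0f) ? 0 : (int8_t) roundf(x[i] / d);
    }
}
```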
@@ -1174,8 +1271,184 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
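The vec_dot_*_q8_1 helpers above depend on the sm_61 `__dp4a` intrinsic, which treats each 32-bit operand as four packed int8 values and adds their four products to an int32 accumulator; the nibble shifts and `__vsub4` calls just recenter the 4- and 5-bit quants before that step. A scalar sketch of what one such call chain computes for q4_0 × q8_1 (reference math only, hypothetical helper, not code from the gem):

```cuda
#include <cstdint>

// Scalar reference for one thread's work in vec_dot_q4_0_q8_1: four q4_0 bytes
// hold 4 low and 4 high nibbles (stored with a +8 offset); the low nibbles pair
// with the first half of the q8_1 block, the high nibbles with the second half.
static float dot_q4_0_q8_1_ref(const uint8_t q4[4],
                               const int8_t q8_lo[4], const int8_t q8_hi[4],
                               float d4, float d8) {
    int sumi = 0;                               // what __dp4a accumulates
    for (int i = 0; i < 4; ++i) {
        const int lo = (int)(q4[i] & 0x0F) - 8; // low nibble, recentred to [-8, 7]
        const int hi = (int)(q4[i] >> 4)   - 8; // high nibble, recentred to [-8, 7]
        sumi += lo * q8_lo[i] + hi * q8_hi[i];
    }
    return sumi * d4 * d8;                      // scale by the two block deltas
}
```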
@@ -1228,7 +1501,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1243,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1279,7 +1551,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1291,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 }
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {
 
     const half * x = (const half *) vx;
@@ -1325,7 +1596,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1435,7 +1705,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1478,17 +1747,33 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1557,45 +1842,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1642,6 +1927,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1649,9 +1979,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1817,6 +2147,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1834,9 +2165,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s\n", id, prop.name);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -1957,20 +2291,24 @@ inline void ggml_cuda_op_add(
 
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i  != nullptr);
+
+    // TODO: support broadcasting
+    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
 
-    const int64_t ne0 = src0->ne[0];
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    // const int64_t ne10 = src1->ne[0];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -1986,10 +2324,9 @@ inline void ggml_cuda_op_mul(
 
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i  != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
@@ -1998,11 +2335,10 @@ inline void ggml_cuda_op_mul(
 
         float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
         float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
+        float * dst_ddf_i01  = dst_ddf_i  + i01*ne00;
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }
 
     (void) dst;
@@ -2010,6 +2346,28 @@ inline void ggml_cuda_op_mul(
     (void) i02;
 }
 
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2023,7 +2381,28 @@ inline void ggml_cuda_op_silu(
 
     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
 
     (void) src1;
     (void) dst;
@@ -2046,7 +2425,6 @@ inline void ggml_cuda_op_rms_norm(
 
     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2056,7 +2434,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2068,70 +2446,115 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
-    /* old lines 2071-2074 (removed): content not preserved in this extraction */
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
 
-    bool
-    src0->type ==
-    src0->type ==
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;
 
-    /* old lines 2080-2085 (removed): content not preserved in this extraction */
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                  ne00, 1, sizeof(float), 0, 0,
+                                  ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
-    /* old line 2087 (removed): content not preserved in this extraction */
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16
 
-    /* old lines 2090-2127 (removed): content not preserved in this extraction */
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
 #ifdef GGML_CUDA_DMMV_F16
-    /* old lines 2131-2133 (removed): content not preserved in this extraction */
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }
 
     (void) src1;
     (void) dst;
@@ -2202,7 +2625,6 @@ inline void ggml_cuda_op_rope(
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2226,7 +2648,6 @@ inline void ggml_cuda_op_diag_mask_inf(
 
     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2248,7 +2669,6 @@ inline void ggml_cuda_op_soft_max(
 
     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2344,10 +2764,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
-    // if multiple
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }
 
     for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2794,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2373
2794
|
int64_t row_diff = row_high - row_low;
|
2374
2795
|
|
2375
2796
|
cudaSetDevice(id);
|
2797
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
2798
|
+
|
2799
|
+
// wait for main GPU data if necessary
|
2800
|
+
if (split && id != g_main_device) {
|
2801
|
+
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
|
2802
|
+
}
|
2376
2803
|
|
2377
2804
|
if (src0_on_device && src0_is_contiguous) {
|
2378
2805
|
if (src0_is_f32) {
|
@@ -2448,8 +2875,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2448
2875
|
}
|
2449
2876
|
const int64_t i11 = i13*ne12 + i12;
|
2450
2877
|
|
2451
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
2452
|
-
|
2453
2878
|
// for split tensors the data begins at i0 == i0_offset_low
|
2454
2879
|
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
|
2455
2880
|
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
|
@@ -2509,6 +2934,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2509
2934
|
|
2510
2935
|
// do the computation
|
2511
2936
|
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
|
2937
|
+
CUDA_CHECK(cudaGetLastError());
|
2512
2938
|
|
2513
2939
|
// copy dst to host or other device if necessary
|
2514
2940
|
if (!dst_on_device) {
|
@@ -2538,6 +2964,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2538
2964
|
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
|
2539
2965
|
}
|
2540
2966
|
}
|
2967
|
+
|
2968
|
+
// signify to main device that other device is done
|
2969
|
+
if (split && g_device_count > 1 && id != g_main_device) {
|
2970
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
|
2971
|
+
}
|
2541
2972
|
}
|
2542
2973
|
}
|
2543
2974
|
}
|
@@ -2549,7 +2980,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2549
2980
|
}
|
2550
2981
|
|
2551
2982
|
CUDA_CHECK(cudaSetDevice(id));
|
2552
|
-
CUDA_CHECK(cudaDeviceSynchronize());
|
2553
2983
|
|
2554
2984
|
if (src0_asq[id] > 0) {
|
2555
2985
|
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
|
@@ -2564,6 +2994,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2564
2994
|
ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
|
2565
2995
|
}
|
2566
2996
|
}
|
2997
|
+
|
2998
|
+
// main device waits for all other devices to be finished
|
2999
|
+
if (split && g_device_count > 1) {
|
3000
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
3001
|
+
for (int id = 0; id < g_device_count; ++id) {
|
3002
|
+
if (id != g_main_device) {
|
3003
|
+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
|
3004
|
+
}
|
3005
|
+
}
|
3006
|
+
}
|
3007
|
+
|
3008
|
+
if (dst->backend == GGML_BACKEND_CPU) {
|
3009
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
3010
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
3011
|
+
}
|
2567
3012
|
}
|
2568
3013
|
|
2569
3014
|
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
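The hunks above replace the old per-device cudaDeviceSynchronize() with CUDA events: the main device records an event once the shared input is ready, every other device's stream waits on that event before computing and records its own event when finished, and the main device's stream then waits on all of those before the result is consumed. A minimal standalone sketch of that ordering pattern follows; the stream/event bookkeeping and the CHECK macro are illustrative, not the actual ggml-cuda state.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

#define CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); exit(1); } } while (0)

int main() {
    int device_count = 0;
    CHECK(cudaGetDeviceCount(&device_count));
    const int main_device = 0;

    std::vector<cudaStream_t> streams(device_count);
    std::vector<cudaEvent_t>  events(device_count);
    for (int id = 0; id < device_count; ++id) {
        CHECK(cudaSetDevice(id));
        CHECK(cudaStreamCreate(&streams[id]));
        // timing disabled: the events are used purely for ordering, as in the diff
        CHECK(cudaEventCreateWithFlags(&events[id], cudaEventDisableTiming));
    }

    // 1. the main device finishes producing the shared input, then records its event
    CHECK(cudaSetDevice(main_device));
    CHECK(cudaEventRecord(events[main_device], streams[main_device]));

    // 2. every other device makes its stream wait for that event, does its work,
    //    and records its own event to signal completion
    for (int id = 0; id < device_count; ++id) {
        if (id == main_device) continue;
        CHECK(cudaSetDevice(id));
        CHECK(cudaStreamWaitEvent(streams[id], events[main_device], 0));
        // ... enqueue per-device kernels on streams[id] here ...
        CHECK(cudaEventRecord(events[id], streams[id]));
    }

    // 3. the main device's stream waits on every worker before the result is used
    CHECK(cudaSetDevice(main_device));
    for (int id = 0; id < device_count; ++id) {
        if (id != main_device) {
            CHECK(cudaStreamWaitEvent(streams[main_device], events[id], 0));
        }
    }
    CHECK(cudaDeviceSynchronize()); // only needed when the result goes back to the host

    for (int id = 0; id < device_count; ++id) {
        CHECK(cudaSetDevice(id));
        CHECK(cudaEventDestroy(events[id]));
        CHECK(cudaStreamDestroy(streams[id]));
    }
    return 0;
}

cudaEventDisableTiming keeps the events cheap, and cudaStreamWaitEvent can wait on an event recorded on a different device, which is what lets the blocking cudaDeviceSynchronize() drop out of the per-device loop.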
@@ -2582,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }

+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }

+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
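ggml_cuda_gelu and ggml_cuda_norm above are thin wrappers that route the new element-wise ops through ggml_cuda_op; the device kernels themselves live elsewhere in the file and are not part of this hunk. As a rough, assumption-laden illustration of what such an element-wise launcher looks like, here is a GELU kernel using the common tanh approximation (the formula, block size, and names are guesses for illustration, not copied from ggml-cuda.cu):

#include <cuda_runtime.h>
#include <math.h>

// tanh-approximation GELU applied element-wise; constants follow the usual
// 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))) form
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    const float xi = x[i];
    dst[i] = 0.5f*xi*(1.0f + tanhf(0.79788456f*(xi + 0.044715f*xi*xi*xi)));
}

// host-side launcher in the style of the *_f32_cuda helpers: one thread per element
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int block_size = 256; // assumed block size
    const int num_blocks = (k + block_size - 1) / block_size;
    gelu_f32<<<num_blocks, block_size, 0, stream>>>(x, dst, k);
}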
@@ -2679,8 +3134,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0
-            ggml_cuda_op(src0, src1, dst,
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
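The completed condition above decides between the fused dequantize-mul-mat-vec path and the dequantize-then-cuBLAS path. A tiny sketch of that predicate, with an assumed GGML_CUDA_DMMV_X default of 32 (the value is configurable at build time):

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for GGML_CUDA_DMMV_X, the x-width consumed per dmmv iteration.
static const int64_t DMMV_X = 32;

// Use the fused dequantize+mat-vec kernel only for true matrix-vector products
// whose row length is a multiple of DMMV_X; otherwise fall back to cuBLAS.
static bool use_dequantize_mul_mat_vec(int64_t src0_ne0, int64_t src1_ne1) {
    return src1_ne1 == 1 && src0_ne0 % DMMV_X == 0;
}

int main() {
    printf("4096 x 1 vector : %d\n", use_dequantize_mul_mat_vec(4096, 1)); // 1 -> dmmv
    printf("4096 x 8 batch  : %d\n", use_dequantize_mul_mat_vec(4096, 8)); // 0 -> cuBLAS
    printf("4100 x 1 vector : %d\n", use_dequantize_mul_mat_vec(4100, 1)); // 0 -> cuBLAS
    return 0;
}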
@@ -2765,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -2794,34 +3253,54 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;

         const size_t offset_split = row_low*nb1;
-
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }

-
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+

-        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

         extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
     }

     tensor->extra = extra;
 }

 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id]
-
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
         }

-
-
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }

     delete extra;
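The transform_tensor changes above pad the last row of each device's split up to a multiple of MATRIX_ROW_PADDING elements (256, per the comment) and zero the padding so stray quant blocks cannot produce NaNs. A small host-only sketch of that size arithmetic, using Q4_0's 18-byte blocks of 32 values; the concrete shape numbers are made up for illustration:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne0                = 4000; // row length in elements (not a multiple of 256)
    const int64_t nrows_split        = 512;  // rows assigned to this device
    const size_t  type_size          = 18;   // bytes per quant block (Q4_0)
    const int64_t block_size         = 32;   // values per quant block
    const int64_t MATRIX_ROW_PADDING = 256;

    const size_t row_bytes     = ne0 / block_size * type_size;
    size_t size                = row_bytes * nrows_split;
    const size_t original_size = size;

    // pad the last row to a multiple of MATRIX_ROW_PADDING elements
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) * type_size / block_size;
    }

    // the bytes past original_size are what the diff later cudaMemsets to 0
    printf("original=%zu padded=%zu extra=%zu\n", original_size, size, size - original_size);
    return 0;
}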
@@ -2833,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }

     // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->
-        const ggml_op src0_op = tensor->
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
-    if (tensor->op == GGML_OP_CPY && tensor->
-        ggml_cuda_assign_buffers_impl(tensor->
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
     }

     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));

-    const bool inplace = (tensor->
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
         force_inplace;
     const size_t size = ggml_nbytes(tensor);

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && (tensor->
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->
+            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
@@ -2933,8 +3412,8 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->
-        || (tensor->
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

     switch (tensor->op) {
         case GGML_OP_ADD:
@@ -2949,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_GELU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_gelu;
+            break;
         case GGML_OP_SILU:
             if (!any_on_device) {
                 return false;
             }
             func = ggml_cuda_silu;
             break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_norm;
+            break;
         case GGML_OP_RMS_NORM:
             if (!any_on_device) {
                 return false;
@@ -2962,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
             func = ggml_cuda_mul_mat;
@@ -3016,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return true;
     }
-    func(tensor->
+    func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
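ggml_cuda_compute_forward now reads the operands from tensor->src[0] / src[1] and dispatches through a ggml_cuda_func_t pointer chosen per op. A stripped-down sketch of that dispatch shape, with stand-in types and a reduced op set (everything here is illustrative, not the ggml API):

#include <cstdio>

// minimal stand-ins for the ggml types involved in the dispatch
struct tensor_t;
typedef void (*cuda_func_t)(const tensor_t * src0, const tensor_t * src1, tensor_t * dst);

enum op_t { OP_ADD, OP_GELU, OP_NORM, OP_COUNT };

struct tensor_t {
    op_t       op;
    tensor_t * src[2]; // mirrors ggml's move from src0/src1 members to a src[] array
};

static void cuda_add (const tensor_t *, const tensor_t *, tensor_t *) { puts("add");  }
static void cuda_gelu(const tensor_t *, const tensor_t *, tensor_t *) { puts("gelu"); }
static void cuda_norm(const tensor_t *, const tensor_t *, tensor_t *) { puts("norm"); }

static bool compute_forward(tensor_t * t) {
    cuda_func_t func = nullptr;
    switch (t->op) {            // same shape as the switch in ggml_cuda_compute_forward
        case OP_ADD:  func = cuda_add;  break;
        case OP_GELU: func = cuda_gelu; break;
        case OP_NORM: func = cuda_norm; break;
        default:      return false;     // op not offloaded
    }
    func(t->src[0], t->src[1], t);
    return true;
}

int main() {
    tensor_t x = { OP_ADD,  { nullptr, nullptr } };
    tensor_t y = { OP_GELU, { &x, nullptr } };
    compute_forward(&y);
    return 0;
}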