llama_cpp 0.1.4 → 0.2.1

@@ -0,0 +1,2483 @@
+ #include <cstddef>
+ #include <cstdint>
+ #include <limits>
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <atomic>
+ #include <assert.h>
+
+ #include <cuda_runtime.h>
+ #include <cublas_v2.h>
+ #include <cuda_fp16.h>
+
+ #include "ggml-cuda.h"
+ #include "ggml.h"
+
+ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+ #define CUDA_CHECK(err) \
+     do { \
+         cudaError_t err_ = (err); \
+         if (err_ != cudaSuccess) { \
+             fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+                 cudaGetErrorString(err_)); \
+             exit(1); \
+         } \
+     } while (0)
+
+ #if CUDART_VERSION >= 12000
+ #define CUBLAS_CHECK(err) \
+     do { \
+         cublasStatus_t err_ = (err); \
+         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
+                 err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+             exit(1); \
+         } \
+     } while (0)
+ #else
+ #define CUBLAS_CHECK(err) \
+     do { \
+         cublasStatus_t err_ = (err); \
+         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+             exit(1); \
+         } \
+     } while (0)
+ #endif // CUDART_VERSION >= 12000
+
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
+ typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+ typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+ typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
+ typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+ typedef void (*ggml_cuda_op_t)(
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
+     float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+     cudaStream_t & cudaStream_main);
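+ // function-pointer types used to dispatch the per-quantization dequantize/dot kernels and the
+ // per-op CUDA implementations; a ggml_cuda_op_t operates on the row range [i01_low, i01_high) of src0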
+
+ // QK = number of values after dequantization
+ // QR = QK / number of values before dequantization
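+ // e.g. for Q4_0: QK4_0 = 32 dequantized floats per block, stored as 16 bytes of nibbles, so QR4_0 = 32/16 = 2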
+
+ #define QK4_0 32
+ #define QR4_0 2
+ typedef struct {
+     half    d;              // delta
+     uint8_t qs[QK4_0 / 2];  // nibbles / quants
+ } block_q4_0;
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+ #define QK4_1 32
+ #define QR4_1 2
+ typedef struct {
+     half    d;              // delta
+     half    m;              // min
+     uint8_t qs[QK4_1 / 2];  // nibbles / quants
+ } block_q4_1;
+ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+ #define QK5_0 32
+ #define QR5_0 2
+ typedef struct {
+     half    d;              // delta
+     uint8_t qh[4];          // 5-th bit of quants
+     uint8_t qs[QK5_0 / 2];  // nibbles / quants
+ } block_q5_0;
+ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+ #define QK5_1 32
+ #define QR5_1 2
+ typedef struct {
+     half    d;              // delta
+     half    m;              // min
+     uint8_t qh[4];          // 5-th bit of quants
+     uint8_t qs[QK5_1 / 2];  // nibbles / quants
+ } block_q5_1;
+ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+ #define QK8_0 32
+ #define QR8_0 1
+ typedef struct {
+     half   d;               // delta
+     int8_t qs[QK8_0];       // quants
+ } block_q8_0;
+ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+
+ //================================= k-quants
+
+ #define QK_K 256
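+ // each k-quant super-block covers QK_K = 256 values; the structs below hold packed quants plus
+ // per-sub-block scales (and mins) scaled by a per-super-block fp16 factor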
+
+ typedef struct {
+     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+     uint8_t qs[QK_K/4];      // quants
+     half d;                  // super-block scale for quantized scales
+     half dmin;               // super-block scale for quantized mins
+ } block_q2_K;
+ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+ typedef struct {
+     uint8_t hmask[QK_K/8];
+     uint8_t qs[QK_K/4];      // nibbles / quants
+     uint8_t scales[3*QK_K/64];
+     half d;
+ } block_q3_K;
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+
+ typedef struct {
+     half d;                    // super-block scale for quantized scales
+     half dmin;                 // super-block scale for quantized mins
+     uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+     uint8_t qs[QK_K/2];        // 4-bit quants
+ } block_q4_K;
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+
+ typedef struct {
+     half d;                    // super-block scale for quantized scales
+     half dmin;                 // super-block scale for quantized mins
+     uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+     uint8_t qh[QK_K/8];        // quants, high bit
+     uint8_t qs[QK_K/2];        // quants, low 4 bits
+ } block_q5_K;
+ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+
+ typedef struct {
+     uint8_t ql[QK_K/2];      // quants, lower 4 bits
+     uint8_t qh[QK_K/4];      // quants, upper 2 bits
+     int8_t  scales[QK_K/16]; // scales
+     half    d;               // delta
+ } block_q6_K;
+ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
+
+ #define WARP_SIZE 32
+
+ #define CUDA_ADD_BLOCK_SIZE 256
+ #define CUDA_MUL_BLOCK_SIZE 256
+ #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_CPY_BLOCK_SIZE 32
+ #define CUDA_SCALE_BLOCK_SIZE 256
+ #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+ #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+
+ // dmmv = dequantize_mul_mat_vec
+ #ifndef GGML_CUDA_DMMV_X
+ #define GGML_CUDA_DMMV_X 32
+ #endif
+ #ifndef GGML_CUDA_DMMV_Y
+ #define GGML_CUDA_DMMV_Y 1
+ #endif
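+ // GGML_CUDA_DMMV_X sets how many x values a warp consumes per iteration (iter_stride = 2*GGML_CUDA_DMMV_X);
+ // GGML_CUDA_DMMV_Y sets how many matrix rows each thread block handles (used as blockDim.y in the launchers)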
169
+
170
+ static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
171
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
172
+
173
+ if (i >= k) {
174
+ return;
175
+ }
176
+ dst[i] = x[i] + y[i];
177
+ }
178
+
179
+ static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
180
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
181
+
182
+ if (i >= kx) {
183
+ return;
184
+ }
185
+ dst[i] = x[i] * y[i%ky];
186
+ }
187
+
188
+ static __global__ void silu_f32(const float * x, float * dst, const int k) {
189
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
190
+
191
+ if (i >= k) {
192
+ return;
193
+ }
194
+ dst[i] = x[i] / (1.0f + expf(-x[i]));
195
+ }
196
+
197
+ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
198
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
199
+ const int tid = threadIdx.x;
200
+
201
+ const float eps = 1e-6;
202
+
203
+ float tmp = 0.0f; // partial sum for thread in warp
204
+
205
+ for (int i = 0; i < ncols; i += WARP_SIZE) {
206
+ const int col = i + tid;
207
+ const float xi = x[row*ncols + col];
208
+ tmp += xi * xi;
209
+ }
210
+
211
+ // sum up partial sums
212
+ __syncthreads();
213
+ #pragma unroll
214
+ for (int mask = 16; mask > 0; mask >>= 1) {
215
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
216
+ }
217
+
218
+ const float mean = tmp / ncols;
219
+ const float scale = 1.0f / sqrtf(mean + eps);
220
+
221
+ for (int i = 0; i < ncols; i += WARP_SIZE) {
222
+ const int col = i + tid;
223
+ dst[row*ncols + col] = scale * x[row*ncols + col];
224
+ }
225
+ }
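+ // rms_norm_f32: one warp per row; each thread accumulates squares over a strided slice of the row,
+ // the __shfl_xor_sync butterfly combines the 32 partial sums, and every element is scaled by 1/sqrt(mean + eps)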
226
+
227
+ static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
228
+ const block_q4_0 * x = (const block_q4_0 *) vx;
229
+
230
+ const float d = x[ib].d;
231
+
232
+ const uint8_t vui = x[ib].qs[iqs];
233
+
234
+ const int8_t vi0 = vui & 0xF;
235
+ const int8_t vi1 = vui >> 4;
236
+
237
+ v0 = (vi0 - 8)*d;
238
+ v1 = (vi1 - 8)*d;
239
+ }
240
+
241
+ static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
242
+ const block_q4_1 * x = (const block_q4_1 *) vx;
243
+
244
+ const float d = x[ib].d;
245
+ const float m = x[ib].m;
246
+
247
+ const uint8_t vui = x[ib].qs[iqs];
248
+
249
+ const int8_t vi0 = vui & 0xF;
250
+ const int8_t vi1 = vui >> 4;
251
+
252
+ v0 = vi0*d + m;
253
+ v1 = vi1*d + m;
254
+ }
255
+
256
+ static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
257
+ const block_q5_0 * x = (const block_q5_0 *) vx;
258
+
259
+ const float d = x[ib].d;
260
+
261
+ uint32_t qh;
262
+ memcpy(&qh, x[ib].qh, sizeof(qh));
263
+
264
+ const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
265
+ const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
266
+
267
+ const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
268
+ const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
269
+
270
+ v0 = x0*d;
271
+ v1 = x1*d;
272
+ }
273
+
274
+ static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
275
+ const block_q5_1 * x = (const block_q5_1 *) vx;
276
+
277
+ const float d = x[ib].d;
278
+ const float m = x[ib].m;
279
+
280
+ uint32_t qh;
281
+ memcpy(&qh, x[ib].qh, sizeof(qh));
282
+
283
+ const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
284
+ const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
285
+
286
+ const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
287
+ const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
288
+
289
+ v0 = x0*d + m;
290
+ v1 = x1*d + m;
291
+ }
292
+
293
+ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
294
+ const block_q8_0 * x = (const block_q8_0 *) vx;
295
+
296
+ const float d = x[ib].d;
297
+
298
+ const int8_t vi0 = x[ib].qs[iqs + 0];
299
+ const int8_t vi1 = x[ib].qs[iqs + 1];
300
+
301
+ v0 = vi0*d;
302
+ v1 = vi1*d;
303
+ }
304
+
305
+ //================================== k-quants
306
+
307
+ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
308
+
309
+ const int i = blockIdx.x;
310
+ const int tid = threadIdx.x;
311
+ const int n = tid/32;
312
+ const int l = tid - 32*n;
313
+ const int is = 8*n + l/16;
314
+
315
+ const block_q2_K * x = (const block_q2_K *) vx;
316
+
317
+ const uint8_t q = x[i].qs[32*n + l];
318
+ float * y = yy + i*QK_K + 128*n;
319
+
320
+ float dall = x[i].d;
321
+ float dmin = x[i].dmin;
322
+ y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
323
+ y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
324
+ y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
325
+ y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
326
+
327
+ }
328
+
329
+ static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
330
+
331
+ const block_q2_K * x = (const block_q2_K *) vx;
332
+
333
+ // if n is 0, we want to do the lower 128, else the upper 128,
334
+ // covering y[l+0], y[l+32], y[l+64], y[l+96] and
335
+ // y[l+16], y[l+48], y[l+80], y[l+112]
336
+ int n = iqs/128; // 0 or 1
337
+ int r = iqs - 128*n; // 0...120 in steps of 8
338
+ int l = r/8; // 0...15 in steps of 1
339
+
340
+ const float * y = yy + 128*n + l;
341
+ const uint8_t * q = x[ib].qs + 32*n + l;
342
+ const uint8_t * s = x[ib].scales + 8*n;
343
+
344
+ const float dall = x[ib].d;
345
+ const float dmin = x[ib].dmin;
346
+
347
+ float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
348
+ + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
349
+ + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
350
+ + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
351
+ + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
352
+ + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
353
+ + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
354
+ + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
355
+
356
+ result = sum;
357
+
358
+ }
359
+
360
+ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
361
+
362
+ int r = threadIdx.x/4;
363
+ int i = blockIdx.x;
364
+ int tid = r/2;
365
+ int is0 = r%2;
366
+ int l0 = 16*is0 + 4*(threadIdx.x%4);
367
+ int n = tid / 4;
368
+ int j = tid - 4*n;
369
+
370
+ const block_q3_K * x = (const block_q3_K *) vx;
371
+
372
+ uint8_t m = 1 << (4*n + j);
373
+ int is = 8*n + 2*j + is0;
374
+ int shift = 2*j;
375
+
376
+ int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
377
+ is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
378
+ is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
379
+ (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
380
+ float d_all = x[i].d;
381
+ float dl = d_all * (us - 32);
382
+
383
+ float * y = yy + i*QK_K + 128*n + 32*j;
384
+ const uint8_t * q = x[i].qs + 32*n;
385
+ const uint8_t * hm = x[i].hmask;
386
+
387
+ for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
388
+
389
+ }
390
+
391
+ static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
392
+
393
+ const block_q3_K * x = (const block_q3_K *) vx;
394
+
395
+ const uint32_t kmask1 = 0x03030303;
396
+ const uint32_t kmask2 = 0x0f0f0f0f;
397
+
398
+ uint32_t aux[3];
399
+ uint32_t utmp[4];
400
+
401
+ // if n is 0, we want to do the lower 128, else the upper 128,
402
+ // covering y[l+0], y[l+32], y[l+64], y[l+96] and
403
+ // y[l+16], y[l+48], y[l+80], y[l+112]
404
+ int n = iqs/128; // 0 or 1
405
+ int r = iqs - 128*n; // 0...120 in steps of 8
406
+ int l = r/8; // 0...15 in steps of 1
407
+
408
+ const float * y = yy + 128*n + l;
409
+ const uint8_t * q = x[ib].qs + 32*n + l;
410
+ const uint8_t * hm = x[ib].hmask + l;
411
+ const int8_t * s = (const int8_t *)utmp + 8*n;
412
+
413
+ memcpy(aux, x[ib].scales, 12);
414
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
415
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
416
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
417
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
418
+
419
+ const float dall = x[ib].d;
420
+
421
+ const uint8_t m = 1 << (4*n);
422
+
423
+ float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
424
+ + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
425
+ + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
426
+ + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
427
+ + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
428
+ + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
429
+ + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
430
+ + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
431
+
432
+ result = sum * dall;
433
+
434
+ }
435
+
436
+ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
437
+ if (j < 4) {
438
+ d = q[j] & 63; m = q[j + 4] & 63;
439
+ } else {
440
+ d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
441
+ m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
442
+ }
443
+ }
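+ // get_scale_min_k4 unpacks the j-th 6-bit scale/min pair from the packed 12-byte scales array
+ // used by the q4_K and q5_K super-blocks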
444
+
445
+ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
446
+ const block_q4_K * x = (const block_q4_K *) vx;
447
+
448
+ const int i = blockIdx.x;
449
+
450
+ //// assume 64 threads - this is very slightly better than the one below
451
+ //const int tid = threadIdx.x;
452
+ //const int il = tid/16;
453
+ //const int ir = tid%16;
454
+ //const int is = 2*il;
455
+ //const int n = 2;
456
+
457
+ // assume 32 threads
458
+ const int tid = threadIdx.x;
459
+ const int il = tid/8;
460
+ const int ir = tid%8;
461
+ const int is = 2*il;
462
+ const int n = 4;
463
+
464
+ float * y = yy + i*QK_K + 64*il + n*ir;
465
+
466
+ const float dall = x[i].d;
467
+ const float dmin = x[i].dmin;
468
+
469
+ const uint8_t * q = x[i].qs + 32*il + n*ir;
470
+
471
+ uint8_t sc, m;
472
+ get_scale_min_k4(is + 0, x[i].scales, sc, m);
473
+ const float d1 = dall * sc; const float m1 = dmin * m;
474
+ get_scale_min_k4(is + 1, x[i].scales, sc, m);
475
+ const float d2 = dall * sc; const float m2 = dmin * m;
476
+ for (int l = 0; l < n; ++l) {
477
+ y[l + 0] = d1 * (q[l] & 0xF) - m1;
478
+ y[l +32] = d2 * (q[l] >> 4) - m2;
479
+ }
480
+ }
481
+
482
+ static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
483
+
484
+ const block_q4_K * x = (const block_q4_K *) vx;
485
+
486
+ // iqs is in 0...248 in steps of 8 =>
487
+ const int j = iqs / 64; // j is in 0...3
488
+ const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
489
+ const int is = 2*j; // is is in 0...6 in steps of 2
490
+
491
+ const float * y = yy + 64*j + ir;
492
+ const uint8_t * q = x[ib].qs + 32*j + ir;
493
+
494
+ const float dall = x[ib].d;
495
+ const float dmin = x[ib].dmin;
496
+
497
+ uint8_t sc, m;
498
+ get_scale_min_k4(is + 0, x[ib].scales, sc, m);
499
+ const float d1 = dall * sc;
500
+ const float m1 = dmin * m;
501
+ get_scale_min_k4(is + 1, x[ib].scales, sc, m);
502
+ const float d2 = dall * sc;
503
+ const float m2 = dmin * m;
504
+
505
+ float sum = 0;
506
+ for (int k = 0; k < 4; ++k) {
507
+ sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
508
+ sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
509
+ }
510
+ result = sum;
511
+
512
+ }
513
+
514
+ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
515
+ const block_q5_K * x = (const block_q5_K *) vx;
516
+
517
+ const int i = blockIdx.x;
518
+
519
+ // assume 64 threads - this is very slightly better than the one below
520
+ const int tid = threadIdx.x;
521
+ const int il = tid/16; // il is in 0...3
522
+ const int ir = tid%16; // ir is in 0...15
523
+ const int is = 2*il; // is is in 0...6
524
+
525
+ float * y = yy + i*QK_K + 64*il + 2*ir;
526
+
527
+ const float dall = x[i].d;
528
+ const float dmin = x[i].dmin;
529
+
530
+ const uint8_t * ql = x[i].qs + 32*il + 2*ir;
531
+ const uint8_t * qh = x[i].qh + 2*ir;
532
+
533
+ uint8_t sc, m;
534
+ get_scale_min_k4(is + 0, x[i].scales, sc, m);
535
+ const float d1 = dall * sc; const float m1 = dmin * m;
536
+ get_scale_min_k4(is + 1, x[i].scales, sc, m);
537
+ const float d2 = dall * sc; const float m2 = dmin * m;
538
+
539
+ uint8_t hm = 1 << (2*il);
540
+ y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
541
+ y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
542
+ hm <<= 1;
543
+ y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
544
+ y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
545
+ }
546
+
547
+ static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
548
+
549
+ const block_q5_K * x = (const block_q5_K *) vx;
550
+
551
+ // iqs is in 0...248 in steps of 8 =>
552
+ const int j = iqs / 64; // j is in 0...3
553
+ const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
554
+ const int is = 2*j; // is is in 0...6 in steps of 2
555
+
556
+ const float * y = yy + 64*j + ir;
557
+ const uint8_t * ql = x[ib].qs + 32*j + ir;
558
+ const uint8_t * qh = x[ib].qh + ir;
559
+
560
+ const float dall = x[ib].d;
561
+ const float dmin = x[ib].dmin;
562
+
563
+ uint8_t sc, m;
564
+ get_scale_min_k4(is + 0, x[ib].scales, sc, m);
565
+ const float d1 = dall * sc;
566
+ const float m1 = dmin * m;
567
+ get_scale_min_k4(is + 1, x[ib].scales, sc, m);
568
+ const float d2 = dall * sc;
569
+ const float m2 = dmin * m;
570
+
571
+ uint8_t hm = 1 << is;
572
+ float sum = 0;
573
+ for (int k = 0; k < 4; ++k) {
574
+ sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
575
+ }
576
+ hm <<= 1;
577
+ for (int k = 0; k < 4; ++k) {
578
+ sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
579
+ }
580
+ result = sum;
581
+
582
+ }
583
+
584
+ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
585
+ const block_q6_K * x = (const block_q6_K *) vx;
586
+
587
+ const int i = blockIdx.x;
588
+
589
+ // assume 64 threads - this is very slightly better than the one below
590
+ const int tid = threadIdx.x;
591
+ const int ip = tid/32; // ip is 0 or 1
592
+ const int il = tid - 32*ip; // 0...32
593
+ const int is = 8*ip + il/16;
594
+
595
+ float * y = yy + i*QK_K + 128*ip + il;
596
+
597
+ const float d = x[i].d;
598
+
599
+ const uint8_t * ql = x[i].ql + 64*ip + il;
600
+ const uint8_t qh = x[i].qh[32*ip + il];
601
+ const int8_t * sc = x[i].scales + is;
602
+
603
+ y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
604
+ y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
605
+ y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
606
+ y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
607
+ }
608
+
609
+ static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
610
+
611
+ const block_q6_K * x = (const block_q6_K *) vx;
612
+
613
+ const int ip = iqs / 128; // 0 or 1
614
+ const int il = (iqs - 128*ip)/8; // 0...15
615
+ const int is = 8*ip;
616
+
617
+ const float * y = yy + 128*ip + il;
618
+
619
+ const float d = x[ib].d;
620
+
621
+ const uint8_t * ql = x[ib].ql + 64*ip + il;
622
+ const uint8_t * qh = x[ib].qh + 32*ip + il;
623
+ const int8_t * sc = x[ib].scales + is;
624
+
625
+ result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
626
+ + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
627
+ + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
628
+ + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
629
+ + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
630
+ + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
631
+ + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
632
+ + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
633
+
634
+ }
635
+
636
+ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
637
+ const half * x = (const half *) vx;
638
+
639
+ v0 = __half2float(x[ib + iqs + 0]);
640
+ v1 = __half2float(x[ib + iqs + 1]);
641
+ }
642
+
643
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
644
+ static __global__ void dequantize_block(const void * vx, float * y, const int k) {
645
+ const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
646
+
647
+ if (i >= k) {
648
+ return;
649
+ }
650
+
651
+ const int ib = i/qk; // block index
652
+ const int iqs = (i%qk)/qr; // quant index
653
+ const int iybs = i - i%qk; // y block start index
654
+ const int y_offset = qr == 1 ? 1 : qk/2;
655
+
656
+ // dequantize
657
+ float & v0 = y[iybs + iqs + 0];
658
+ float & v1 = y[iybs + iqs + y_offset];
659
+ dequantize_kernel(vx, ib, iqs, v0, v1);
660
+ }
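+ // each dequantize_block thread writes two output values per call: for qr == 2 formats the two nibbles
+ // of one packed byte land qk/2 elements apart in y, which is what y_offset accounts for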
661
+
662
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
663
+ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
664
+ // qk = quantized weights per x block
665
+ // qr = number of quantized weights per data value in x block
666
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
667
+
668
+ if (row >= nrows) {
669
+ return;
670
+ }
671
+
672
+ const int tid = threadIdx.x;
673
+
674
+ const int iter_stride = 2*GGML_CUDA_DMMV_X;
675
+ const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
676
+ const int y_offset = qr == 1 ? 1 : qk/2;
677
+
678
+ float tmp = 0.0f; // partial sum for thread in warp
679
+
680
+ for (int i = 0; i < ncols; i += iter_stride) {
681
+ const int col = i + vals_per_iter*tid;
682
+ const int ib = (row*ncols + col)/qk; // x block index
683
+ const int iqs = (col%qk)/qr; // x quant index
684
+ const int iybs = col - col%qk; // y block start index
685
+
686
+ // processing >2 values per i iter is faster for fast GPUs
687
+ #pragma unroll
688
+ for (int j = 0; j < vals_per_iter; j += 2) {
689
+ // process 2 vals per j iter
690
+
691
+ // dequantize
692
+ float v0, v1;
693
+ dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
694
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
695
+
696
+ // matrix multiplication
697
+ tmp += v0 * y[iybs + iqs + j/qr + 0];
698
+ tmp += v1 * y[iybs + iqs + j/qr + y_offset];
699
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
700
+ }
701
+ }
702
+
703
+ // sum up partial sums and write back result
704
+ __syncthreads();
705
+ #pragma unroll
706
+ for (int mask = 16; mask > 0; mask >>= 1) {
707
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
708
+ }
709
+
710
+ if (tid == 0) {
711
+ dst[row] = tmp;
712
+ }
713
+ }
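+ // dequantize_mul_mat_vec: each output row is handled by one warp; every thread accumulates a partial
+ // dot product over its slice of the row, then the __shfl_xor_sync butterfly reduces the 32 partials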
714
+
715
+ template <int n_thread, dot_kernel_k_t dot_kernel>
716
+ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
717
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
718
+
719
+ if (row >= nrows) {
720
+ return;
721
+ }
722
+
723
+ const int tid = threadIdx.x;
724
+
725
+ const int iter_stride = QK_K;
726
+ const int vals_per_iter = iter_stride / n_thread;
727
+ const int num_blocks_per_row = ncols / QK_K;
728
+ const int ib0 = row*num_blocks_per_row;
729
+
730
+ float tmp = 0; // partial sum for thread in warp
731
+
732
+ for (int i = 0; i < ncols; i += iter_stride) {
733
+ const int col = i + vals_per_iter*tid;
734
+ const int ib = ib0 + col/QK_K; // x block index
735
+ const int iqs = col%QK_K; // x quant index
736
+ const int iybs = col - col%QK_K; // y block start index
737
+
738
+ float v;
739
+ dot_kernel(vx, ib, iqs, y + iybs, v);
740
+ tmp += v;
741
+ }
742
+
743
+ // sum up partial sums and write back result
744
+ __syncthreads();
745
+ #pragma unroll
746
+ for (int mask = 16; mask > 0; mask >>= 1) {
747
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
748
+ }
749
+
750
+ if (tid == 0) {
751
+ dst[row] = tmp;
752
+ }
753
+ }
754
+
755
+ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
756
+ const half * x = (half *) vx;
757
+
758
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
759
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
760
+
761
+ const int nrows_y = ncols_x;
762
+ const int nrows_dst = nrows_x;
763
+ const int row_dst = row_x;
764
+
765
+ float tmp = 0.0f;
766
+
767
+ for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
768
+ const int col_x = col_x0 + threadIdx.x;
769
+
770
+ if (col_x >= ncols_x) {
771
+ break;
772
+ }
773
+
774
+ // x is transposed and permuted
775
+ const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
776
+ const float xi = __half2float(x[ix]);
777
+
778
+ const int row_y = col_x;
779
+
780
+
781
+ // y is not transposed but permuted
782
+ const int iy = channel*nrows_y + row_y;
783
+
784
+ tmp += xi * y[iy];
785
+ }
786
+
787
+ // dst is not transposed and not permuted
788
+ const int idst = channel*nrows_dst + row_dst;
789
+
790
+ // sum up partial sums and write back result
791
+ __syncthreads();
792
+ #pragma unroll
793
+ for (int mask = 16; mask > 0; mask >>= 1) {
794
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
795
+ }
796
+
797
+ if (threadIdx.x == 0) {
798
+ dst[idst] = tmp;
799
+ }
800
+ }
801
+
802
+ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
803
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
804
+ const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
805
+
806
+ const half * x = (half *) vx;
807
+
808
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
809
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
810
+
811
+ const int nrows_y = ncols_x;
812
+ const int nrows_dst = nrows_x;
813
+ const int row_dst = row_x;
814
+
815
+ const int idst = channel*nrows_dst + row_dst;
816
+
817
+ float tmp = 0.0f;
818
+
819
+ for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
820
+ const int col_x = col_x0 + threadIdx.x;
821
+
822
+ if (col_x >= ncols_x) {
823
+ break;
824
+ }
825
+
826
+ const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
827
+ const float xi = __half2float(x[ix]);
828
+
829
+ const int row_y = col_x;
830
+
831
+ const int iy = channel*nrows_y + row_y;
832
+
833
+ tmp += xi * y[iy];
834
+ }
835
+
836
+ // sum up partial sums and write back result
837
+ __syncthreads();
838
+ #pragma unroll
839
+ for (int mask = 16; mask > 0; mask >>= 1) {
840
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
841
+ }
842
+
843
+ if (threadIdx.x == 0) {
844
+ dst[idst] = tmp;
845
+ }
846
+ }
847
+
848
+ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
849
+ const float * xi = (float *) cxi;
850
+ float * dsti = (float *) cdsti;
851
+
852
+ *dsti = *xi;
853
+ }
854
+
855
+ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
856
+ const float * xi = (float *) cxi;
857
+ half * dsti = (half *) cdsti;
858
+
859
+ *dsti = __float2half(*xi);
860
+ }
861
+
862
+ template <cpy_kernel_t cpy_1>
863
+ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
864
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
865
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
866
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
867
+
868
+ if (i >= ne) {
869
+ return;
870
+ }
871
+
872
+ // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
873
+ // then combine those indices with the corresponding byte offsets to get the total offsets
874
+ const int i02 = i / (ne00*ne01);
875
+ const int i01 = (i - i02*ne01*ne00) / ne00;
876
+ const int i00 = i - i02*ne01*ne00 - i01*ne00;
877
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
878
+
879
+ const int i12 = i / (ne10*ne11);
880
+ const int i11 = (i - i12*ne10*ne11) / ne10;
881
+ const int i10 = i - i12*ne10*ne11 - i11*ne10;
882
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
883
+
884
+ cpy_1(cx + x_offset, cdst + dst_offset);
885
+ }
886
+
887
+ // rope == RoPE == rotary positional embedding
888
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
889
+ const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
890
+
891
+ if (col >= ncols) {
892
+ return;
893
+ }
894
+
895
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
896
+ const int i = row*ncols + col;
897
+
898
+ const float theta = p*powf(theta_scale, col/2);
899
+ const float sin_theta = sinf(theta);
900
+ const float cos_theta = cosf(theta);
901
+
902
+ const float x0 = x[i + 0];
903
+ const float x1 = x[i + 1];
904
+
905
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
906
+ dst[i + 1] = x0*sin_theta + x1*cos_theta;
907
+ }
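+ // rope_f32 rotates consecutive (even, odd) element pairs by theta = p * theta_scale^(col/2),
+ // i.e. rotary positional embedding applied independently per row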
908
+
909
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
910
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
911
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
912
+
913
+ if (col >= ncols) {
914
+ return;
915
+ }
916
+
917
+ const int i = row*ncols + col;
918
+ // dst[i] = col > n_past + row ? -INFINITY : x[i];
919
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
920
+ }
921
+
+ // the CUDA soft max implementation differs from the CPU implementation:
+ // floats are used instead of doubles,
+ // and the values are not normalized by subtracting the row maximum inside the exponential.
+ // In theory these changes could cause rounding error and arithmetic overflow, but for LLaMA they seem to be fine.
926
+ static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
927
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
928
+ const int block_size = blockDim.x;
929
+ const int tid = threadIdx.x;
930
+
931
+ float tmp = 0.0;
932
+
933
+ for (int block_start = 0; block_start < ncols; block_start += block_size) {
934
+ const int col = block_start + tid;
935
+
936
+ if (col >= ncols) {
937
+ break;
938
+ }
939
+
940
+ const int i = row*ncols + col;
941
+ const float val = expf(x[i]);
942
+ tmp += val;
943
+ dst[i] = val;
944
+ }
945
+
946
+ // sum up partial sums
947
+ __syncthreads();
948
+ #pragma unroll
949
+ for (int mask = 16; mask > 0; mask >>= 1) {
950
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
951
+ }
952
+
953
+ for (int block_start = 0; block_start < ncols; block_start += block_size) {
954
+ const int col = block_start + tid;
955
+
956
+ if (col >= ncols) {
957
+ break;
958
+ }
959
+
960
+ const int i = row*ncols + col;
961
+ dst[i] /= tmp;
962
+ }
963
+ }
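+ // soft_max_f32: the first pass stores exp(x) and accumulates the row sum, the warp reduction combines
+ // the partial sums, and the second pass divides every element by that sum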
964
+
965
+ static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
966
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
967
+
968
+ if (i >= k) {
969
+ return;
970
+ }
971
+
972
+ dst[i] = scale * x[i];
973
+ }
974
+
975
+ static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
976
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
977
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
978
+ }
979
+
980
+ static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
981
+ const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
982
+ mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
983
+ }
984
+
985
+ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
986
+ const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
987
+ silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
988
+ }
989
+
990
+ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
991
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
992
+ const dim3 block_dims(WARP_SIZE, 1, 1);
993
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
994
+ }
995
+
996
+ static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
997
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
998
+ dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
999
+ }
1000
+
1001
+ static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1002
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1003
+ dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1004
+ }
1005
+
1006
+ static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1007
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1008
+ dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1009
+ }
1010
+
1011
+ static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1012
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1013
+ dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1014
+ }
1015
+
1016
+ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1017
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1018
+ dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1019
+ }
1020
+
1021
+ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1022
+ const int nb = k / QK_K;
1023
+ dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
1024
+ }
1025
+
1026
+ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1027
+ const int nb = k / QK_K;
1028
+ dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
1029
+ }
1030
+
1031
+ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1032
+ const int nb = k / QK_K;
1033
+ dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
1034
+ }
1035
+
1036
+ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1037
+ const int nb = k / QK_K;
1038
+ dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
1039
+ }
1040
+
1041
+ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1042
+ const int nb = k / QK_K;
1043
+ dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
1044
+ }
1045
+
1046
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1047
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1048
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1049
+ const dim3 block_nums(1, block_num_y, 1);
1050
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1051
+ dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
1052
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1053
+ }
1054
+
1055
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1056
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1057
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1058
+ const dim3 block_nums(1, block_num_y, 1);
1059
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1060
+ dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
1061
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1062
+ }
1063
+
1064
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1065
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1066
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1067
+ const dim3 block_nums(1, block_num_y, 1);
1068
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1069
+ dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
1070
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1071
+ }
1072
+
1073
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1074
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1075
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1076
+ const dim3 block_nums(1, block_num_y, 1);
1077
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1078
+ dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
1079
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1080
+ }
1081
+
1082
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1083
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1084
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1085
+ const dim3 block_nums(1, block_num_y, 1);
1086
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1087
+ dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
1088
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1089
+ }
1090
+
1091
+ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1092
+ GGML_ASSERT(ncols % QK_K == 0);
1093
+ const int ny = 2;
1094
+ const int block_num_y = (nrows + ny - 1) / ny;
1095
+ const dim3 block_nums(1, block_num_y, 1);
1096
+ const dim3 block_dims(32, ny, 1);
1097
+ dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1098
+ }
1099
+
1100
+ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1101
+ GGML_ASSERT(ncols % QK_K == 0);
1102
+ const int ny = 2;
1103
+ const int block_num_y = (nrows + ny - 1) / ny;
1104
+ const dim3 block_nums(1, block_num_y, 1);
1105
+ const dim3 block_dims(32, ny, 1);
1106
+ dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1107
+ }
1108
+
1109
+ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1110
+ GGML_ASSERT(ncols % QK_K == 0);
1111
+ const int ny = 2;
1112
+ const int block_num_y = (nrows + ny - 1) / ny;
1113
+ const dim3 block_nums(1, block_num_y, 1);
1114
+ const dim3 block_dims(32, ny, 1);
1115
+ dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1116
+ }
1117
+
1118
+ static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1119
+ GGML_ASSERT(ncols % QK_K == 0);
1120
+ const int ny = 2;
1121
+ const int block_num_y = (nrows + ny - 1) / ny;
1122
+ const dim3 block_nums(1, block_num_y, 1);
1123
+ const dim3 block_dims(32, ny, 1);
1124
+ dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1125
+ }
1126
+
1127
+ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1128
+ GGML_ASSERT(ncols % QK_K == 0);
1129
+ const int ny = 2;
1130
+ const int block_num_y = (nrows + ny - 1) / ny;
1131
+ const dim3 block_nums(1, block_num_y, 1);
1132
+ const dim3 block_dims(32, ny, 1);
1133
+ dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1134
+ }
1135
+
1136
+ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1137
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1138
+ dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1139
+ }
1140
+
1141
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1142
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1143
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1144
+ const dim3 block_nums(1, block_num_y, 1);
1145
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1146
+ dequantize_mul_mat_vec<1, 1, convert_f16>
1147
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1148
+ }
1149
+
1150
+ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
1151
+ switch (type) {
1152
+ case GGML_TYPE_Q4_0:
1153
+ return dequantize_row_q4_0_cuda;
1154
+ case GGML_TYPE_Q4_1:
1155
+ return dequantize_row_q4_1_cuda;
1156
+ case GGML_TYPE_Q5_0:
1157
+ return dequantize_row_q5_0_cuda;
1158
+ case GGML_TYPE_Q5_1:
1159
+ return dequantize_row_q5_1_cuda;
1160
+ case GGML_TYPE_Q8_0:
1161
+ return dequantize_row_q8_0_cuda;
1162
+ case GGML_TYPE_Q2_K:
1163
+ return dequantize_row_q2_K_cuda;
1164
+ case GGML_TYPE_Q3_K:
1165
+ return dequantize_row_q3_K_cuda;
1166
+ case GGML_TYPE_Q4_K:
1167
+ return dequantize_row_q4_K_cuda;
1168
+ case GGML_TYPE_Q5_K:
1169
+ return dequantize_row_q5_K_cuda;
1170
+ case GGML_TYPE_Q6_K:
1171
+ return dequantize_row_q6_K_cuda;
1172
+ case GGML_TYPE_F16:
1173
+ return convert_fp16_to_fp32_cuda;
1174
+ default:
1175
+ return nullptr;
1176
+ }
1177
+ }
1178
+
1179
+ static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
1180
+ const dim3 block_nums(1, nrows_x, nchannels_x);
1181
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1182
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
1183
+ }
1184
+
1185
+ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
1186
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
1187
+ const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
1188
+
1189
+ const dim3 block_nums(1, nrows_x, nchannels_x);
1190
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1191
+ mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
1192
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
1193
+ }
1194
+
1195
+ static void ggml_cpy_f32_f32_cuda(
1196
+ const char * cx, char * cdst, const int ne,
1197
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
1198
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
1199
+
1200
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
1201
+ cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
1202
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
1203
+ }
1204
+
1205
+ static void ggml_cpy_f32_f16_cuda(
1206
+ const char * cx, char * cdst, const int ne,
1207
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
1208
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
1209
+
1210
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
1211
+ cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
1212
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
1213
+ }
1214
+
1215
+ static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
1216
+ const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
1217
+ scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
1218
+ }
1219
+
1220
+ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
1221
+ GGML_ASSERT(nrows % 2 == 0);
1222
+ const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
1223
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
1224
+ const dim3 block_nums(num_blocks_x, nrows, 1);
1225
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
1226
+ }
1227
+
1228
+ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
1229
+ const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
1230
+ const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
1231
+ const dim3 block_nums(block_num_x, nrows_x, 1);
1232
+ diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
1233
+ }
1234
+
1235
+ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
1236
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1237
+ const dim3 block_nums(1, nrows_x, 1);
1238
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
1239
+ }
1240
+
1241
+ // buffer pool for cuda
1242
+ #define MAX_CUDA_BUFFERS 256
1243
+
1244
+ struct scoped_spin_lock {
1245
+ std::atomic_flag& lock;
1246
+ scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
1247
+ while (lock.test_and_set(std::memory_order_acquire)) {
1248
+ ; // spin
1249
+ }
1250
+ }
1251
+ ~scoped_spin_lock() {
1252
+ lock.clear(std::memory_order_release);
1253
+ }
1254
+ scoped_spin_lock(const scoped_spin_lock&) = delete;
1255
+ scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
1256
+ };
1257
+
1258
+ struct cuda_buffer {
1259
+ void * ptr = nullptr;
1260
+ size_t size = 0;
1261
+ };
1262
+
1263
+ static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
1264
+ static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
1265
+
1266
+ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
1267
+ scoped_spin_lock lock(g_cuda_pool_lock);
1268
+ int id;
1269
+ CUDA_CHECK(cudaGetDevice(&id));
1270
+
1271
+ for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
1272
+ cuda_buffer& b = g_cuda_buffer_pool[id][i];
1273
+ if (b.size >= size && b.ptr != nullptr) {
1274
+ void * ptr = b.ptr;
1275
+ *actual_size = b.size;
1276
+ b.ptr = nullptr;
1277
+ b.size = 0;
1278
+ return ptr;
1279
+ }
1280
+ }
1281
+ void * ptr;
1282
+ CUDA_CHECK(cudaMalloc((void **) &ptr, size));
1283
+ *actual_size = size;
1284
+ return ptr;
1285
+ }
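+ // ggml_cuda_pool_malloc first tries to reuse a pooled buffer of at least `size` bytes;
+ // only if none fits does it fall back to cudaMalloc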
1286
+
1287
+ static void ggml_cuda_pool_free(void * ptr, size_t size) {
1288
+ scoped_spin_lock lock(g_cuda_pool_lock);
1289
+ int id;
1290
+ CUDA_CHECK(cudaGetDevice(&id));
1291
+
1292
+ for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
1293
+ cuda_buffer& b = g_cuda_buffer_pool[id][i];
1294
+ if (b.ptr == nullptr) {
1295
+ b.ptr = ptr;
1296
+ b.size = size;
1297
+ return;
1298
+ }
1299
+ }
1300
+ fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
1301
+ CUDA_CHECK(cudaFree(ptr));
1302
+ }
1303
+
1304
+
1305
+ static void * g_scratch_buffer = nullptr;
1306
+ static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
1307
+ static size_t g_scratch_offset = 0;
1308
+
1309
+ #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
1310
+ #define GGML_CUDA_MAX_EVENTS 64
1311
+
1312
+ static int g_device_count = -1;
1313
+ static int g_main_device = 0;
1314
+ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1315
+
1316
+ static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
1317
+
1318
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1319
+
1320
+ static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1321
+ static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
1322
+
1323
+ void ggml_init_cublas() {
1324
+ static bool initialized = false;
1325
+
1326
+ if (!initialized) {
1327
+ CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
1328
+ GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
1329
+ int64_t total_vram = 0;
1330
+ fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
1331
+ for (int id = 0; id < g_device_count; ++id) {
1332
+ cudaDeviceProp prop;
1333
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
1334
+ fprintf(stderr, " Device %d: %s\n", id, prop.name);
1335
+ g_tensor_split[id] = total_vram;
1336
+ total_vram += prop.totalGlobalMem;
1337
+ }
1338
+ for (int id = 0; id < g_device_count; ++id) {
1339
+ g_tensor_split[id] /= total_vram;
1340
+ }
1341
+
1342
+ for (int id = 0; id < g_device_count; ++id) {
1343
+ CUDA_CHECK(cudaSetDevice(id));
1344
+
1345
+ // create streams
1346
+ for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
1347
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
1348
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
1349
+ }
1350
+ // create events
1351
+ for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
1352
+ CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
1353
+ }
1354
+
1355
+ // create cublas handle
1356
+ CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
1357
+ CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
1358
+ }
1359
+
1360
+ // configure logging to stdout
1361
+ // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
1362
+
1363
+ initialized = true;
1364
+ }
1365
+ }
1366
+
1367
+ void ggml_cuda_set_tensor_split(const float * tensor_split) {
1368
+ bool all_zero = true;
1369
+ for (int i = 0; i < g_device_count; ++i) {
1370
+ if (tensor_split[i] != 0.0f) {
1371
+ all_zero = false;
1372
+ break;
1373
+ }
1374
+ }
1375
+ if (all_zero) {
1376
+ return;
1377
+ }
1378
+ float split_sum = 0.0f;
1379
+ for (int i = 0; i < g_device_count; ++i) {
1380
+ g_tensor_split[i] = split_sum;
1381
+ split_sum += tensor_split[i];
1382
+ }
1383
+ for (int i = 0; i < g_device_count; ++i) {
1384
+ g_tensor_split[i] /= split_sum;
1385
+ }
1386
+ }
1387
+
1388
+ void * ggml_cuda_host_malloc(size_t size) {
1389
+ if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
1390
+ return nullptr;
1391
+ }
1392
+
1393
+ void * ptr = nullptr;
1394
+ cudaError_t err = cudaMallocHost((void **) &ptr, size);
1395
+ if (err != cudaSuccess) {
1396
+         // the allocation error can be bypassed; a null ptr will be assigned out of this function
+         // this can fix the OOM error seen in WSL
1398
+ cudaGetLastError();
1399
+ fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
1400
+ size/1024.0/1024.0, cudaGetErrorString(err));
1401
+ return nullptr;
1402
+ }
1403
+
1404
+ return ptr;
1405
+ }
1406
+
1407
+ void ggml_cuda_host_free(void * ptr) {
1408
+ CUDA_CHECK(cudaFreeHost(ptr));
1409
+ }
1410
+
1411
+ static cudaError_t ggml_cuda_cpy_tensor_2d(
1412
+ void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1413
+
1414
+ cudaMemcpyKind kind;
1415
+ char * src_ptr;
1416
+ if (src->backend == GGML_BACKEND_CPU) {
1417
+ kind = cudaMemcpyHostToDevice;
1418
+ src_ptr = (char *) src->data;
1419
+ } else if (src->backend == GGML_BACKEND_GPU) {
1420
+ kind = cudaMemcpyDeviceToDevice;
1421
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
1422
+ int id;
1423
+ CUDA_CHECK(cudaGetDevice(&id));
1424
+ src_ptr = (char *) extra->data_device[id];
1425
+ } else {
1426
+ GGML_ASSERT(false);
1427
+ }
1428
+ char * dst_ptr = (char *) dst;
1429
+
1430
+ const int64_t ne0 = src->ne[0];
1431
+ const int64_t nb0 = src->nb[0];
1432
+ const int64_t nb1 = src->nb[1];
1433
+ const int64_t nb2 = src->nb[2];
1434
+ const int64_t nb3 = src->nb[3];
1435
+ const enum ggml_type type = src->type;
1436
+ const int64_t ts = ggml_type_size(type);
1437
+ const int64_t bs = ggml_blck_size(type);
1438
+ int64_t i1_diff = i1_high - i1_low;
1439
+
1440
+ const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1441
+ if (nb0 == ts && nb1 == ts*ne0/bs) {
1442
+ return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
1443
+ } else if (nb0 == ts) {
1444
+ return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
1445
+ } else {
1446
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1447
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
1448
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1449
+ // pretend the row is a matrix with cols=1
1450
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
1451
+ if (r != cudaSuccess) return r;
1452
+ }
1453
+ return cudaSuccess;
1454
+ }
1455
+ }
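+ // ggml_cuda_cpy_tensor_2d picks the cheapest copy for the source layout: a single cudaMemcpyAsync when
+ // the rows are fully contiguous, cudaMemcpy2DAsync when rows are internally contiguous but strided,
+ // and a per-row fallback for non-contiguous elements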
1456
+
1457
+ inline void ggml_cuda_op_add(
1458
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1459
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1460
+ cudaStream_t & cudaStream_main){
1461
+
1462
+ GGML_ASSERT(src0_ddf_i != nullptr);
1463
+ GGML_ASSERT(src1_ddf_i != nullptr);
1464
+ GGML_ASSERT(dst_ddf_i != nullptr);
1465
+
1466
+ const int64_t ne0 = src0->ne[0];
1467
+ const int64_t i01_diff = i01_high - i01_low;
1468
+
1469
+ // compute
1470
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
1471
+ CUDA_CHECK(cudaGetLastError());
1472
+
1473
+ (void) src1;
1474
+ (void) dst;
1475
+ (void) src0_ddq_i;
1476
+ (void) i02;
1477
+ (void) i1;
1478
+ }
1479
+
1480
+ inline void ggml_cuda_op_mul(
1481
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1482
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1483
+ cudaStream_t & cudaStream_main){
1484
+
1485
+ GGML_ASSERT(src0_ddf_i != nullptr);
1486
+ GGML_ASSERT(src1_ddf_i != nullptr);
1487
+ GGML_ASSERT(dst_ddf_i != nullptr);
1488
+
1489
+ const int64_t ne00 = src0->ne[0];
1490
+
1491
+ const int64_t ne10 = src1->ne[0];
1492
+ const int64_t ne11 = src1->ne[1];
1493
+
1494
+ for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
1495
+ const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
1496
+
1497
+ float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
1498
+ float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
1499
+ float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
1500
+
1501
+ // compute
1502
+ mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
1503
+ CUDA_CHECK(cudaGetLastError());
1504
+ }
1505
+
1506
+ (void) dst;
1507
+ (void) src0_ddq_i;
1508
+ (void) i02;
1509
+ }
1510
+
1511
+ inline void ggml_cuda_op_silu(
1512
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1513
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1514
+ cudaStream_t & cudaStream_main){
1515
+
1516
+ GGML_ASSERT(src0_ddf_i != nullptr);
1517
+ GGML_ASSERT(dst_ddf_i != nullptr);
1518
+
1519
+ const int64_t ne00 = src0->ne[0];
1520
+ const int64_t i01_diff = i01_high - i01_low;
1521
+
1522
+ // compute
1523
+ silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
1524
+ CUDA_CHECK(cudaGetLastError());
1525
+
1526
+ (void) src1;
1527
+ (void) dst;
1528
+ (void) src0_ddq_i;
1529
+ (void) src1_ddf_i;
1530
+ (void) i02;
1531
+ (void) i1;
1532
+ }
1533
+
1534
+ inline void ggml_cuda_op_rms_norm(
1535
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1536
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1537
+ cudaStream_t & cudaStream_main){
1538
+
1539
+ GGML_ASSERT(src0_ddf_i != nullptr);
1540
+ GGML_ASSERT(dst_ddf_i != nullptr);
1541
+
1542
+ const int64_t ne00 = src0->ne[0];
1543
+ const int64_t i01_diff = i01_high - i01_low;
1544
+
1545
+ // compute
1546
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
1547
+ CUDA_CHECK(cudaGetLastError());
1548
+
1549
+ (void) src1;
1550
+ (void) dst;
1551
+ (void) src0_ddq_i;
1552
+ (void) src1_ddf_i;
1553
+ (void) i02;
1554
+ (void) i1;
1555
+ }
1556
+
1557
+ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1558
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1559
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1560
+ cudaStream_t & cudaStream_main){
1561
+
1562
+ GGML_ASSERT(src0_ddq_i != nullptr);
1563
+ GGML_ASSERT(src1_ddf_i != nullptr);
1564
+ GGML_ASSERT(dst_ddf_i != nullptr);
1565
+
1566
+ const int64_t ne00 = src0->ne[0];
1567
+ const int64_t nrows = i01_high - i01_low;
1568
+
1569
+ switch (src0->type) {
1570
+ case GGML_TYPE_Q4_0:
1571
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1572
+ break;
1573
+ case GGML_TYPE_Q4_1:
1574
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1575
+ break;
1576
+ case GGML_TYPE_Q5_0:
1577
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1578
+ break;
1579
+ case GGML_TYPE_Q5_1:
1580
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1581
+ break;
1582
+ case GGML_TYPE_Q8_0:
1583
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1584
+ break;
1585
+ case GGML_TYPE_Q2_K:
1586
+ dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1587
+ break;
1588
+ case GGML_TYPE_Q3_K:
1589
+ dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1590
+ break;
1591
+ case GGML_TYPE_Q4_K:
1592
+ dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1593
+ break;
1594
+ case GGML_TYPE_Q5_K:
1595
+ dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1596
+ break;
1597
+ case GGML_TYPE_Q6_K:
1598
+ dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1599
+ break;
1600
+ case GGML_TYPE_F16:
1601
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1602
+ break;
1603
+ default:
1604
+ GGML_ASSERT(false);
1605
+ break;
1606
+ }
1607
+ CUDA_CHECK(cudaGetLastError());
1608
+
1609
+ (void) src1;
1610
+ (void) dst;
1611
+ (void) src0_ddf_i;
1612
+ (void) i02;
1613
+ (void) i1;
1614
+ }
1615
+
1616
+ inline void ggml_cuda_op_mul_mat_cublas(
1617
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1618
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1619
+ cudaStream_t & cudaStream_main){
1620
+
1621
+ GGML_ASSERT(src0_ddf_i != nullptr);
1622
+ GGML_ASSERT(src1_ddf_i != nullptr);
1623
+ GGML_ASSERT(dst_ddf_i != nullptr);
1624
+
1625
+ const float alpha = 1.0f;
1626
+ const float beta = 0.0f;
1627
+
1628
+ const int64_t ne00 = src0->ne[0];
1629
+
1630
+ const int64_t ne10 = src1->ne[0];
1631
+ const int64_t ne11 = src1->ne[1];
1632
+
1633
+ const int64_t ne0 = dst->ne[0];
1634
+ const int64_t i01_diff = i01_high - i01_low;
1635
+
1636
+ int id;
1637
+ CUDA_CHECK(cudaGetDevice(&id));
1638
+
1639
+ // the main device has a larger memory buffer to hold the results from all GPUs
1640
+ // ldc == nrows of the matrix that cuBLAS writes into
1641
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
1642
+
1643
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
1644
+ CUBLAS_CHECK(
1645
+ cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
1646
+ i01_diff, ne11, ne10,
1647
+ &alpha, src0_ddf_i, ne00,
1648
+ src1_ddf_i, ne10,
1649
+ &beta, dst_ddf_i, ldc));
1650
+
1651
+ (void) dst;
1652
+ (void) src0_ddq_i;
1653
+ (void) i02;
1654
+ (void) i1;
1655
+ }
1656
+
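// Hedged sketch of the cuBLAS call pattern used in ggml_cuda_op_mul_mat_cublas above:
// both operands are row-major, so column-major cuBLAS sees them transposed, and
// CUBLAS_OP_T on src0 yields dst = src1 * src0^T directly in row-major memory.
// Matrix sizes and values below are made up for illustration.
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    const int nrows0 = 2, ne00 = 3, ne11 = 1;             // src0: 2x3, src1: 1x3, dst: 1x2
    const float src0[nrows0 * ne00] = {1, 2, 3, 4, 5, 6}; // row-major weight rows
    const float src1[ne11 * ne00]   = {1, 1, 1};
    float dst[ne11 * nrows0]        = {0, 0};

    float *d_src0, *d_src1, *d_dst;
    cudaMalloc(&d_src0, sizeof(src0));
    cudaMalloc(&d_src1, sizeof(src1));
    cudaMalloc(&d_dst,  sizeof(dst));
    cudaMemcpy(d_src0, src0, sizeof(src0), cudaMemcpyHostToDevice);
    cudaMemcpy(d_src1, src1, sizeof(src1), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f, beta = 0.0f;
    // m = rows of the dst slice, n = ne11, k = ne00; lda/ldb/ldc as in the call above
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                nrows0, ne11, ne00,
                &alpha, d_src0, ne00,
                        d_src1, ne00,
                &beta,  d_dst,  nrows0);

    cudaMemcpy(dst, d_dst, sizeof(dst), cudaMemcpyDeviceToHost);
    printf("dst = [%.0f, %.0f]  (expected [6, 15])\n", dst[0], dst[1]);

    cublasDestroy(handle);
    cudaFree(d_src0); cudaFree(d_src1); cudaFree(d_dst);
    return 0;
}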
1657
+ inline void ggml_cuda_op_rope(
1658
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1659
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1660
+ cudaStream_t & cudaStream_main){
1661
+
1662
+ GGML_ASSERT(src0_ddf_i != nullptr);
1663
+ GGML_ASSERT(dst_ddf_i != nullptr);
1664
+
1665
+ const int64_t ne00 = src0->ne[0];
1666
+ const int64_t i01_diff = i01_high - i01_low;
1667
+
1668
+ const int n_past = ((int32_t *) src1->data)[0];
1669
+ const int n_dims = ((int32_t *) src1->data)[1];
1670
+ const int mode = ((int32_t *) src1->data)[2];
1671
+ GGML_ASSERT(mode == 0);
1672
+
1673
+ const float theta_scale = powf(10000.0, -2.0f/n_dims);
1674
+ const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
1675
+
1676
+ // compute
1677
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
1678
+ CUDA_CHECK(cudaGetLastError());
1679
+
1680
+ (void) dst;
1681
+ (void) src0_ddq_i;
1682
+ (void) src1_ddf_i;
1683
+ (void) i1;
1684
+ }
1685
+
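// Hedged CPU reference for what rope_f32_cuda computes per row in mode 0: consecutive
// pairs (x[2i], x[2i+1]) are rotated by theta = p * theta_scale^i, with
// theta_scale = 10000^(-2/n_dims) as above. Treat this as a sketch of the standard
// rotary embedding under that assumption, not a line-for-line copy of the CUDA kernel.
#include <math.h>
#include <stdio.h>

static void rope_row_ref(float * x, int n_dims, float p) {
    const float theta_scale = powf(10000.0f, -2.0f / n_dims);
    float theta = p;
    for (int i = 0; i < n_dims; i += 2) {
        const float c = cosf(theta), s = sinf(theta);
        const float x0 = x[i], x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;
        x[i + 1] = x0 * s + x1 * c;
        theta *= theta_scale; // theta for the next pair: p * theta_scale^(i/2 + 1)
    }
}

int main() {
    float row[4] = {1.0f, 0.0f, 1.0f, 0.0f}; // toy 4-dim row
    rope_row_ref(row, 4, /* p = n_past + i02 = */ 3.0f);
    printf("%f %f %f %f\n", row[0], row[1], row[2], row[3]);
    return 0;
}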
1686
+ inline void ggml_cuda_op_diag_mask_inf(
1687
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1688
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1689
+ cudaStream_t & cudaStream_main){
1690
+
1691
+ GGML_ASSERT(src0_ddf_i != nullptr);
1692
+ GGML_ASSERT(dst_ddf_i != nullptr);
1693
+
1694
+ const int64_t ne00 = src0->ne[0];
1695
+ const int64_t ne01 = src0->ne[1];
1696
+ const int64_t i01_diff = i01_high - i01_low;
1697
+
1698
+ const int n_past = ((int32_t *) src1->data)[0];
1699
+
1700
+ // compute
1701
+ diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
1702
+ CUDA_CHECK(cudaGetLastError());
1703
+
1704
+ (void) dst;
1705
+ (void) src0_ddq_i;
1706
+ (void) src1_ddf_i;
1707
+ (void) i02;
1708
+ (void) i1;
1709
+ }
1710
+
1711
+ inline void ggml_cuda_op_soft_max(
1712
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1713
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1714
+ cudaStream_t & cudaStream_main){
1715
+
1716
+ GGML_ASSERT(src0_ddf_i != nullptr);
1717
+ GGML_ASSERT(dst_ddf_i != nullptr);
1718
+
1719
+ const int64_t ne00 = src0->ne[0];
1720
+ const int64_t i01_diff = i01_high - i01_low;
1721
+
1722
+ // compute
1723
+ soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
1724
+ CUDA_CHECK(cudaGetLastError());
1725
+
1726
+ (void) src1;
1727
+ (void) dst;
1728
+ (void) src0_ddq_i;
1729
+ (void) src1_ddf_i;
1730
+ (void) i02;
1731
+ (void) i1;
1732
+ }
1733
+
1734
+ inline void ggml_cuda_op_scale(
1735
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1736
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1737
+ cudaStream_t & cudaStream_main){
1738
+
1739
+ GGML_ASSERT(src0_ddf_i != nullptr);
1740
+ GGML_ASSERT(dst_ddf_i != nullptr);
1741
+
1742
+ const float scale = ((float *) src1->data)[0];
1743
+
1744
+ const int64_t ne00 = src0->ne[0];
1745
+ const int64_t i01_diff = i01_high - i01_low;
1746
+
1747
+ // compute
1748
+ scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
1749
+ CUDA_CHECK(cudaGetLastError());
1750
+
1751
+ (void) src1;
1752
+ (void) dst;
1753
+ (void) src0_ddq_i;
1754
+ (void) src1_ddf_i;
1755
+ (void) i02;
1756
+ (void) i1;
1757
+ }
1758
+
1759
+ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
1760
+ ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
1761
+ const int64_t ne00 = src0->ne[0];
1762
+ const int64_t ne01 = src0->ne[1];
1763
+ const int64_t ne02 = src0->ne[2];
1764
+ const int64_t ne03 = src0->ne[3];
1765
+ const int64_t nrows0 = ggml_nrows(src0);
1766
+
1767
+ const bool use_src1 = src1 != nullptr;
1768
+ const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
1769
+ const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
1770
+ const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
1771
+ const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
1772
+
1773
+ const int64_t ne0 = dst->ne[0];
1774
+ const int64_t ne1 = dst->ne[1];
1775
+
1776
+ const int nb2 = dst->nb[2];
1777
+ const int nb3 = dst->nb[3];
1778
+
1779
+ GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
1780
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
1781
+
1782
+ // strides for iteration over dims 3 and 2
1783
+ const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
1784
+ const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
1785
+ const int64_t src0_stride = ne00 * ne01 * stride_mod;
1786
+ const int64_t src1_stride = ne10 * ne11 * stride_mod;
1787
+ const int64_t dst_stride = ne0 * ne1 * stride_mod;
1788
+
1789
+ const size_t src0_ts = ggml_type_size(src0->type);
1790
+ const size_t src0_bs = ggml_blck_size(src0->type);
1791
+
1792
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
1793
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
1794
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
1795
+
1796
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
1797
+ const bool src0_is_contiguous = ggml_is_contiguous(src0);
1798
+ const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
1799
+
1800
+ const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
1801
+ const bool src1_stays_on_host = use_src1 && (
1802
+ dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
1803
+
1804
+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
1805
+
1806
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
1807
+
1808
+ // dd = data device
1809
+ char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
1810
+ float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
1811
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1812
+ float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1813
+
1814
+ // asq = actual size quantized, asf = actual size float
1815
+ size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
1816
+ size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
1817
+ size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1818
+ size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
1819
+
1820
+ for (int id = 0; id < g_device_count; ++id) {
1821
+ if (!split && id != g_main_device) {
1822
+ continue;
1823
+ }
1824
+
1825
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
1826
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
1827
+
1828
+ int64_t row_low, row_high;
1829
+ if (split) {
1830
+ row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
1831
+ row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
1832
+ } else {
1833
+ row_low = 0;
1834
+ row_high = nrows0;
1835
+ }
1836
+ if (row_low == row_high) {
1837
+ continue;
1838
+ }
1839
+
1840
+ int64_t row_diff = row_high - row_low;
1841
+
1842
+ cudaSetDevice(id);
1843
+
1844
+ if (src0_on_device && src0_is_contiguous) {
1845
+ if (src0_is_f32) {
1846
+ src0_ddf[id] = (float *) src0_extra->data_device[id];
1847
+ } else {
1848
+ src0_ddq[id] = (char *) src0_extra->data_device[id];
1849
+ }
1850
+ } else {
1851
+ if (src0_is_f32) {
1852
+ src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
1853
+ } else {
1854
+ src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
1855
+ }
1856
+ }
1857
+
1858
+ if (src0_needs_f32 && !src0_is_f32) {
1859
+ src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
1860
+ }
1861
+
1862
+ if (use_src1 && !src1_stays_on_host) {
1863
+ if (src1_on_device && src1_is_contiguous) {
1864
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
1865
+ } else {
1866
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
1867
+ }
1868
+ }
1869
+ if (dst_on_device) {
1870
+ dst_ddf[id] = (float *) dst_extra->data_device[id];
1871
+ } else {
1872
+ size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
1873
+ dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
1874
+ }
1875
+
1876
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
1877
+ const int64_t i02_max = flatten_rows ? 1 : ne02;
1878
+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
1879
+
1880
+ for (int64_t i03 = 0; i03 < i03_max; i03++) {
1881
+ const int64_t i13 = i03 % ne13;
1882
+ for (int64_t i02 = 0; i02 < i02_max; i02++) {
1883
+ const int64_t i12 = i02 % ne12;
1884
+
1885
+ const int64_t i0 = i03*ne02 + i02;
1886
+
1887
+ // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
1888
+ const int64_t i0_offset_low = row_low/rows_per_iter;
1889
+ const int64_t i0_offset_high = row_high/rows_per_iter;
1890
+
1891
+ int64_t i01_low = 0;
1892
+ int64_t i01_high = rows_per_iter;
1893
+ if (split) {
1894
+ if (i0 < i0_offset_low || i0 > i0_offset_high) {
1895
+ continue;
1896
+ }
1897
+ if (i0 == i0_offset_low) {
1898
+ i01_low = row_low % rows_per_iter;
1899
+ }
1900
+ if (i0 == i0_offset_high) {
1901
+ i01_high = row_high % rows_per_iter;
1902
+ }
1903
+ }
1904
+
1905
+ // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
1906
+ // Removing the first assert or changing the order of the arguments causes the second assert to fail.
1907
+ // Removing both asserts causes i01_high to become 0, which in turn produces garbage output.
1908
+ // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
1909
+ GGML_ASSERT(i01_low == 0 || g_device_count > 1);
1910
+ GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
1911
+
1912
+ const int64_t i01_diff = i01_high - i01_low;
1913
+ if (i01_diff == 0) {
1914
+ continue;
1915
+ }
1916
+ const int64_t i11 = i13*ne12 + i12;
1917
+
1918
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
1919
+ cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
1920
+ cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
1921
+
1922
+ // for split tensors the data begins at i0 == i0_offset_low
1923
+ char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
1924
+ float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
1925
+ float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
1926
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
1927
+
1928
+ // for split tensors the data pointer needs to be rounded down
1929
+ // to the bin edge for i03, i02 bins beyond the first
1930
+ if (i0 - i0_offset_low > 0) {
1931
+ GGML_ASSERT(!flatten_rows);
1932
+ src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
1933
+ src0_ddf_i -= (row_low % ne01)*ne00;
1934
+ dst_ddf_i -= (row_low % ne0)*ne1;
1935
+ }
1936
+
1937
+ // the main device memory buffer can be on VRAM scratch, with space for all partial results
1938
+ // in that case an offset on dst_ddf_i is needed
1939
+ if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
1940
+ dst_ddf_i += i01_low; // offset is 0 if no tensor split
1941
+ }
1942
+
1943
+ // copy src0, src1 to device if necessary
1944
+ if (use_src1 && !src1_stays_on_host) {
1945
+ if (src1->backend == GGML_BACKEND_CPU) {
1946
+ GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
1947
+ int64_t nrows1 = flatten_rows ? nrows0 : ne11;
1948
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
1949
+ } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
1950
+ if (id != g_main_device) {
1951
+ GGML_ASSERT(!flatten_rows);
1952
+ float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
1953
+ src1_ddf_i_source += i11*src1_stride;
1954
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
1955
+ cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
1956
+ }
1957
+ } else if (src1_on_device && !src1_is_contiguous) {
1958
+ GGML_ASSERT(!split);
1959
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
1960
+ } else {
1961
+ GGML_ASSERT(false);
1962
+ }
1963
+ }
1964
+ CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
1965
+
1966
+ if (!src0_on_device || !src0_is_contiguous) {
1967
+ if (src0_is_f32) {
1968
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1969
+ } else {
1970
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1971
+ }
1972
+ }
1973
+
1974
+ // convert src0 to f32 if it is necessary for the ggml_cuda_op
1975
+ if (src0_needs_f32 && !src0_is_f32) {
1976
+ to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
1977
+ CUDA_CHECK(cudaGetLastError());
1978
+ }
1979
+
1980
+ // make the main stream wait until the src1 memcpy is done
1981
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
1982
+
1983
+ // do the computation
1984
+ op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
1985
+
1986
+ // copy dst to host or other device if necessary
1987
+ if (!dst_on_device) {
1988
+ void * dst_off_device;
1989
+ cudaMemcpyKind kind;
1990
+ if (dst->backend == GGML_BACKEND_CPU) {
1991
+ dst_off_device = dst->data;
1992
+ kind = cudaMemcpyDeviceToHost;
1993
+ } else if (dst->backend == GGML_BACKEND_GPU) {
1994
+ dst_off_device = dst_extra->data_device[g_main_device];
1995
+ kind = cudaMemcpyDeviceToDevice;
1996
+ } else {
1997
+ GGML_ASSERT(false);
1998
+ }
1999
+ if (split) {
2000
+ // src0 (the weight matrix) is stored as a transposed matrix for better memory layout.
2001
+ // dst is NOT transposed.
2002
+ // The outputs of cuBLAS matrix-matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
2003
+ // Instead they need to be copied to the correct slice along ne0 (the dst row index).
2004
+ // If dst is a vector with ne0 == 1 this copy is not strictly needed, but it still produces correct results.
2005
+ for (int64_t j = 0; j < ne1; ++j) {
2006
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
2007
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
2008
+ }
2009
+ } else {
2010
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
2011
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
2012
+ }
2013
+ }
2014
+ }
2015
+ }
2016
+ }
2017
+
2018
+ // wait until each device is finished, then free its buffers
2019
+ for (int id = 0; id < g_device_count; ++id) {
2020
+ CUDA_CHECK(cudaSetDevice(id));
2021
+ CUDA_CHECK(cudaDeviceSynchronize());
2022
+ if (src0_asq[id] > 0) {
2023
+ ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
2024
+ }
2025
+ if (src0_asf[id] > 0) {
2026
+ ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
2027
+ }
2028
+ if (src1_asf[id] > 0) {
2029
+ ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
2030
+ }
2031
+ if (dst_asf[id] > 0) {
2032
+ ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
2033
+ }
2034
+ }
2035
+ }
2036
+
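// Hedged sketch of the overlap pattern used in ggml_cuda_op above: the src1 copy runs
// on its own stream, an event is recorded after it, and the main stream waits on that
// event before launching compute. The kernel and names below are local to this sketch.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

__global__ void scale_kernel(float * x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0f;
}

int main() {
    const int n = 1 << 20;
    float * h = (float *) malloc(n * sizeof(float));
    for (int i = 0; i < n; ++i) h[i] = 1.0f;

    float * d;
    cudaMalloc(&d, n * sizeof(float));

    cudaStream_t stream_main, stream_copy;
    cudaStreamCreate(&stream_main);
    cudaStreamCreate(&stream_copy);
    cudaEvent_t copy_done;
    cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming);

    cudaMemcpyAsync(d, h, n * sizeof(float), cudaMemcpyHostToDevice, stream_copy);
    cudaEventRecord(copy_done, stream_copy);        // mark the end of the copy
    cudaStreamWaitEvent(stream_main, copy_done, 0); // compute waits only on the copy
    scale_kernel<<<(n + 255) / 256, 256, 0, stream_main>>>(d, n);

    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost); // implicit sync
    printf("h[0] = %.1f (expected 2.0)\n", h[0]);

    cudaEventDestroy(copy_done);
    cudaStreamDestroy(stream_main);
    cudaStreamDestroy(stream_copy);
    cudaFree(d);
    free(h);
    return 0;
}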
2037
+ void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2038
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2039
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
2040
+ }
2041
+
2042
+ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2043
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2044
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
2045
+ }
2046
+
2047
+ void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2048
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2049
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
2050
+ }
2051
+
2052
+ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2053
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2054
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
2055
+ }
2056
+
2057
+ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
2058
+ const int64_t ne10 = src1->ne[0];
2059
+
2060
+ const int64_t ne0 = dst->ne[0];
2061
+ const int64_t ne1 = dst->ne[1];
2062
+
2063
+ // TODO: find the optimal values for these
2064
+ if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2065
+ src1->type == GGML_TYPE_F32 &&
2066
+ dst->type == GGML_TYPE_F32 &&
2067
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
2068
+ return true;
2069
+ }
2070
+
2071
+ return false;
2072
+ }
2073
+
2074
+ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
2075
+ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
2076
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
2077
+ GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
2078
+ GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
2079
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
2080
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
2081
+
2082
+ const int64_t ne00 = src0->ne[0];
2083
+ const int64_t ne01 = src0->ne[1];
2084
+ const int64_t ne02 = src0->ne[2];
2085
+
2086
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2087
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2088
+
2089
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2090
+ void * src0_ddq = src0_extra->data_device[g_main_device];
2091
+
2092
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2093
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
2094
+
2095
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2096
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2097
+
2098
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
2099
+
2100
+ CUDA_CHECK(cudaDeviceSynchronize());
2101
+ }
2102
+
2103
+ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
2104
+ GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
2105
+ GGML_ASSERT(!ggml_is_permuted(src0));
2106
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
2107
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
2108
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
2109
+
2110
+ const int64_t ne00 = src0->ne[0];
2111
+ const int64_t ne01 = src0->ne[1];
2112
+ const int64_t ne02 = src0->ne[2];
2113
+
2114
+ const int64_t nb01 = src0->nb[1];
2115
+ const int64_t nb02 = src0->nb[2];
2116
+
2117
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2118
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2119
+
2120
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2121
+ void * src0_ddq = src0_extra->data_device[g_main_device];
2122
+
2123
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2124
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
2125
+
2126
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2127
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2128
+
2129
+ const int row_stride_x = nb01 / sizeof(half);
2130
+ const int channel_stride_x = nb02 / sizeof(half);
2131
+
2132
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
2133
+
2134
+ CUDA_CHECK(cudaDeviceSynchronize());
2135
+ }
2136
+
2137
+ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2138
+ bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
2139
+ src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
2140
+
2141
+ if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
2142
+ ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
2143
+ } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
2144
+ ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
2145
+ } else if (src0->type == GGML_TYPE_F32) {
2146
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2147
+ } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
2148
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
2149
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
2150
+ } else {
2151
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2152
+ }
2153
+ } else {
2154
+ GGML_ASSERT(false);
2155
+ }
2156
+ }
2157
+
2158
+ void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2159
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2160
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
2161
+ }
2162
+
2163
+ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2164
+ const int64_t ne = ggml_nelements(src0);
2165
+ GGML_ASSERT(ne == ggml_nelements(src1));
2166
+
2167
+ GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2168
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
2169
+
2170
+ GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
2171
+ GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
2172
+
2173
+ const int64_t ne00 = src0->ne[0];
2174
+ const int64_t ne01 = src0->ne[1];
2175
+ GGML_ASSERT(src0->ne[3] == 1);
2176
+
2177
+ const int64_t nb00 = src0->nb[0];
2178
+ const int64_t nb01 = src0->nb[1];
2179
+ const int64_t nb02 = src0->nb[2];
2180
+
2181
+ const int64_t ne10 = src1->ne[0];
2182
+ const int64_t ne11 = src1->ne[1];
2183
+ GGML_ASSERT(src1->ne[3] == 1);
2184
+
2185
+ const int64_t nb10 = src1->nb[0];
2186
+ const int64_t nb11 = src1->nb[1];
2187
+ const int64_t nb12 = src1->nb[2];
2188
+
2189
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2190
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2191
+
2192
+ const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2193
+ const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2194
+
2195
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2196
+ char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
2197
+
2198
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
2199
+ ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
2200
+ ne10, ne11, nb10, nb11, nb12, cudaStream_main);
2201
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
2202
+ ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
2203
+ ne10, ne11, nb10, nb11, nb12, cudaStream_main);
2204
+ } else {
2205
+ GGML_ASSERT(false);
2206
+ }
2207
+
2208
+ CUDA_CHECK(cudaDeviceSynchronize());
2209
+
2210
+ (void) dst;
2211
+ }
2212
+
2213
+ void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2214
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2215
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
2216
+ }
2217
+
2218
+ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2219
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2220
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
2221
+ }
2222
+
2223
+ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2224
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2225
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
2226
+ }
2227
+
2228
+ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2229
+ (void) src0;
2230
+ (void) src1;
2231
+ (void) dst;
2232
+ }
2233
+
2234
+ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
2235
+ int nrows = ggml_nrows(tensor);
2236
+ const size_t nb1 = tensor->nb[1];
2237
+ ggml_backend backend = tensor->backend;
2238
+ struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
2239
+ memset(extra, 0, sizeof(*extra));
2240
+
2241
+ for (int id = 0; id < g_device_count; ++id) {
2242
+ if (backend == GGML_BACKEND_GPU && id != g_main_device) {
2243
+ continue;
2244
+ }
2245
+
2246
+ cudaSetDevice(id);
2247
+
2248
+ int row_low, row_high;
2249
+ if (backend == GGML_BACKEND_GPU) {
2250
+ row_low = 0;
2251
+ row_high = nrows;
2252
+ } else if (backend == GGML_BACKEND_GPU_SPLIT) {
2253
+ row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
2254
+ row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
2255
+ } else {
2256
+ GGML_ASSERT(false);
2257
+ }
2258
+ if (row_low == row_high) {
2259
+ continue;
2260
+ }
2261
+
2262
+ int64_t nrows_split = row_high - row_low;
2263
+
2264
+ const size_t offset_split = row_low*nb1;
2265
+ const size_t size = ggml_nbytes_split(tensor, nrows_split);
2266
+
2267
+ void * buf;
2268
+ CUDA_CHECK(cudaMalloc(&buf, size));
2269
+ void * buf_host = (char*)data + offset_split;
2270
+
2271
+ cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
2272
+
2273
+ extra->data_device[id] = buf;
2274
+ }
2275
+
2276
+ tensor->extra = extra;
2277
+ }
2278
+
2279
+ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
2280
+ if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
2281
+ return;
2282
+ }
2283
+
2284
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
2285
+
2286
+ for (int id = 0; id < g_device_count; ++id) {
2287
+ if (extra->data_device[id] == nullptr) {
2288
+ continue;
2289
+ }
2290
+
2291
+ CUDA_CHECK(cudaSetDevice(id));
2292
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
2293
+ }
2294
+
2295
+ delete extra;
2296
+ }
2297
+
2298
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2299
+ if (scratch && g_scratch_size == 0) {
2300
+ return;
2301
+ }
2302
+
2303
+ // recursively assign CUDA buffers until a compute tensor is found
2304
+ if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
2305
+ const ggml_op src0_op = tensor->src0->op;
2306
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
2307
+ ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
2308
+ }
2309
+ }
2310
+ if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
2311
+ ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
2312
+ }
2313
+
2314
+ tensor->backend = GGML_BACKEND_GPU;
2315
+ struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
2316
+
2317
+ const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
2318
+ tensor->op == GGML_OP_VIEW;
2319
+ const size_t size = ggml_nbytes(tensor);
2320
+
2321
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2322
+ if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
2323
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
2324
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2325
+ size_t offset = 0;
2326
+ if (tensor->op == GGML_OP_VIEW) {
2327
+ memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
2328
+ }
2329
+ extra->data_device[g_main_device] = src0_ddc + offset;
2330
+ } else if (tensor->op == GGML_OP_CPY) {
2331
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
2332
+ void * src1_ddv = src1_extra->data_device[g_main_device];
2333
+ extra->data_device[g_main_device] = src1_ddv;
2334
+ } else if (scratch) {
2335
+ GGML_ASSERT(size <= g_scratch_size);
2336
+ if (g_scratch_offset + size > g_scratch_size) {
2337
+ g_scratch_offset = 0;
2338
+ }
2339
+
2340
+ char * data = (char *) g_scratch_buffer;
2341
+ if (data == nullptr) {
2342
+ CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
2343
+ g_scratch_buffer = data;
2344
+ }
2345
+ extra->data_device[g_main_device] = data + g_scratch_offset;
2346
+
2347
+ g_scratch_offset += size;
2348
+
2349
+ GGML_ASSERT(g_scratch_offset <= g_scratch_size);
2350
+ } else { // allocate new buffers outside of scratch
2351
+ void * data;
2352
+ CUDA_CHECK(cudaMalloc(&data, size));
2353
+ CUDA_CHECK(cudaMemset(data, 0, size));
2354
+ extra->data_device[g_main_device] = data;
2355
+ }
2356
+
2357
+ tensor->extra = extra;
2358
+ }
2359
+
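// Hedged sketch of the scratch-buffer policy in ggml_cuda_assign_buffers_impl above:
// a single lazily allocated arena, a bump offset that wraps to 0 when the next tensor
// would not fit, relying on graph execution order so that wrapped-over regions are no
// longer needed. Plain host memory stands in for the cudaMalloc'ed buffer here.
#include <cassert>
#include <cstdio>
#include <cstdlib>

static char * g_scratch        = nullptr;
static size_t g_scratch_size   = 0;
static size_t g_scratch_offset = 0;

static void * scratch_alloc(size_t size) {
    assert(size <= g_scratch_size);
    if (g_scratch_offset + size > g_scratch_size) {
        g_scratch_offset = 0;                        // wrap around, like the CUDA version
    }
    if (g_scratch == nullptr) {
        g_scratch = (char *) malloc(g_scratch_size); // lazy allocation, mirrors cudaMalloc
    }
    void * ptr = g_scratch + g_scratch_offset;
    g_scratch_offset += size;
    assert(g_scratch_offset <= g_scratch_size);
    return ptr;
}

int main() {
    g_scratch_size = 256;
    void * a = scratch_alloc(200);
    void * b = scratch_alloc(100); // does not fit behind a -> wraps and aliases a
    printf("a=%p b=%p (b wrapped: %s)\n", a, b, a == b ? "yes" : "no");
    free(g_scratch);
    return 0;
}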
2360
+ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
2361
+ ggml_cuda_assign_buffers_impl(tensor, true);
2362
+ }
2363
+
2364
+ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
2365
+ ggml_cuda_assign_buffers_impl(tensor, false);
2366
+ }
2367
+
2368
+ void ggml_cuda_set_main_device(int main_device) {
2369
+ if (main_device >= g_device_count) {
2370
+ fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
2371
+ main_device, g_device_count, g_main_device);
2372
+ return;
2373
+ }
2374
+ g_main_device = main_device;
2375
+ if (g_device_count > 1) {
2376
+ cudaDeviceProp prop;
2377
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
2378
+ fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
2379
+ }
2380
+ }
2381
+
2382
+ void ggml_cuda_set_scratch_size(size_t scratch_size) {
2383
+ g_scratch_size = scratch_size;
2384
+ }
2385
+
2386
+ void ggml_cuda_free_scratch() {
2387
+ if (g_scratch_buffer == nullptr) {
2388
+ return;
2389
+ }
2390
+
2391
+ CUDA_CHECK(cudaFree(g_scratch_buffer));
2392
+ g_scratch_buffer = nullptr;
2393
+ }
2394
+
2395
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
2396
+ ggml_cuda_func_t func;
2397
+ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
2398
+ || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
2399
+ || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
2400
+
2401
+ switch (tensor->op) {
2402
+ case GGML_OP_ADD:
2403
+ if (!any_on_device) {
2404
+ return false;
2405
+ }
2406
+ func = ggml_cuda_add;
2407
+ break;
2408
+ case GGML_OP_MUL:
2409
+ if (!any_on_device) {
2410
+ return false;
2411
+ }
2412
+ func = ggml_cuda_mul;
2413
+ break;
2414
+ case GGML_OP_SILU:
2415
+ if (!any_on_device) {
2416
+ return false;
2417
+ }
2418
+ func = ggml_cuda_silu;
2419
+ break;
2420
+ case GGML_OP_RMS_NORM:
2421
+ if (!any_on_device) {
2422
+ return false;
2423
+ }
2424
+ func = ggml_cuda_rms_norm;
2425
+ break;
2426
+ case GGML_OP_MUL_MAT:
2427
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
2428
+ return false;
2429
+ }
2430
+ func = ggml_cuda_mul_mat;
2431
+ break;
2432
+ case GGML_OP_SCALE:
2433
+ if (!any_on_device) {
2434
+ return false;
2435
+ }
2436
+ func = ggml_cuda_scale;
2437
+ break;
2438
+ case GGML_OP_CPY:
2439
+ if (!any_on_device) {
2440
+ return false;
2441
+ }
2442
+ func = ggml_cuda_cpy;
2443
+ break;
2444
+ case GGML_OP_RESHAPE:
2445
+ case GGML_OP_VIEW:
2446
+ case GGML_OP_PERMUTE:
2447
+ case GGML_OP_TRANSPOSE:
2448
+ if (!any_on_device) {
2449
+ return false;
2450
+ }
2451
+ func = ggml_cuda_nop;
2452
+ break;
2453
+ case GGML_OP_DIAG_MASK_INF:
2454
+ if (!any_on_device) {
2455
+ return false;
2456
+ }
2457
+ func = ggml_cuda_diag_mask_inf;
2458
+ break;
2459
+ case GGML_OP_SOFT_MAX:
2460
+ if (!any_on_device) {
2461
+ return false;
2462
+ }
2463
+ func = ggml_cuda_soft_max;
2464
+ break;
2465
+ case GGML_OP_ROPE:
2466
+ if (!any_on_device) {
2467
+ return false;
2468
+ }
2469
+ func = ggml_cuda_rope;
2470
+ break;
2471
+ default:
2472
+ return false;
2473
+ }
2474
+
2475
+ if (params->ith != 0) {
2476
+ return true;
2477
+ }
2478
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
2479
+ return true;
2480
+ }
2481
+ func(tensor->src0, tensor->src1, tensor);
2482
+ return true;
2483
+ }