llama_cpp 0.2.0 → 0.2.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/examples/README.md +92 -0
- data/examples/chat.rb +195 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1218 -411
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +703 -514
- data/ext/llama_cpp/src/ggml-metal.metal +574 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +496 -36
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +2715 -476
- data/ext/llama_cpp/src/ggml.h +266 -11
- data/ext/llama_cpp/src/llama.cpp +266 -135
- data/ext/llama_cpp/src/llama.h +19 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +5 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -1,5 +1,6 @@
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <stdint.h>
 #include <stdio.h>
 #include <atomic>
@@ -12,6 +13,10 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 #define CUDA_CHECK(err) \
@@ -24,7 +29,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
-#if CUDART_VERSION >=
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
 do { \
     cublasStatus_t err_ = (err); \
@@ -45,9 +50,18 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11
 
-
+#ifdef GGML_CUDA_DMMV_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef float2 dfloat2;
+#endif //GGML_CUDA_DMMV_F16
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
@@ -151,7 +165,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
@@ -162,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define GGML_CUDA_DMMV_Y 1
 #endif
 
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
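The k-quant matrix-vector kernels added further down in this diff all finish the same way: each thread accumulates a private partial sum, a warp-wide butterfly reduction folds the 32 partial sums together with __shfl_xor_sync, and lane 0 writes the row result. A minimal, self-contained sketch of that pattern follows; the kernel name and launch shape here are illustrative only, not part of the library.

// Illustrative sketch only: one warp reduces 32 per-thread partial sums.
// Needs compute capability >= 3.0 and CUDA 9+ for __shfl_xor_sync.
__global__ void warp_sum_sketch(const float * x, float * out) {
    float tmp = x[threadIdx.x];                              // per-thread partial sum
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);   // butterfly exchange
    }
    if (threadIdx.x == 0) {
        *out = tmp;                                          // lane 0 now holds the warp total
    }
}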
@@ -219,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 }
 
-static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
-    const
+    const dfloat d = x[ib].d;
 
-    const
+    const int vui = x[ib].qs[iqs];
 
-
-
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
 
-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 8.0f) * d;
+    v.y = (v.y - 8.0f) * d;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const
-    const
+    const dfloat d = x[ib].d;
+    const dfloat m = x[ib].m;
 
-    const
+    const int vui = x[ib].qs[iqs];
 
-
-
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
 
-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_0 * x = (const block_q5_0 *) vx;
 
-    const
+    const dfloat d = x[ib].d;
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
 
-    const
-    const
+    const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
 
-
-
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >> 4)  | xh_1);
 
-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 16.0f) * d;
+    v.y = (v.y - 16.0f) * d;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const
-    const
+    const dfloat d = x[ib].d;
+    const dfloat m = x[ib].m;
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
 
-    const
-    const
+    const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
 
-
-
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >> 4)  | xh_1);
 
-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
-    const
+    const dfloat d = x[ib].d;
 
-
-
+    v.x = x[ib].qs[iqs + 0];
+    v.y = x[ib].qs[iqs + 1];
 
-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x *= d;
+    v.y *= d;
+#endif // GGML_CUDA_DMMV_F16
 }
 
 //================================== k-quants
@@ -321,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
|
321
368
|
|
322
369
|
}
|
323
370
|
|
324
|
-
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
325
|
-
|
326
|
-
const block_q2_K * x = (const block_q2_K *) vx;
|
327
|
-
|
328
|
-
// if n is 0, we want to do the lower 128, else the upper 128,
|
329
|
-
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
|
330
|
-
// y[l+16], y[l+48], y[l+80], y[l+112]
|
331
|
-
int n = iqs/128; // 0 or 1
|
332
|
-
int r = iqs - 128*n; // 0...120 in steps of 8
|
333
|
-
int l = r/8; // 0...15 in steps of 1
|
334
|
-
|
335
|
-
const float * y = yy + 128*n + l;
|
336
|
-
const uint8_t * q = x[ib].qs + 32*n + l;
|
337
|
-
const uint8_t * s = x[ib].scales + 8*n;
|
338
|
-
|
339
|
-
const float dall = x[ib].d;
|
340
|
-
const float dmin = x[ib].dmin;
|
341
|
-
|
342
|
-
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
|
343
|
-
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
|
344
|
-
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
|
345
|
-
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
|
346
|
-
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
|
347
|
-
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
|
348
|
-
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
|
349
|
-
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
|
350
|
-
|
351
|
-
result = sum;
|
352
|
-
|
353
|
-
}
|
354
|
-
|
355
371
|
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
356
372
|
|
357
373
|
int r = threadIdx.x/4;
|
@@ -383,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
|
383
399
|
|
384
400
|
}
|
385
401
|
|
386
|
-
static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
387
|
-
|
388
|
-
const block_q3_K * x = (const block_q3_K *) vx;
|
389
|
-
|
390
|
-
const uint32_t kmask1 = 0x03030303;
|
391
|
-
const uint32_t kmask2 = 0x0f0f0f0f;
|
392
|
-
|
393
|
-
uint32_t aux[3];
|
394
|
-
uint32_t utmp[4];
|
395
|
-
|
396
|
-
// if n is 0, we want to do the lower 128, else the upper 128,
|
397
|
-
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
|
398
|
-
// y[l+16], y[l+48], y[l+80], y[l+112]
|
399
|
-
int n = iqs/128; // 0 or 1
|
400
|
-
int r = iqs - 128*n; // 0...120 in steps of 8
|
401
|
-
int l = r/8; // 0...15 in steps of 1
|
402
|
-
|
403
|
-
const float * y = yy + 128*n + l;
|
404
|
-
const uint8_t * q = x[ib].qs + 32*n + l;
|
405
|
-
const uint8_t * hm = x[ib].hmask + l;
|
406
|
-
const int8_t * s = (const int8_t *)utmp + 8*n;
|
407
|
-
|
408
|
-
memcpy(aux, x[ib].scales, 12);
|
409
|
-
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
|
410
|
-
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
|
411
|
-
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
|
412
|
-
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
|
413
|
-
|
414
|
-
const float dall = x[ib].d;
|
415
|
-
|
416
|
-
const uint8_t m = 1 << (4*n);
|
417
|
-
|
418
|
-
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
|
419
|
-
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
|
420
|
-
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
|
421
|
-
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
|
422
|
-
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
|
423
|
-
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
|
424
|
-
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
|
425
|
-
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
|
426
|
-
|
427
|
-
result = sum * dall;
|
428
|
-
|
429
|
-
}
|
430
|
-
|
431
402
|
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
432
403
|
if (j < 4) {
|
433
404
|
d = q[j] & 63; m = q[j + 4] & 63;
|
@@ -474,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
|
474
445
|
}
|
475
446
|
}
|
476
447
|
|
477
|
-
static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
478
|
-
|
479
|
-
const block_q4_K * x = (const block_q4_K *) vx;
|
480
|
-
|
481
|
-
// iqs is in 0...248 in steps of 8 =>
|
482
|
-
const int j = iqs / 64; // j is in 0...3
|
483
|
-
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
|
484
|
-
const int is = 2*j; // is is in 0...6 in steps of 2
|
485
|
-
|
486
|
-
const float * y = yy + 64*j + ir;
|
487
|
-
const uint8_t * q = x[ib].qs + 32*j + ir;
|
488
|
-
|
489
|
-
const float dall = x[ib].d;
|
490
|
-
const float dmin = x[ib].dmin;
|
491
|
-
|
492
|
-
uint8_t sc, m;
|
493
|
-
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
|
494
|
-
const float d1 = dall * sc;
|
495
|
-
const float m1 = dmin * m;
|
496
|
-
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
|
497
|
-
const float d2 = dall * sc;
|
498
|
-
const float m2 = dmin * m;
|
499
|
-
|
500
|
-
float sum = 0;
|
501
|
-
for (int k = 0; k < 4; ++k) {
|
502
|
-
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
|
503
|
-
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
|
504
|
-
}
|
505
|
-
result = sum;
|
506
|
-
|
507
|
-
}
|
508
|
-
|
509
448
|
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
510
449
|
const block_q5_K * x = (const block_q5_K *) vx;
|
511
450
|
|
@@ -539,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
|
539
478
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
540
479
|
}
|
541
480
|
|
542
|
-
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
543
|
-
|
544
|
-
const block_q5_K * x = (const block_q5_K *) vx;
|
545
|
-
|
546
|
-
// iqs is in 0...248 in steps of 8 =>
|
547
|
-
const int j = iqs / 64; // j is in 0...3
|
548
|
-
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
|
549
|
-
const int is = 2*j; // is is in 0...6 in steps of 2
|
550
|
-
|
551
|
-
const float * y = yy + 64*j + ir;
|
552
|
-
const uint8_t * ql = x[ib].qs + 32*j + ir;
|
553
|
-
const uint8_t * qh = x[ib].qh + ir;
|
554
|
-
|
555
|
-
const float dall = x[ib].d;
|
556
|
-
const float dmin = x[ib].dmin;
|
557
|
-
|
558
|
-
uint8_t sc, m;
|
559
|
-
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
|
560
|
-
const float d1 = dall * sc;
|
561
|
-
const float m1 = dmin * m;
|
562
|
-
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
|
563
|
-
const float d2 = dall * sc;
|
564
|
-
const float m2 = dmin * m;
|
565
|
-
|
566
|
-
uint8_t hm = 1 << is;
|
567
|
-
float sum = 0;
|
568
|
-
for (int k = 0; k < 4; ++k) {
|
569
|
-
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
|
570
|
-
}
|
571
|
-
hm <<= 1;
|
572
|
-
for (int k = 0; k < 4; ++k) {
|
573
|
-
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
|
574
|
-
}
|
575
|
-
result = sum;
|
576
|
-
|
577
|
-
}
|
578
|
-
|
579
481
|
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
580
482
|
const block_q6_K * x = (const block_q6_K *) vx;
|
581
483
|
|
@@ -601,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
|
601
503
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
602
504
|
}
|
603
505
|
|
604
|
-
static
|
506
|
+
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
605
507
|
|
606
|
-
|
508
|
+
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
509
|
+
|
510
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
511
|
+
if (row > nrows) return;
|
512
|
+
|
513
|
+
const int num_blocks_per_row = ncols / QK_K;
|
514
|
+
const int ib0 = row*num_blocks_per_row;
|
515
|
+
|
516
|
+
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
517
|
+
|
518
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
519
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
520
|
+
|
521
|
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
522
|
+
|
523
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
524
|
+
const int in = tid - step*im; // 0...15 or 0...7
|
525
|
+
|
526
|
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
527
|
+
const int q_offset = 32*im + l0;
|
528
|
+
const int s_offset = 8*im;
|
529
|
+
const int y_offset = 128*im + l0;
|
530
|
+
|
531
|
+
float tmp = 0; // partial sum for thread in warp
|
532
|
+
|
533
|
+
uint32_t aux[4];
|
534
|
+
const uint8_t * d = (const uint8_t *)aux;
|
535
|
+
const uint8_t * m = (const uint8_t *)(aux + 2);
|
536
|
+
|
537
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
538
|
+
|
539
|
+
const float * y = yy + i * QK_K + y_offset;
|
540
|
+
const uint8_t * q = x[i].qs + q_offset;
|
541
|
+
|
542
|
+
const float dall = x[i].d;
|
543
|
+
const float dmin = x[i].dmin;
|
544
|
+
|
545
|
+
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
546
|
+
aux[0] = a[0] & 0x0f0f0f0f;
|
547
|
+
aux[1] = a[1] & 0x0f0f0f0f;
|
548
|
+
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
|
549
|
+
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
|
550
|
+
|
551
|
+
float sum1 = 0, sum2 = 0;
|
552
|
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
553
|
+
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
|
554
|
+
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
|
555
|
+
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
|
556
|
+
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
|
557
|
+
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
|
558
|
+
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
|
559
|
+
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
|
560
|
+
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
|
561
|
+
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
|
562
|
+
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
|
563
|
+
|
564
|
+
}
|
565
|
+
tmp += dall * sum1 - dmin * sum2;
|
566
|
+
|
567
|
+
}
|
568
|
+
|
569
|
+
// sum up partial sums and write back result
|
570
|
+
__syncthreads();
|
571
|
+
#pragma unroll
|
572
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
573
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
574
|
+
}
|
575
|
+
|
576
|
+
if (tid == 0) {
|
577
|
+
dst[row] = tmp;
|
578
|
+
}
|
579
|
+
}
|
580
|
+
|
581
|
+
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
582
|
+
|
583
|
+
const uint16_t kmask1 = 0x0303;
|
584
|
+
const uint16_t kmask2 = 0x0f0f;
|
585
|
+
|
586
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
587
|
+
if (row > nrows) return;
|
588
|
+
|
589
|
+
const int num_blocks_per_row = ncols / QK_K;
|
590
|
+
const int ib0 = row*num_blocks_per_row;
|
591
|
+
|
592
|
+
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
593
|
+
|
594
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
595
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
596
|
+
|
597
|
+
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
598
|
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
599
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
600
|
+
const int in = tid - step*im; // 0....15 or 0...7
|
601
|
+
|
602
|
+
const uint8_t m = 1 << (4*im);
|
603
|
+
|
604
|
+
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
605
|
+
const int q_offset = 32*im + l0;
|
606
|
+
const int y_offset = 128*im + l0;
|
607
|
+
|
608
|
+
uint16_t utmp[4];
|
609
|
+
const int8_t * s = (const int8_t *)utmp;
|
610
|
+
|
611
|
+
const uint16_t s_shift = 4*im;
|
612
|
+
|
613
|
+
float tmp = 0; // partial sum for thread in warp
|
614
|
+
|
615
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
616
|
+
|
617
|
+
const float * y = yy + i * QK_K + y_offset;
|
618
|
+
const uint8_t * q = x[i].qs + q_offset;
|
619
|
+
const uint8_t * h = x[i].hmask + l0;
|
620
|
+
|
621
|
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
622
|
+
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
|
623
|
+
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
|
624
|
+
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
|
625
|
+
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
|
626
|
+
|
627
|
+
const float d = x[i].d;
|
628
|
+
|
629
|
+
float sum = 0;
|
630
|
+
for (int l = 0; l < n; ++l) {
|
631
|
+
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
|
632
|
+
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
|
633
|
+
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
|
634
|
+
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
|
635
|
+
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
|
636
|
+
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
|
637
|
+
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
|
638
|
+
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
|
639
|
+
}
|
640
|
+
tmp += d * sum;
|
641
|
+
|
642
|
+
}
|
643
|
+
|
644
|
+
// sum up partial sums and write back result
|
645
|
+
__syncthreads();
|
646
|
+
#pragma unroll
|
647
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
648
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
649
|
+
}
|
650
|
+
|
651
|
+
if (tid == 0) {
|
652
|
+
dst[row] = tmp;
|
653
|
+
}
|
654
|
+
}
|
655
|
+
|
656
|
+
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
657
|
+
|
658
|
+
const uint16_t kmask1 = 0x3f3f;
|
659
|
+
const uint16_t kmask2 = 0x0f0f;
|
660
|
+
const uint16_t kmask3 = 0xc0c0;
|
661
|
+
|
662
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
663
|
+
if (row > nrows) return;
|
664
|
+
const int num_blocks_per_row = ncols / QK_K;
|
665
|
+
const int ib0 = row*num_blocks_per_row;
|
666
|
+
|
667
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
668
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
669
|
+
|
670
|
+
const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
|
671
|
+
|
672
|
+
const int il = tid/step; // 0...3
|
673
|
+
const int ir = tid - step*il; // 0...7 or 0...3
|
674
|
+
const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
|
675
|
+
|
676
|
+
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
677
|
+
const int in = il%2;
|
678
|
+
|
679
|
+
const int l0 = n*(2*ir + in);
|
680
|
+
const int q_offset = 32*im + l0;
|
681
|
+
const int y_offset = 64*im + l0;
|
682
|
+
|
683
|
+
uint16_t aux[4];
|
684
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
685
|
+
|
686
|
+
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
687
|
+
|
688
|
+
float tmp = 0; // partial sum for thread in warp
|
689
|
+
|
690
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
691
|
+
|
692
|
+
const uint8_t * q1 = x[i].qs + q_offset;
|
693
|
+
const uint8_t * q2 = q1 + 64;
|
694
|
+
const float * y1 = yy + i*QK_K + y_offset;
|
695
|
+
const float * y2 = y1 + 128;
|
696
|
+
|
697
|
+
const float dall = x[i].d;
|
698
|
+
const float dmin = x[i].dmin;
|
699
|
+
|
700
|
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
701
|
+
aux[0] = a[im+0] & kmask1;
|
702
|
+
aux[1] = a[im+2] & kmask1;
|
703
|
+
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
704
|
+
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
705
|
+
|
706
|
+
float4 s = {0.f, 0.f, 0.f, 0.f};
|
707
|
+
float smin = 0;
|
708
|
+
for (int l = 0; l < n; ++l) {
|
709
|
+
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
|
710
|
+
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
|
711
|
+
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
712
|
+
}
|
713
|
+
tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
|
714
|
+
|
715
|
+
}
|
716
|
+
|
717
|
+
// sum up partial sums and write back result
|
718
|
+
__syncthreads();
|
719
|
+
#pragma unroll
|
720
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
721
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
722
|
+
}
|
723
|
+
|
724
|
+
if (tid == 0) {
|
725
|
+
dst[row] = tmp;
|
726
|
+
}
|
727
|
+
}
|
728
|
+
|
729
|
+
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
|
730
|
+
|
731
|
+
const uint16_t kmask1 = 0x3f3f;
|
732
|
+
const uint16_t kmask2 = 0x0f0f;
|
733
|
+
const uint16_t kmask3 = 0xc0c0;
|
734
|
+
|
735
|
+
//const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
736
|
+
const int row = blockIdx.x;
|
737
|
+
const int num_blocks_per_row = ncols / QK_K;
|
738
|
+
const int ib0 = row*num_blocks_per_row;
|
739
|
+
|
740
|
+
const int tid = threadIdx.x/2; // 0...15
|
741
|
+
const int ix = threadIdx.x%2;
|
742
|
+
|
743
|
+
const int il = tid/4; // 0...3
|
744
|
+
const int ir = tid - 4*il;// 0...3
|
745
|
+
const int n = 2;
|
607
746
|
|
608
|
-
const int
|
609
|
-
const int
|
610
|
-
const int is = 8*ip;
|
747
|
+
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
748
|
+
const int in = il%2;
|
611
749
|
|
612
|
-
const
|
750
|
+
const int l0 = n*(2*ir + in);
|
751
|
+
const int q_offset = 32*im + l0;
|
752
|
+
const int y_offset = 64*im + l0;
|
613
753
|
|
614
|
-
const
|
754
|
+
const uint8_t hm1 = 1 << (2*im);
|
755
|
+
const uint8_t hm2 = hm1 << 4;
|
615
756
|
|
616
|
-
|
617
|
-
const uint8_t *
|
618
|
-
const int8_t * sc = x[ib].scales + is;
|
757
|
+
uint16_t aux[4];
|
758
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
619
759
|
|
620
|
-
|
621
|
-
+ y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
|
622
|
-
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
|
623
|
-
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
|
624
|
-
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
|
625
|
-
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
|
626
|
-
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
|
627
|
-
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
|
760
|
+
const block_q5_K * x = (const block_q5_K *)vx + ib0;
|
628
761
|
|
762
|
+
float tmp = 0; // partial sum for thread in warp
|
763
|
+
|
764
|
+
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
765
|
+
|
766
|
+
const uint8_t * ql1 = x[i].qs + q_offset;
|
767
|
+
const uint8_t * ql2 = ql1 + 64;
|
768
|
+
const uint8_t * qh = x[i].qh + l0;
|
769
|
+
const float * y1 = yy + i*QK_K + y_offset;
|
770
|
+
const float * y2 = y1 + 128;
|
771
|
+
|
772
|
+
const float dall = x[i].d;
|
773
|
+
const float dmin = x[i].dmin;
|
774
|
+
|
775
|
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
776
|
+
aux[0] = a[im+0] & kmask1;
|
777
|
+
aux[1] = a[im+2] & kmask1;
|
778
|
+
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
779
|
+
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
780
|
+
|
781
|
+
float4 sum = {0.f, 0.f, 0.f, 0.f};
|
782
|
+
float smin = 0;
|
783
|
+
for (int l = 0; l < n; ++l) {
|
784
|
+
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
|
785
|
+
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
|
786
|
+
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
|
787
|
+
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
|
788
|
+
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
|
789
|
+
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
|
790
|
+
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
|
791
|
+
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
|
792
|
+
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
|
793
|
+
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
794
|
+
}
|
795
|
+
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
796
|
+
|
797
|
+
}
|
798
|
+
|
799
|
+
// sum up partial sums and write back result
|
800
|
+
__syncthreads();
|
801
|
+
#pragma unroll
|
802
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
803
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
804
|
+
}
|
805
|
+
|
806
|
+
if (tid == 0) {
|
807
|
+
dst[row] = tmp;
|
808
|
+
}
|
629
809
|
}
|
630
810
|
|
631
|
-
static
|
811
|
+
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
812
|
+
|
813
|
+
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
814
|
+
|
815
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
816
|
+
if (row > nrows) return;
|
817
|
+
|
818
|
+
const int num_blocks_per_row = ncols / QK_K;
|
819
|
+
const int ib0 = row*num_blocks_per_row;
|
820
|
+
|
821
|
+
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
822
|
+
|
823
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
824
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
825
|
+
|
826
|
+
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
827
|
+
|
828
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
829
|
+
const int in = tid - step*im; // 0...15 or 0...7
|
830
|
+
|
831
|
+
#if K_QUANTS_PER_ITERATION == 1
|
832
|
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
833
|
+
const int is = 0;
|
834
|
+
#else
|
835
|
+
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
836
|
+
const int is = in / 4;
|
837
|
+
#endif
|
838
|
+
const int ql_offset = 64*im + l0;
|
839
|
+
const int qh_offset = 32*im + l0;
|
840
|
+
const int s_offset = 8*im + is;
|
841
|
+
const int y_offset = 128*im + l0;
|
842
|
+
|
843
|
+
float tmp = 0; // partial sum for thread in warp
|
844
|
+
|
845
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
846
|
+
|
847
|
+
const float * y = yy + i * QK_K + y_offset;
|
848
|
+
const uint8_t * ql = x[i].ql + ql_offset;
|
849
|
+
const uint8_t * qh = x[i].qh + qh_offset;
|
850
|
+
const int8_t * s = x[i].scales + s_offset;
|
851
|
+
|
852
|
+
const float d = x[i].d;
|
853
|
+
|
854
|
+
#if K_QUANTS_PER_ITERATION == 1
|
855
|
+
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
856
|
+
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
857
|
+
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
858
|
+
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
|
859
|
+
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
|
860
|
+
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
|
861
|
+
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
862
|
+
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
863
|
+
tmp += sum;
|
864
|
+
#else
|
865
|
+
float sum = 0;
|
866
|
+
for (int l = 0; l < 4; ++l) {
|
867
|
+
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
868
|
+
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
|
869
|
+
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
|
870
|
+
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
871
|
+
}
|
872
|
+
tmp += sum;
|
873
|
+
#endif
|
874
|
+
|
875
|
+
}
|
876
|
+
|
877
|
+
// sum up partial sums and write back result
|
878
|
+
__syncthreads();
|
879
|
+
#pragma unroll
|
880
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
881
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
882
|
+
}
|
883
|
+
|
884
|
+
if (tid == 0) {
|
885
|
+
dst[row] = tmp;
|
886
|
+
}
|
887
|
+
}
|
888
|
+
|
889
|
+
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
632
890
|
const half * x = (const half *) vx;
|
633
891
|
|
634
|
-
|
635
|
-
|
892
|
+
// automatic half -> float type cast if dfloat == float
|
893
|
+
v.x = x[ib + iqs + 0];
|
894
|
+
v.y = x[ib + iqs + 1];
|
636
895
|
}
|
637
896
|
|
638
897
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
@@ -649,23 +908,35 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     const int y_offset = qr == 1 ? 1 : qk/2;
 
     // dequantize
-
-
-
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0]        = v.x;
+    y[iybs + iqs + y_offset] = v.y;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const
+static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
     const int tid = threadIdx.x;
 
     const int iter_stride = 2*GGML_CUDA_DMMV_X;
     const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
     const int y_offset = qr == 1 ? 1 : qk/2;
 
-
+    // partial sum for each thread
+#ifdef GGML_CUDA_DMMV_F16
+    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_CUDA_DMMV_F16
 
     for (int i = 0; i < ncols; i += iter_stride) {
         const int col = i + vals_per_iter*tid;
@@ -679,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
         // process 2 vals per j iter
 
             // dequantize
-            float v0, v1;
-            dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
             // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);
 
             // matrix multiplication
-            tmp += v0 * y[iybs + iqs + j/qr + 0];
-            tmp += v1 * y[iybs + iqs + j/qr + y_offset];
             // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_CUDA_DMMV_F16
+            tmp += __hmul2(v, {
+                y[iybs + iqs + j/qr + 0],
+                y[iybs + iqs + j/qr + y_offset]
+            });
+#else
+            tmp += v.x * y[iybs + iqs + j/qr + 0];
+            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+#endif // GGML_CUDA_DMMV_F16
         }
     }
 
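For context on the GGML_CUDA_DMMV_F16 branch above: the accumulator is a half2 so each loop step fuses two half-precision multiply-adds, and the two packed sums are only collapsed to a single float when the result is written back. A hedged sketch of that idea (the function name and arguments are illustrative, not the library's API):

#include <cuda_fp16.h>

// Illustrative sketch only: accumulate pairs of products in a half2, then
// collapse the two packed sums to one float at the end (half arithmetic
// intrinsics need compute capability >= 5.3).
__device__ float dot_pairs_f16_sketch(const half2 * a, const half2 * b, const int n2) {
    half2 tmp = __float2half2_rn(0.0f);
    for (int i = 0; i < n2; ++i) {
        tmp = __hadd2(tmp, __hmul2(a[i], b[i]));  // two multiply-adds per step
    }
    return __half2float(__low2half(tmp)) + __half2float(__high2half(tmp));
}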
@@ -698,64 +976,232 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
|
698
976
|
}
|
699
977
|
|
700
978
|
if (tid == 0) {
|
979
|
+
#ifdef GGML_CUDA_DMMV_F16
|
980
|
+
dst[row] = tmp.x + tmp.y;
|
981
|
+
#else
|
701
982
|
dst[row] = tmp;
|
983
|
+
#endif // GGML_CUDA_DMMV_F16
|
702
984
|
}
|
703
985
|
}
|
704
986
|
|
705
|
-
|
706
|
-
|
707
|
-
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
708
|
-
const int tid = threadIdx.x;
|
987
|
+
static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
|
988
|
+
const half * x = (half *) vx;
|
709
989
|
|
710
|
-
const int
|
711
|
-
const int
|
712
|
-
const int num_blocks_per_row = ncols / QK_K;
|
713
|
-
const int ib0 = row*num_blocks_per_row;
|
990
|
+
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
991
|
+
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
714
992
|
|
715
|
-
|
993
|
+
const int nrows_y = ncols_x;
|
994
|
+
const int nrows_dst = nrows_x;
|
995
|
+
const int row_dst = row_x;
|
716
996
|
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
const int
|
721
|
-
|
997
|
+
float tmp = 0.0f;
|
998
|
+
|
999
|
+
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
1000
|
+
const int col_x = col_x0 + threadIdx.x;
|
1001
|
+
|
1002
|
+
if (col_x >= ncols_x) {
|
1003
|
+
break;
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
// x is transposed and permuted
|
1007
|
+
const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
|
1008
|
+
const float xi = __half2float(x[ix]);
|
1009
|
+
|
1010
|
+
const int row_y = col_x;
|
1011
|
+
|
1012
|
+
|
1013
|
+
// y is not transposed but permuted
|
1014
|
+
const int iy = channel*nrows_y + row_y;
|
1015
|
+
|
1016
|
+
tmp += xi * y[iy];
|
1017
|
+
}
|
1018
|
+
|
1019
|
+
// dst is not transposed and not permuted
|
1020
|
+
const int idst = channel*nrows_dst + row_dst;
|
1021
|
+
|
1022
|
+
// sum up partial sums and write back result
|
1023
|
+
__syncthreads();
|
1024
|
+
#pragma unroll
|
1025
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
1026
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
1027
|
+
}
|
1028
|
+
|
1029
|
+
if (threadIdx.x == 0) {
|
1030
|
+
dst[idst] = tmp;
|
1031
|
+
}
|
1032
|
+
}
|
1033
|
+
|
1034
|
+
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
1035
|
+
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
1036
|
+
const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
|
1037
|
+
|
1038
|
+
const half * x = (half *) vx;
|
1039
|
+
|
1040
|
+
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
1041
|
+
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
1042
|
+
|
1043
|
+
const int nrows_y = ncols_x;
|
1044
|
+
const int nrows_dst = nrows_x;
|
1045
|
+
const int row_dst = row_x;
|
1046
|
+
|
1047
|
+
const int idst = channel*nrows_dst + row_dst;
|
1048
|
+
|
1049
|
+
float tmp = 0.0f;
|
1050
|
+
|
1051
|
+
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
1052
|
+
const int col_x = col_x0 + threadIdx.x;
|
1053
|
+
|
1054
|
+
if (col_x >= ncols_x) {
|
1055
|
+
break;
|
1056
|
+
}
|
1057
|
+
|
1058
|
+
const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
|
1059
|
+
const float xi = __half2float(x[ix]);
|
1060
|
+
|
1061
|
+
const int row_y = col_x;
|
1062
|
+
|
1063
|
+
const int iy = channel*nrows_y + row_y;
|
1064
|
+
|
1065
|
+
tmp += xi * y[iy];
|
1066
|
+
}
|
1067
|
+
|
1068
|
+
// sum up partial sums and write back result
|
1069
|
+
__syncthreads();
|
1070
|
+
#pragma unroll
|
1071
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
1072
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
1073
|
+
}
|
1074
|
+
|
1075
|
+
if (threadIdx.x == 0) {
|
1076
|
+
dst[idst] = tmp;
|
1077
|
+
}
|
1078
|
+
}
|
1079
|
+
|
1080
|
+
static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
1081
|
+
const float * xi = (float *) cxi;
|
1082
|
+
float * dsti = (float *) cdsti;
|
1083
|
+
|
1084
|
+
*dsti = *xi;
|
1085
|
+
}
|
1086
|
+
|
1087
|
+
static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
|
1088
|
+
const float * xi = (float *) cxi;
|
1089
|
+
half * dsti = (half *) cdsti;
|
1090
|
+
|
1091
|
+
*dsti = __float2half(*xi);
|
1092
|
+
}
|
1093
|
+
|
1094
|
+
template <cpy_kernel_t cpy_1>
|
1095
|
+
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
1096
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
1097
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
1098
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
1099
|
+
|
1100
|
+
if (i >= ne) {
|
1101
|
+
return;
|
1102
|
+
}
|
1103
|
+
|
1104
|
+
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
1105
|
+
// then combine those indices with the corresponding byte offsets to get the total offsets
|
1106
|
+
const int i02 = i / (ne00*ne01);
|
1107
|
+
const int i01 = (i - i02*ne01*ne00) / ne00;
|
1108
|
+
const int i00 = i - i02*ne01*ne00 - i01*ne00;
|
1109
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
1110
|
+
|
1111
|
+
const int i12 = i / (ne10*ne11);
|
1112
|
+
const int i11 = (i - i12*ne10*ne11) / ne10;
|
1113
|
+
const int i10 = i - i12*ne10*ne11 - i11*ne10;
|
1114
|
+
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
1115
|
+
|
1116
|
+
cpy_1(cx + x_offset, cdst + dst_offset);
|
1117
|
+
}
|
1118
|
+
|
1119
|
+
// rope == RoPE == rotary positional embedding
|
1120
|
+
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
|
1121
|
+
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
1122
|
+
|
1123
|
+
if (col >= ncols) {
|
1124
|
+
return;
|
1125
|
+
}
|
1126
|
+
|
1127
|
+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
1128
|
+
const int i = row*ncols + col;
|
1129
|
+
|
1130
|
+
const float theta = p*powf(theta_scale, col/2);
|
1131
|
+
const float sin_theta = sinf(theta);
|
1132
|
+
const float cos_theta = cosf(theta);
|
1133
|
+
|
1134
|
+
const float x0 = x[i + 0];
|
1135
|
+
const float x1 = x[i + 1];
|
1136
|
+
|
1137
|
+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
1138
|
+
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
1139
|
+
}
|
1140
|
+
|
1141
|
+
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
1142
|
+
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
1143
|
+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
1144
|
+
|
1145
|
+
if (col >= ncols) {
|
1146
|
+
return;
|
1147
|
+
}
|
1148
|
+
|
1149
|
+
const int i = row*ncols + col;
|
1150
|
+
// dst[i] = col > n_past + row ? -INFINITY : x[i];
|
1151
|
+
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
|
1152
|
+
}
|
1153
|
+
|
1154
|
+
// the CUDA soft max implementation differs from the CPU implementation
|
1155
|
+
// instead of doubles floats are used
|
1156
|
+
// values are also not normalized to the maximum value by subtracting it in the exponential function
|
1157
|
+
// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
|
1158
|
+
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
|
1159
|
+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
1160
|
+
const int block_size = blockDim.x;
|
1161
|
+
const int tid = threadIdx.x;
|
1162
|
+
|
1163
|
+
float tmp = 0.0;
|
1164
|
+
|
1165
|
+
for (int block_start = 0; block_start < ncols; block_start += block_size) {
|
1166
|
+
const int col = block_start + tid;
|
722
1167
|
|
723
|
-
|
724
|
-
|
725
|
-
|
1168
|
+
if (col >= ncols) {
|
1169
|
+
break;
|
1170
|
+
}
|
1171
|
+
|
1172
|
+
const int i = row*ncols + col;
|
1173
|
+
const float val = expf(x[i]);
|
1174
|
+
tmp += val;
|
1175
|
+
dst[i] = val;
|
726
1176
|
}
|
727
1177
|
|
728
|
-
// sum up partial sums
|
1178
|
+
// sum up partial sums
|
729
1179
|
__syncthreads();
|
730
1180
|
#pragma unroll
|
731
1181
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
732
1182
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
733
1183
|
}
|
734
1184
|
|
735
|
-
|
736
|
-
|
1185
|
+
for (int block_start = 0; block_start < ncols; block_start += block_size) {
|
1186
|
+
const int col = block_start + tid;
|
1187
|
+
|
1188
|
+
if (col >= ncols) {
|
1189
|
+
break;
|
1190
|
+
}
|
1191
|
+
|
1192
|
+
const int i = row*ncols + col;
|
1193
|
+
dst[i] /= tmp;
|
737
1194
|
}
|
738
1195
|
}
|
739
1196
|
|
740
|
-
static __global__ void
|
741
|
-
const int
|
1197
|
+
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
|
1198
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
742
1199
|
|
743
|
-
if (
|
1200
|
+
if (i >= k) {
|
744
1201
|
return;
|
745
1202
|
}
|
746
1203
|
|
747
|
-
|
748
|
-
const int i = row*ncols + col;
|
749
|
-
|
750
|
-
const float theta = p*powf(theta_scale, col/2);
|
751
|
-
const float sin_theta = sinf(theta);
|
752
|
-
const float cos_theta = cosf(theta);
|
753
|
-
|
754
|
-
const float x0 = x[i + 0];
|
755
|
-
const float x1 = x[i + 1];
|
756
|
-
|
757
|
-
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
758
|
-
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
1204
|
+
dst[i] = scale * x[i];
|
759
1205
|
}
|
760
1206
|
|
761
1207
|
static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
|
@@ -829,75 +1275,91 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
 }
 
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-
+    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(32,
-
+    const dim3 block_dims(32, 1, 1);
+    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }
 
 static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -905,12 +1367,13 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
@@ -942,6 +1405,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
+static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
+    const dim3 block_nums(1, nrows_x, nchannels_x);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
+    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_x);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+}
+
+static void ggml_cpy_f32_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
+}
+
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
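The ggml_cpy_*_cuda helpers above launch a kernel that flattens the copy into a single element index per thread and then rebuilds the per-dimension indices and byte offsets from it. A hedged host-side sketch of that index arithmetic (the helper name is illustrative):

// Illustrative sketch only: decompose a flat element index i into (i02, i01, i00)
// and turn it into a byte offset using the tensor strides nb00/nb01/nb02,
// mirroring what each thread of the cpy kernel launched above computes.
static int flat_index_to_byte_offset(const int i, const int ne00, const int ne01,
                                     const int nb00, const int nb01, const int nb02) {
    const int i02 = i / (ne00*ne01);
    const int i01 = (i - i02*ne01*ne00) / ne00;
    const int i00 = i - i02*ne01*ne00 - i01*ne00;
    return i00*nb00 + i01*nb01 + i02*nb02;
}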
@@ -950,6 +1454,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, nrows_x, 1);
+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+}
+
+static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(1, nrows_x, 1);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
@@ -1018,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
|
|
1018
1535
|
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
1019
1536
|
static size_t g_scratch_offset = 0;
|
1020
1537
|
|
1021
|
-
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
|
1022
|
-
#define GGML_CUDA_MAX_EVENTS 64
|
1023
|
-
|
1024
1538
|
static int g_device_count = -1;
|
1025
1539
|
static int g_main_device = 0;
|
1026
1540
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
1027
1541
|
|
1028
1542
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1029
1543
|
|
1030
|
-
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES]
|
1031
|
-
|
1032
|
-
static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
|
1033
|
-
static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
|
1544
|
+
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
1034
1545
|
|
1035
1546
|
void ggml_init_cublas() {
|
1036
1547
|
static bool initialized = false;
|
@@ -1054,15 +1565,8 @@ void ggml_init_cublas() {
|
|
1054
1565
|
for (int id = 0; id < g_device_count; ++id) {
|
1055
1566
|
CUDA_CHECK(cudaSetDevice(id));
|
1056
1567
|
|
1057
|
-
// create
|
1058
|
-
|
1059
|
-
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
|
1060
|
-
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
|
1061
|
-
}
|
1062
|
-
// create events
|
1063
|
-
for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
|
1064
|
-
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
|
1065
|
-
}
|
1568
|
+
// create main stream
|
1569
|
+
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
|
1066
1570
|
|
1067
1571
|
// create cublas handle
|
1068
1572
|
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
|
@@ -1105,6 +1609,9 @@ void * ggml_cuda_host_malloc(size_t size) {
|
|
1105
1609
|
void * ptr = nullptr;
|
1106
1610
|
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
1107
1611
|
if (err != cudaSuccess) {
|
1612
|
+
// The allocation error can be bypassed. A null ptr will be assigned outside of this function.
|
1613
|
+
// This can fix the OOM error in WSL.
|
1614
|
+
cudaGetLastError();
|
1108
1615
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
1109
1616
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
1110
1617
|
return nullptr;
|
@@ -1117,10 +1624,25 @@ void ggml_cuda_host_free(void * ptr) {
|
|
1117
1624
|
CUDA_CHECK(cudaFreeHost(ptr));
|
1118
1625
|
}
|
1119
1626
|
|
1120
|
-
static cudaError_t
|
1627
|
+
static cudaError_t ggml_cuda_cpy_tensor_2d(
|
1121
1628
|
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
|
1122
1629
|
|
1123
|
-
|
1630
|
+
cudaMemcpyKind kind;
|
1631
|
+
char * src_ptr;
|
1632
|
+
if (src->backend == GGML_BACKEND_CPU) {
|
1633
|
+
kind = cudaMemcpyHostToDevice;
|
1634
|
+
src_ptr = (char *) src->data;
|
1635
|
+
} else if (src->backend == GGML_BACKEND_GPU) {
|
1636
|
+
kind = cudaMemcpyDeviceToDevice;
|
1637
|
+
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
1638
|
+
int id;
|
1639
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
1640
|
+
src_ptr = (char *) extra->data_device[id];
|
1641
|
+
} else {
|
1642
|
+
GGML_ASSERT(false);
|
1643
|
+
}
|
1644
|
+
char * dst_ptr = (char *) dst;
|
1645
|
+
|
1124
1646
|
const int64_t ne0 = src->ne[0];
|
1125
1647
|
const int64_t nb0 = src->nb[0];
|
1126
1648
|
const int64_t nb1 = src->nb[1];
|
@@ -1131,17 +1653,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(
|
|
1131
1653
|
const int64_t bs = ggml_blck_size(type);
|
1132
1654
|
int64_t i1_diff = i1_high - i1_low;
|
1133
1655
|
|
1134
|
-
const
|
1656
|
+
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
1135
1657
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
1136
|
-
return cudaMemcpyAsync(
|
1658
|
+
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
|
1137
1659
|
} else if (nb0 == ts) {
|
1138
|
-
return cudaMemcpy2DAsync(
|
1660
|
+
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
|
1139
1661
|
} else {
|
1140
1662
|
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
1141
1663
|
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
1142
|
-
void * rd = (void *) (
|
1664
|
+
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
1143
1665
|
// pretend the row is a matrix with cols=1
|
1144
|
-
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0,
|
1666
|
+
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
|
1145
1667
|
if (r != cudaSuccess) return r;
|
1146
1668
|
}
|
1147
1669
|
return cudaSuccess;
|
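ggml_cuda_cpy_tensor_2d now dispatches between three copy shapes: fully contiguous data (one cudaMemcpyAsync), rows that are internally contiguous but padded between rows (one cudaMemcpy2DAsync), and fully strided data (one 2D copy per row). A small numeric illustration of the middle case, with values of our own choosing:

    // 4 rows of 8 floats: packed row = 8 * sizeof(float) = 32 bytes, source rows padded to nb1 = 48 bytes
    // cudaMemcpy2DAsync(dst_ptr, 32 /*dst pitch*/, x, 48 /*src pitch*/, 32 /*width*/, 4 /*height*/, kind, stream);
    // copies 4 rows of 32 bytes each into a densely packed destination buffer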
@@ -1260,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1260
1782
|
const int64_t ne00 = src0->ne[0];
|
1261
1783
|
const int64_t nrows = i01_high - i01_low;
|
1262
1784
|
|
1785
|
+
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
1786
|
+
#ifdef GGML_CUDA_DMMV_F16
|
1787
|
+
size_t ash;
|
1788
|
+
dfloat * src1_dfloat = nullptr; // dfloat == half
|
1789
|
+
|
1790
|
+
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
1791
|
+
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
1792
|
+
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
1793
|
+
|
1794
|
+
if (src1_convert_f16) {
|
1795
|
+
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
1796
|
+
ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
1797
|
+
ne00, 1, sizeof(float), 0, 0,
|
1798
|
+
ne00, 1, sizeof(half), 0, 0, cudaStream_main);
|
1799
|
+
}
|
1800
|
+
#else
|
1801
|
+
dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
|
1802
|
+
#endif // GGML_CUDA_DMMV_F16
|
1803
|
+
|
1263
1804
|
switch (src0->type) {
|
1264
1805
|
case GGML_TYPE_Q4_0:
|
1265
|
-
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i,
|
1806
|
+
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1266
1807
|
break;
|
1267
1808
|
case GGML_TYPE_Q4_1:
|
1268
|
-
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i,
|
1809
|
+
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1269
1810
|
break;
|
1270
1811
|
case GGML_TYPE_Q5_0:
|
1271
|
-
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i,
|
1812
|
+
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1272
1813
|
break;
|
1273
1814
|
case GGML_TYPE_Q5_1:
|
1274
|
-
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i,
|
1815
|
+
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1275
1816
|
break;
|
1276
1817
|
case GGML_TYPE_Q8_0:
|
1277
|
-
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i,
|
1818
|
+
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1278
1819
|
break;
|
1279
1820
|
case GGML_TYPE_Q2_K:
|
1280
1821
|
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
@@ -1292,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1292
1833
|
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1293
1834
|
break;
|
1294
1835
|
case GGML_TYPE_F16:
|
1295
|
-
convert_mul_mat_vec_f16_cuda(src0_ddq_i,
|
1836
|
+
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1296
1837
|
break;
|
1297
1838
|
default:
|
1298
1839
|
GGML_ASSERT(false);
|
@@ -1300,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1300
1841
|
}
|
1301
1842
|
CUDA_CHECK(cudaGetLastError());
|
1302
1843
|
|
1844
|
+
#ifdef GGML_CUDA_DMMV_F16
|
1845
|
+
if (src1_convert_f16) {
|
1846
|
+
ggml_cuda_pool_free(src1_dfloat, ash);
|
1847
|
+
}
|
1848
|
+
#endif // GGML_CUDA_DMMV_F16
|
1849
|
+
|
1303
1850
|
(void) src1;
|
1304
1851
|
(void) dst;
|
1305
1852
|
(void) src0_ddf_i;
|
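When GGML_CUDA_DMMV_F16 is defined, the dequantize-mul-mat-vec path converts the ne00-element src1 row from f32 to f16 (for the non-k-quant source types) through the generic copy kernel above, and frees the temporary buffer after the launch. A hedged sketch of an equivalent standalone conversion kernel; the name f32_to_f16_row is ours, not part of the diff:

    #include <cuda_fp16.h>

    // convert n consecutive floats to half precision, one element per thread
    __global__ void f32_to_f16_row(const float * x, half * y, const int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) {
            y[i] = __float2half(x[i]);
        }
    }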
@@ -1377,8 +1924,81 @@ inline void ggml_cuda_op_rope(
|
|
1377
1924
|
(void) i1;
|
1378
1925
|
}
|
1379
1926
|
|
1927
|
+
inline void ggml_cuda_op_diag_mask_inf(
|
1928
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
1929
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
1930
|
+
cudaStream_t & cudaStream_main){
|
1931
|
+
|
1932
|
+
GGML_ASSERT(src0_ddf_i != nullptr);
|
1933
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
1934
|
+
|
1935
|
+
const int64_t ne00 = src0->ne[0];
|
1936
|
+
const int64_t ne01 = src0->ne[1];
|
1937
|
+
const int64_t i01_diff = i01_high - i01_low;
|
1938
|
+
|
1939
|
+
const int n_past = ((int32_t *) src1->data)[0];
|
1940
|
+
|
1941
|
+
// compute
|
1942
|
+
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
|
1943
|
+
CUDA_CHECK(cudaGetLastError());
|
1944
|
+
|
1945
|
+
(void) dst;
|
1946
|
+
(void) src0_ddq_i;
|
1947
|
+
(void) src1_ddf_i;
|
1948
|
+
(void) i02;
|
1949
|
+
(void) i1;
|
1950
|
+
}
|
1951
|
+
|
1952
|
+
inline void ggml_cuda_op_soft_max(
|
1953
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
1954
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
1955
|
+
cudaStream_t & cudaStream_main){
|
1956
|
+
|
1957
|
+
GGML_ASSERT(src0_ddf_i != nullptr);
|
1958
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
1959
|
+
|
1960
|
+
const int64_t ne00 = src0->ne[0];
|
1961
|
+
const int64_t i01_diff = i01_high - i01_low;
|
1962
|
+
|
1963
|
+
// compute
|
1964
|
+
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
1965
|
+
CUDA_CHECK(cudaGetLastError());
|
1966
|
+
|
1967
|
+
(void) src1;
|
1968
|
+
(void) dst;
|
1969
|
+
(void) src0_ddq_i;
|
1970
|
+
(void) src1_ddf_i;
|
1971
|
+
(void) i02;
|
1972
|
+
(void) i1;
|
1973
|
+
}
|
1974
|
+
|
1975
|
+
inline void ggml_cuda_op_scale(
|
1976
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
1977
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
1978
|
+
cudaStream_t & cudaStream_main){
|
1979
|
+
|
1980
|
+
GGML_ASSERT(src0_ddf_i != nullptr);
|
1981
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
1982
|
+
|
1983
|
+
const float scale = ((float *) src1->data)[0];
|
1984
|
+
|
1985
|
+
const int64_t ne00 = src0->ne[0];
|
1986
|
+
const int64_t i01_diff = i01_high - i01_low;
|
1987
|
+
|
1988
|
+
// compute
|
1989
|
+
scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
|
1990
|
+
CUDA_CHECK(cudaGetLastError());
|
1991
|
+
|
1992
|
+
(void) src1;
|
1993
|
+
(void) dst;
|
1994
|
+
(void) src0_ddq_i;
|
1995
|
+
(void) src1_ddf_i;
|
1996
|
+
(void) i02;
|
1997
|
+
(void) i1;
|
1998
|
+
}
|
1999
|
+
|
1380
2000
|
static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
1381
|
-
ggml_cuda_op_t op, bool src0_needs_f32) {
|
2001
|
+
ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
|
1382
2002
|
const int64_t ne00 = src0->ne[0];
|
1383
2003
|
const int64_t ne01 = src0->ne[1];
|
1384
2004
|
const int64_t ne02 = src0->ne[2];
|
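ggml_cuda_op_diag_mask_inf reads n_past from the first int32 of src1 and applies the causal mask: within each group of rows_per_channel rows, column j of row i is masked to -INFINITY when j > n_past + i. A small worked example of our own, for n_past = 1 and a 3x3 tile:

    // row 0: columns 0..1 kept, column 2 masked to -inf
    // row 1: columns 0..2 kept (2 <= n_past + 1)
    // row 2: columns 0..2 kept (nothing left to mask in a 3-wide row)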
@@ -1401,21 +2021,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1401
2021
|
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
1402
2022
|
|
1403
2023
|
// strides for iteration over dims 3 and 2
|
1404
|
-
const int64_t
|
1405
|
-
const int64_t
|
1406
|
-
const int64_t
|
1407
|
-
const int64_t
|
2024
|
+
const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
|
2025
|
+
const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
|
2026
|
+
const int64_t src0_stride = ne00 * ne01 * stride_mod;
|
2027
|
+
const int64_t src1_stride = ne10 * ne11 * stride_mod;
|
2028
|
+
const int64_t dst_stride = ne0 * ne1 * stride_mod;
|
1408
2029
|
|
1409
2030
|
const size_t src0_ts = ggml_type_size(src0->type);
|
1410
2031
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
1411
2032
|
|
1412
|
-
struct ggml_tensor_extra_gpu * src0_extra =
|
2033
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
1413
2034
|
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
1414
|
-
struct ggml_tensor_extra_gpu * dst_extra
|
2035
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
1415
2036
|
|
1416
2037
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
2038
|
+
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
1417
2039
|
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
|
1418
2040
|
|
2041
|
+
const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
|
2042
|
+
const bool src1_stays_on_host = use_src1 && (
|
2043
|
+
dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
|
2044
|
+
|
1419
2045
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
1420
2046
|
|
1421
2047
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
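The new flatten_rows flag lets ggml_cuda_op treat the whole tensor as a single slab of rows instead of iterating over the i02/i03 dimensions; the per-iteration strides absorb the higher dimensions accordingly. A small numeric illustration with values of our own:

    // example shape: ne00 = 8, ne01 = 4, ne02 = 3, ne03 = 2
    // flatten_rows == false: num_iters = 3*2 = 6, src0_stride = 8*4     = 32  elements per iteration
    // flatten_rows == true : num_iters = 1,       src0_stride = 8*4*3*2 = 192 elements in one pass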
@@ -1424,13 +2050,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1424
2050
|
char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
|
1425
2051
|
float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
|
1426
2052
|
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1427
|
-
float *
|
2053
|
+
float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1428
2054
|
|
1429
2055
|
// asq = actual size quantized, asf = actual size float
|
1430
2056
|
size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
1431
2057
|
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
1432
2058
|
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
1433
|
-
size_t
|
2059
|
+
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
2060
|
+
|
2061
|
+
// if multiple GPUs are used they need to wait for the main GPU to finish
|
2062
|
+
if (split && g_device_count > 1) {
|
2063
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2064
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
2065
|
+
}
|
1434
2066
|
|
1435
2067
|
for (int id = 0; id < g_device_count; ++id) {
|
1436
2068
|
if (!split && id != g_main_device) {
|
@@ -1443,9 +2075,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1443
2075
|
int64_t row_low, row_high;
|
1444
2076
|
if (split) {
|
1445
2077
|
row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
|
1446
|
-
row_low -= row_low % GGML_CUDA_DMMV_Y;
|
1447
2078
|
row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
|
1448
|
-
row_high -= row_high % GGML_CUDA_DMMV_Y;
|
1449
2079
|
} else {
|
1450
2080
|
row_low = 0;
|
1451
2081
|
row_high = nrows0;
|
@@ -1458,7 +2088,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1458
2088
|
|
1459
2089
|
cudaSetDevice(id);
|
1460
2090
|
|
1461
|
-
if (src0_on_device) {
|
2091
|
+
if (src0_on_device && src0_is_contiguous) {
|
1462
2092
|
if (src0_is_f32) {
|
1463
2093
|
src0_ddf[id] = (float *) src0_extra->data_device[id];
|
1464
2094
|
} else {
|
@@ -1476,8 +2106,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1476
2106
|
src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
|
1477
2107
|
}
|
1478
2108
|
|
1479
|
-
if (use_src1) {
|
1480
|
-
if (src1_on_device) {
|
2109
|
+
if (use_src1 && !src1_stays_on_host) {
|
2110
|
+
if (src1_on_device && src1_is_contiguous) {
|
1481
2111
|
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
1482
2112
|
} else {
|
1483
2113
|
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
|
@@ -1490,26 +2120,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1490
2120
|
dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
|
1491
2121
|
}
|
1492
2122
|
|
1493
|
-
|
2123
|
+
const int64_t i03_max = flatten_rows ? 1 : ne03;
|
2124
|
+
const int64_t i02_max = flatten_rows ? 1 : ne02;
|
2125
|
+
const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
|
2126
|
+
|
2127
|
+
for (int64_t i03 = 0; i03 < i03_max; i03++) {
|
1494
2128
|
const int64_t i13 = i03 % ne13;
|
1495
|
-
for (int64_t i02 = 0; i02 <
|
2129
|
+
for (int64_t i02 = 0; i02 < i02_max; i02++) {
|
1496
2130
|
const int64_t i12 = i02 % ne12;
|
1497
2131
|
|
1498
2132
|
const int64_t i0 = i03*ne02 + i02;
|
1499
|
-
|
1500
|
-
|
2133
|
+
|
2134
|
+
// i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
|
2135
|
+
const int64_t i0_offset_low = row_low/rows_per_iter;
|
2136
|
+
const int64_t i0_offset_high = row_high/rows_per_iter;
|
1501
2137
|
|
1502
2138
|
int64_t i01_low = 0;
|
1503
|
-
int64_t i01_high =
|
2139
|
+
int64_t i01_high = rows_per_iter;
|
1504
2140
|
if (split) {
|
1505
2141
|
if (i0 < i0_offset_low || i0 > i0_offset_high) {
|
1506
2142
|
continue;
|
1507
2143
|
}
|
1508
2144
|
if (i0 == i0_offset_low) {
|
1509
|
-
i01_low = row_low %
|
2145
|
+
i01_low = row_low % rows_per_iter;
|
1510
2146
|
}
|
1511
2147
|
if (i0 == i0_offset_high) {
|
1512
|
-
i01_high = row_high %
|
2148
|
+
i01_high = row_high % rows_per_iter;
|
1513
2149
|
}
|
1514
2150
|
}
|
1515
2151
|
|
@@ -1518,7 +2154,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1518
2154
|
// Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
|
1519
2155
|
// The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
|
1520
2156
|
GGML_ASSERT(i01_low == 0 || g_device_count > 1);
|
1521
|
-
GGML_ASSERT(i01_high ==
|
2157
|
+
GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
|
1522
2158
|
|
1523
2159
|
const int64_t i01_diff = i01_high - i01_low;
|
1524
2160
|
if (i01_diff == 0) {
|
@@ -1526,24 +2162,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1526
2162
|
}
|
1527
2163
|
const int64_t i11 = i13*ne12 + i12;
|
1528
2164
|
|
1529
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id]
|
1530
|
-
cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
|
1531
|
-
cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
|
2165
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
1532
2166
|
|
1533
2167
|
// for split tensors the data begins at i0 == i0_offset_low
|
1534
2168
|
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
|
1535
2169
|
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
|
1536
2170
|
float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
|
1537
|
-
float * dst_ddf_i
|
2171
|
+
float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
|
1538
2172
|
|
1539
2173
|
// for split tensors the data pointer needs to be rounded down
|
1540
2174
|
// to the bin edge for i03, i02 bins beyond the first
|
1541
2175
|
if (i0 - i0_offset_low > 0) {
|
2176
|
+
GGML_ASSERT(!flatten_rows);
|
1542
2177
|
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
|
1543
2178
|
src0_ddf_i -= (row_low % ne01)*ne00;
|
1544
|
-
|
1545
|
-
if (i0 - i0_offset_low > 0) {
|
1546
|
-
dst_ddf_i -= (row_low % ne0)*ne1;
|
2179
|
+
dst_ddf_i -= (row_low % ne0)*ne1;
|
1547
2180
|
}
|
1548
2181
|
|
1549
2182
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
@@ -1553,38 +2186,41 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1553
2186
|
}
|
1554
2187
|
|
1555
2188
|
// copy src0, src1 to device if necessary
|
1556
|
-
if (use_src1) {
|
2189
|
+
if (use_src1 && !src1_stays_on_host) {
|
1557
2190
|
if (src1->backend == GGML_BACKEND_CPU) {
|
1558
|
-
|
1559
|
-
|
2191
|
+
GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
|
2192
|
+
int64_t nrows1 = flatten_rows ? nrows0 : ne11;
|
2193
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
|
2194
|
+
} else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
1560
2195
|
if (id != g_main_device) {
|
2196
|
+
GGML_ASSERT(!flatten_rows);
|
1561
2197
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
1562
2198
|
src1_ddf_i_source += i11*src1_stride;
|
1563
2199
|
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
|
1564
|
-
cudaMemcpyDeviceToDevice,
|
2200
|
+
cudaMemcpyDeviceToDevice, cudaStream_main));
|
1565
2201
|
}
|
2202
|
+
} else if (src1_on_device && !src1_is_contiguous) {
|
2203
|
+
GGML_ASSERT(!split);
|
2204
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
|
1566
2205
|
} else {
|
1567
2206
|
GGML_ASSERT(false);
|
1568
2207
|
}
|
1569
2208
|
}
|
1570
|
-
|
1571
|
-
if (!src0_on_device) {
|
2209
|
+
|
2210
|
+
if (!src0_on_device || !src0_is_contiguous) {
|
1572
2211
|
if (src0_is_f32) {
|
1573
|
-
CUDA_CHECK(
|
2212
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
|
1574
2213
|
} else {
|
1575
|
-
CUDA_CHECK(
|
2214
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
|
1576
2215
|
}
|
1577
2216
|
}
|
1578
2217
|
|
1579
|
-
// convert src0 to f32 if it
|
2218
|
+
// convert src0 to f32 if it is necessary for the ggml_cuda_op
|
1580
2219
|
if (src0_needs_f32 && !src0_is_f32) {
|
1581
2220
|
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
|
1582
2221
|
CUDA_CHECK(cudaGetLastError());
|
1583
2222
|
}
|
1584
2223
|
|
1585
|
-
// wait with main stream until src1 memcpy is done
|
1586
|
-
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
|
1587
|
-
|
1588
2224
|
// do the computation
|
1589
2225
|
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
|
1590
2226
|
|
@@ -1622,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1622
2258
|
|
1623
2259
|
// wait until each device is finished, then free their buffers
|
1624
2260
|
for (int id = 0; id < g_device_count; ++id) {
|
2261
|
+
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
|
2262
|
+
continue;
|
2263
|
+
}
|
2264
|
+
|
1625
2265
|
CUDA_CHECK(cudaSetDevice(id));
|
1626
2266
|
CUDA_CHECK(cudaDeviceSynchronize());
|
2267
|
+
|
1627
2268
|
if (src0_asq[id] > 0) {
|
1628
2269
|
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
|
1629
2270
|
}
|
@@ -1641,39 +2282,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1641
2282
|
|
1642
2283
|
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1643
2284
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1644
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
|
2285
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
|
1645
2286
|
}
|
1646
2287
|
|
1647
2288
|
void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1648
2289
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1649
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
|
2290
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
|
1650
2291
|
}
|
1651
2292
|
|
1652
2293
|
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1653
2294
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1654
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
|
2295
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
|
1655
2296
|
}
|
1656
2297
|
|
1657
2298
|
void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1658
2299
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1659
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
|
2300
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
|
1660
2301
|
}
|
1661
2302
|
|
1662
2303
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
1663
|
-
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
|
1664
2304
|
const int64_t ne10 = src1->ne[0];
|
1665
2305
|
|
1666
2306
|
const int64_t ne0 = dst->ne[0];
|
1667
2307
|
const int64_t ne1 = dst->ne[1];
|
1668
2308
|
|
1669
|
-
// if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
|
1670
|
-
// fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
|
1671
|
-
// src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
1672
|
-
// src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
1673
|
-
// dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
|
1674
|
-
// return false;
|
1675
|
-
// }
|
1676
|
-
|
1677
2309
|
// TODO: find the optimal values for these
|
1678
2310
|
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
1679
2311
|
src1->type == GGML_TYPE_F32 &&
|
@@ -1685,23 +2317,152 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
|
|
1685
2317
|
return false;
|
1686
2318
|
}
|
1687
2319
|
|
2320
|
+
void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
2321
|
+
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
2322
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
2323
|
+
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
2324
|
+
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
2325
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
2326
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
2327
|
+
|
2328
|
+
const int64_t ne00 = src0->ne[0];
|
2329
|
+
const int64_t ne01 = src0->ne[1];
|
2330
|
+
const int64_t ne02 = src0->ne[2];
|
2331
|
+
|
2332
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2333
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2334
|
+
|
2335
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2336
|
+
void * src0_ddq = src0_extra->data_device[g_main_device];
|
2337
|
+
|
2338
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
2339
|
+
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
2340
|
+
|
2341
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
2342
|
+
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
2343
|
+
|
2344
|
+
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
|
2345
|
+
}
|
2346
|
+
|
2347
|
+
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
2348
|
+
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
|
2349
|
+
GGML_ASSERT(!ggml_is_permuted(src0));
|
2350
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
2351
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
2352
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
2353
|
+
|
2354
|
+
const int64_t ne00 = src0->ne[0];
|
2355
|
+
const int64_t ne01 = src0->ne[1];
|
2356
|
+
const int64_t ne02 = src0->ne[2];
|
2357
|
+
|
2358
|
+
const int64_t nb01 = src0->nb[1];
|
2359
|
+
const int64_t nb02 = src0->nb[2];
|
2360
|
+
|
2361
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2362
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2363
|
+
|
2364
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2365
|
+
void * src0_ddq = src0_extra->data_device[g_main_device];
|
2366
|
+
|
2367
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
2368
|
+
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
2369
|
+
|
2370
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
2371
|
+
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
2372
|
+
|
2373
|
+
const int row_stride_x = nb01 / sizeof(half);
|
2374
|
+
const int channel_stride_x = nb02 / sizeof(half);
|
2375
|
+
|
2376
|
+
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
|
2377
|
+
}
|
2378
|
+
|
1688
2379
|
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1689
|
-
|
1690
|
-
|
2380
|
+
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
2381
|
+
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
|
2382
|
+
|
2383
|
+
if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
2384
|
+
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
2385
|
+
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
2386
|
+
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
2387
|
+
} else if (src0->type == GGML_TYPE_F32) {
|
2388
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
1691
2389
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
1692
|
-
if (src1->ne[1] == 1) {
|
1693
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
2390
|
+
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
|
2391
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
|
1694
2392
|
} else {
|
1695
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
|
2393
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
1696
2394
|
}
|
1697
2395
|
} else {
|
1698
2396
|
GGML_ASSERT(false);
|
1699
2397
|
}
|
1700
2398
|
}
|
1701
2399
|
|
2400
|
+
void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2401
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2402
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2406
|
+
const int64_t ne = ggml_nelements(src0);
|
2407
|
+
GGML_ASSERT(ne == ggml_nelements(src1));
|
2408
|
+
|
2409
|
+
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
2410
|
+
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
2411
|
+
|
2412
|
+
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
2413
|
+
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
2414
|
+
|
2415
|
+
const int64_t ne00 = src0->ne[0];
|
2416
|
+
const int64_t ne01 = src0->ne[1];
|
2417
|
+
GGML_ASSERT(src0->ne[3] == 1);
|
2418
|
+
|
2419
|
+
const int64_t nb00 = src0->nb[0];
|
2420
|
+
const int64_t nb01 = src0->nb[1];
|
2421
|
+
const int64_t nb02 = src0->nb[2];
|
2422
|
+
|
2423
|
+
const int64_t ne10 = src1->ne[0];
|
2424
|
+
const int64_t ne11 = src1->ne[1];
|
2425
|
+
GGML_ASSERT(src1->ne[3] == 1);
|
2426
|
+
|
2427
|
+
const int64_t nb10 = src1->nb[0];
|
2428
|
+
const int64_t nb11 = src1->nb[1];
|
2429
|
+
const int64_t nb12 = src1->nb[2];
|
2430
|
+
|
2431
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2432
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2433
|
+
|
2434
|
+
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2435
|
+
const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
2436
|
+
|
2437
|
+
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
2438
|
+
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
2439
|
+
|
2440
|
+
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
2441
|
+
ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
2442
|
+
ne10, ne11, nb10, nb11, nb12, cudaStream_main);
|
2443
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
2444
|
+
ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
2445
|
+
ne10, ne11, nb10, nb11, nb12, cudaStream_main);
|
2446
|
+
} else {
|
2447
|
+
GGML_ASSERT(false);
|
2448
|
+
}
|
2449
|
+
|
2450
|
+
(void) dst;
|
2451
|
+
}
|
2452
|
+
|
2453
|
+
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2454
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2455
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
|
2456
|
+
}
|
2457
|
+
|
2458
|
+
void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2459
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2460
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
|
2461
|
+
}
|
2462
|
+
|
1702
2463
|
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1703
2464
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1704
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
|
2465
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
|
1705
2466
|
}
|
1706
2467
|
|
1707
2468
|
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
@@ -1710,16 +2471,14 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
|
1710
2471
|
(void) dst;
|
1711
2472
|
}
|
1712
2473
|
|
1713
|
-
void
|
1714
|
-
FILE * fp = fopen(fname, "rb");
|
2474
|
+
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
1715
2475
|
int nrows = ggml_nrows(tensor);
|
1716
2476
|
const size_t nb1 = tensor->nb[1];
|
1717
2477
|
ggml_backend backend = tensor->backend;
|
1718
2478
|
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
2479
|
+
memset(extra, 0, sizeof(*extra));
|
1719
2480
|
|
1720
2481
|
for (int id = 0; id < g_device_count; ++id) {
|
1721
|
-
extra->data_device[id] = nullptr;
|
1722
|
-
|
1723
2482
|
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
|
1724
2483
|
continue;
|
1725
2484
|
}
|
@@ -1732,10 +2491,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
|
|
1732
2491
|
row_high = nrows;
|
1733
2492
|
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
1734
2493
|
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
1735
|
-
row_low -= row_low % GGML_CUDA_DMMV_Y;
|
1736
2494
|
row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
|
1737
|
-
row_high -= row_high % GGML_CUDA_DMMV_Y;
|
1738
|
-
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
1739
2495
|
} else {
|
1740
2496
|
GGML_ASSERT(false);
|
1741
2497
|
}
|
@@ -1745,35 +2501,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
|
|
1745
2501
|
|
1746
2502
|
int64_t nrows_split = row_high - row_low;
|
1747
2503
|
|
1748
|
-
const size_t offset_split =
|
2504
|
+
const size_t offset_split = row_low*nb1;
|
1749
2505
|
const size_t size = ggml_nbytes_split(tensor, nrows_split);
|
1750
2506
|
|
1751
2507
|
void * buf;
|
1752
2508
|
CUDA_CHECK(cudaMalloc(&buf, size));
|
1753
|
-
void * buf_host =
|
1754
|
-
|
1755
|
-
#ifdef _WIN32
|
1756
|
-
int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
|
1757
|
-
#else
|
1758
|
-
int ret = fseek(fp, (long) offset_split, SEEK_SET);
|
1759
|
-
#endif
|
1760
|
-
GGML_ASSERT(ret == 0); // same
|
1761
|
-
|
1762
|
-
size_t ret2 = fread(buf_host, size, 1, fp);
|
1763
|
-
if (ret2 != 1) {
|
1764
|
-
fprintf(stderr, "unexpectedly reached end of file");
|
1765
|
-
exit(1);
|
1766
|
-
}
|
2509
|
+
void * buf_host = (char *) data + offset_split;
|
1767
2510
|
|
1768
2511
|
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
|
1769
|
-
cudaDeviceSynchronize();
|
1770
2512
|
|
1771
|
-
free(buf_host);
|
1772
2513
|
extra->data_device[id] = buf;
|
1773
2514
|
}
|
1774
2515
|
|
1775
2516
|
tensor->extra = extra;
|
1776
|
-
fclose(fp);
|
1777
2517
|
}
|
1778
2518
|
|
1779
2519
|
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
@@ -1795,47 +2535,78 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
|
1795
2535
|
delete extra;
|
1796
2536
|
}
|
1797
2537
|
|
1798
|
-
void
|
1799
|
-
if (
|
1800
|
-
|
2538
|
+
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
|
2539
|
+
if (scratch && g_scratch_size == 0) {
|
2540
|
+
return;
|
1801
2541
|
}
|
1802
2542
|
|
1803
|
-
|
1804
|
-
|
1805
|
-
|
1806
|
-
|
2543
|
+
// recursively assign CUDA buffers until a compute tensor is found
|
2544
|
+
if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
|
2545
|
+
const ggml_op src0_op = tensor->src0->op;
|
2546
|
+
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
|
2547
|
+
ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
|
2548
|
+
}
|
2549
|
+
}
|
2550
|
+
if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
|
2551
|
+
ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
|
1807
2552
|
}
|
1808
2553
|
|
1809
2554
|
tensor->backend = GGML_BACKEND_GPU;
|
1810
2555
|
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
1811
2556
|
|
1812
|
-
bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data
|
2557
|
+
const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
|
2558
|
+
tensor->op == GGML_OP_VIEW;
|
2559
|
+
const size_t size = ggml_nbytes(tensor);
|
1813
2560
|
|
1814
2561
|
CUDA_CHECK(cudaSetDevice(g_main_device));
|
1815
2562
|
if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
|
1816
2563
|
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
|
1817
|
-
|
1818
|
-
|
1819
|
-
|
2564
|
+
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
2565
|
+
size_t offset = 0;
|
2566
|
+
if (tensor->op == GGML_OP_VIEW) {
|
2567
|
+
memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
|
2568
|
+
}
|
2569
|
+
extra->data_device[g_main_device] = src0_ddc + offset;
|
2570
|
+
} else if (tensor->op == GGML_OP_CPY) {
|
2571
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
|
2572
|
+
void * src1_ddv = src1_extra->data_device[g_main_device];
|
2573
|
+
extra->data_device[g_main_device] = src1_ddv;
|
2574
|
+
} else if (scratch) {
|
2575
|
+
GGML_ASSERT(size <= g_scratch_size);
|
2576
|
+
if (g_scratch_offset + size > g_scratch_size) {
|
2577
|
+
g_scratch_offset = 0;
|
2578
|
+
}
|
2579
|
+
|
1820
2580
|
char * data = (char *) g_scratch_buffer;
|
1821
2581
|
if (data == nullptr) {
|
1822
2582
|
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
1823
2583
|
g_scratch_buffer = data;
|
1824
2584
|
}
|
1825
2585
|
extra->data_device[g_main_device] = data + g_scratch_offset;
|
1826
|
-
}
|
1827
2586
|
|
1828
|
-
|
1829
|
-
|
1830
|
-
|
1831
|
-
|
2587
|
+
g_scratch_offset += size;
|
2588
|
+
|
2589
|
+
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
2590
|
+
} else { // allocate new buffers outside of scratch
|
2591
|
+
void * data;
|
2592
|
+
CUDA_CHECK(cudaMalloc(&data, size));
|
2593
|
+
CUDA_CHECK(cudaMemset(data, 0, size));
|
2594
|
+
extra->data_device[g_main_device] = data;
|
2595
|
+
}
|
1832
2596
|
|
1833
|
-
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
1834
2597
|
tensor->extra = extra;
|
1835
2598
|
}
|
1836
2599
|
|
2600
|
+
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
2601
|
+
ggml_cuda_assign_buffers_impl(tensor, true);
|
2602
|
+
}
|
2603
|
+
|
2604
|
+
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
2605
|
+
ggml_cuda_assign_buffers_impl(tensor, false);
|
2606
|
+
}
|
2607
|
+
|
1837
2608
|
void ggml_cuda_set_main_device(int main_device) {
|
1838
|
-
if (main_device
|
2609
|
+
if (main_device >= g_device_count) {
|
1839
2610
|
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
1840
2611
|
main_device, g_device_count, g_main_device);
|
1841
2612
|
return;
|
@@ -1852,6 +2623,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
|
|
1852
2623
|
g_scratch_size = scratch_size;
|
1853
2624
|
}
|
1854
2625
|
|
2626
|
+
void ggml_cuda_free_scratch() {
|
2627
|
+
if (g_scratch_buffer == nullptr) {
|
2628
|
+
return;
|
2629
|
+
}
|
2630
|
+
|
2631
|
+
CUDA_CHECK(cudaFree(g_scratch_buffer));
|
2632
|
+
g_scratch_buffer = nullptr;
|
2633
|
+
}
|
2634
|
+
|
1855
2635
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
1856
2636
|
ggml_cuda_func_t func;
|
1857
2637
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
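Together with ggml_cuda_assign_buffers and ggml_cuda_assign_buffers_no_scratch above, ggml_cuda_set_scratch_size and ggml_cuda_free_scratch give callers control over the VRAM scratch area. A hedged usage sketch; the tensor names are illustrative and not taken from this diff:

    ggml_cuda_set_scratch_size(512u*1024*1024);     // reserve 512 MB of scratch for intermediate results
    ggml_cuda_assign_buffers(kq);                   // temporary tensor: may be placed in the scratch buffer
    ggml_cuda_assign_buffers_no_scratch(k_cache);   // long-lived tensor: gets its own cudaMalloc'd buffer
    // ... evaluate the graph ...
    ggml_cuda_free_scratch();                       // release the scratch buffer when no longer needed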
@@ -1889,12 +2669,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
1889
2669
|
}
|
1890
2670
|
func = ggml_cuda_mul_mat;
|
1891
2671
|
break;
|
2672
|
+
case GGML_OP_SCALE:
|
2673
|
+
if (!any_on_device) {
|
2674
|
+
return false;
|
2675
|
+
}
|
2676
|
+
func = ggml_cuda_scale;
|
2677
|
+
break;
|
2678
|
+
case GGML_OP_CPY:
|
2679
|
+
if (!any_on_device) {
|
2680
|
+
return false;
|
2681
|
+
}
|
2682
|
+
func = ggml_cuda_cpy;
|
2683
|
+
break;
|
1892
2684
|
case GGML_OP_RESHAPE:
|
2685
|
+
case GGML_OP_VIEW:
|
2686
|
+
case GGML_OP_PERMUTE:
|
2687
|
+
case GGML_OP_TRANSPOSE:
|
1893
2688
|
if (!any_on_device) {
|
1894
2689
|
return false;
|
1895
2690
|
}
|
1896
2691
|
func = ggml_cuda_nop;
|
1897
2692
|
break;
|
2693
|
+
case GGML_OP_DIAG_MASK_INF:
|
2694
|
+
if (!any_on_device) {
|
2695
|
+
return false;
|
2696
|
+
}
|
2697
|
+
func = ggml_cuda_diag_mask_inf;
|
2698
|
+
break;
|
2699
|
+
case GGML_OP_SOFT_MAX:
|
2700
|
+
if (!any_on_device) {
|
2701
|
+
return false;
|
2702
|
+
}
|
2703
|
+
func = ggml_cuda_soft_max;
|
2704
|
+
break;
|
1898
2705
|
case GGML_OP_ROPE:
|
1899
2706
|
if (!any_on_device) {
|
1900
2707
|
return false;
|