llama_cpp 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/examples/README.md +92 -0
- data/examples/chat.rb +195 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1218 -411
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +703 -514
- data/ext/llama_cpp/src/ggml-metal.metal +574 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +496 -36
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +2715 -476
- data/ext/llama_cpp/src/ggml.h +266 -11
- data/ext/llama_cpp/src/llama.cpp +266 -135
- data/ext/llama_cpp/src/llama.h +19 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +5 -2
@@ -1,5 +1,6 @@
|
|
1
1
|
#include <cstddef>
|
2
2
|
#include <cstdint>
|
3
|
+
#include <limits>
|
3
4
|
#include <stdint.h>
|
4
5
|
#include <stdio.h>
|
5
6
|
#include <atomic>
|
@@ -12,6 +13,10 @@
|
|
12
13
|
#include "ggml-cuda.h"
|
13
14
|
#include "ggml.h"
|
14
15
|
|
16
|
+
#if defined(_MSC_VER)
|
17
|
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
18
|
+
#endif
|
19
|
+
|
15
20
|
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
16
21
|
|
17
22
|
#define CUDA_CHECK(err) \
|
@@ -24,7 +29,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
24
29
|
} \
|
25
30
|
} while (0)
|
26
31
|
|
27
|
-
#if CUDART_VERSION >=
|
32
|
+
#if CUDART_VERSION >= 12000
|
28
33
|
#define CUBLAS_CHECK(err) \
|
29
34
|
do { \
|
30
35
|
cublasStatus_t err_ = (err); \
|
@@ -45,9 +50,18 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
45
50
|
} while (0)
|
46
51
|
#endif // CUDART_VERSION >= 11
|
47
52
|
|
48
|
-
|
53
|
+
#ifdef GGML_CUDA_DMMV_F16
|
54
|
+
typedef half dfloat; // dequantize float
|
55
|
+
typedef half2 dfloat2;
|
56
|
+
#else
|
57
|
+
typedef float dfloat; // dequantize float
|
58
|
+
typedef float2 dfloat2;
|
59
|
+
#endif //GGML_CUDA_DMMV_F16
|
60
|
+
|
61
|
+
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
49
62
|
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
50
63
|
typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
|
64
|
+
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
51
65
|
typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
52
66
|
typedef void (*ggml_cuda_op_t)(
|
53
67
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
|
@@ -151,7 +165,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
151
165
|
#define CUDA_ADD_BLOCK_SIZE 256
|
152
166
|
#define CUDA_MUL_BLOCK_SIZE 256
|
153
167
|
#define CUDA_SILU_BLOCK_SIZE 256
|
168
|
+
#define CUDA_CPY_BLOCK_SIZE 32
|
169
|
+
#define CUDA_SCALE_BLOCK_SIZE 256
|
154
170
|
#define CUDA_ROPE_BLOCK_SIZE 256
|
171
|
+
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
155
172
|
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
156
173
|
|
157
174
|
// dmmv = dequantize_mul_mat_vec
|
@@ -162,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
162
179
|
#define GGML_CUDA_DMMV_Y 1
|
163
180
|
#endif
|
164
181
|
|
182
|
+
#ifndef K_QUANTS_PER_ITERATION
|
183
|
+
#define K_QUANTS_PER_ITERATION 2
|
184
|
+
#else
|
185
|
+
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
186
|
+
#endif
|
187
|
+
|
165
188
|
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
|
166
189
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
167
190
|
|
@@ -219,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
|
|
219
242
|
}
|
220
243
|
}
|
221
244
|
|
222
|
-
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs,
|
245
|
+
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
223
246
|
const block_q4_0 * x = (const block_q4_0 *) vx;
|
224
247
|
|
225
|
-
const
|
248
|
+
const dfloat d = x[ib].d;
|
226
249
|
|
227
|
-
const
|
250
|
+
const int vui = x[ib].qs[iqs];
|
228
251
|
|
229
|
-
|
230
|
-
|
252
|
+
v.x = vui & 0xF;
|
253
|
+
v.y = vui >> 4;
|
231
254
|
|
232
|
-
|
233
|
-
|
255
|
+
#ifdef GGML_CUDA_DMMV_F16
|
256
|
+
v = __hsub2(v, {8.0f, 8.0f});
|
257
|
+
v = __hmul2(v, {d, d});
|
258
|
+
#else
|
259
|
+
v.x = (v.x - 8.0f) * d;
|
260
|
+
v.y = (v.y - 8.0f) * d;
|
261
|
+
#endif // GGML_CUDA_DMMV_F16
|
234
262
|
}
|
235
263
|
|
236
|
-
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs,
|
264
|
+
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
237
265
|
const block_q4_1 * x = (const block_q4_1 *) vx;
|
238
266
|
|
239
|
-
const
|
240
|
-
const
|
267
|
+
const dfloat d = x[ib].d;
|
268
|
+
const dfloat m = x[ib].m;
|
241
269
|
|
242
|
-
const
|
270
|
+
const int vui = x[ib].qs[iqs];
|
243
271
|
|
244
|
-
|
245
|
-
|
272
|
+
v.x = vui & 0xF;
|
273
|
+
v.y = vui >> 4;
|
246
274
|
|
247
|
-
|
248
|
-
|
275
|
+
#ifdef GGML_CUDA_DMMV_F16
|
276
|
+
v = __hmul2(v, {d, d});
|
277
|
+
v = __hadd2(v, {m, m});
|
278
|
+
#else
|
279
|
+
v.x = (v.x * d) + m;
|
280
|
+
v.y = (v.y * d) + m;
|
281
|
+
#endif // GGML_CUDA_DMMV_F16
|
249
282
|
}
|
250
283
|
|
251
|
-
static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs,
|
284
|
+
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
252
285
|
const block_q5_0 * x = (const block_q5_0 *) vx;
|
253
286
|
|
254
|
-
const
|
287
|
+
const dfloat d = x[ib].d;
|
255
288
|
|
256
289
|
uint32_t qh;
|
257
290
|
memcpy(&qh, x[ib].qh, sizeof(qh));
|
258
291
|
|
259
|
-
const
|
260
|
-
const
|
292
|
+
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
293
|
+
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
261
294
|
|
262
|
-
|
263
|
-
|
295
|
+
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
296
|
+
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
264
297
|
|
265
|
-
|
266
|
-
|
298
|
+
#ifdef GGML_CUDA_DMMV_F16
|
299
|
+
v = __hsub2(v, {16.0f, 16.0f});
|
300
|
+
v = __hmul2(v, {d, d});
|
301
|
+
#else
|
302
|
+
v.x = (v.x - 16.0f) * d;
|
303
|
+
v.y = (v.y - 16.0f) * d;
|
304
|
+
#endif // GGML_CUDA_DMMV_F16
|
267
305
|
}
|
268
306
|
|
269
|
-
static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs,
|
307
|
+
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
270
308
|
const block_q5_1 * x = (const block_q5_1 *) vx;
|
271
309
|
|
272
|
-
const
|
273
|
-
const
|
310
|
+
const dfloat d = x[ib].d;
|
311
|
+
const dfloat m = x[ib].m;
|
274
312
|
|
275
313
|
uint32_t qh;
|
276
314
|
memcpy(&qh, x[ib].qh, sizeof(qh));
|
277
315
|
|
278
|
-
const
|
279
|
-
const
|
316
|
+
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
317
|
+
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
280
318
|
|
281
|
-
|
282
|
-
|
319
|
+
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
320
|
+
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
283
321
|
|
284
|
-
|
285
|
-
|
322
|
+
#ifdef GGML_CUDA_DMMV_F16
|
323
|
+
v = __hmul2(v, {d, d});
|
324
|
+
v = __hadd2(v, {m, m});
|
325
|
+
#else
|
326
|
+
v.x = (v.x * d) + m;
|
327
|
+
v.y = (v.y * d) + m;
|
328
|
+
#endif // GGML_CUDA_DMMV_F16
|
286
329
|
}
|
287
330
|
|
288
|
-
static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs,
|
331
|
+
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
289
332
|
const block_q8_0 * x = (const block_q8_0 *) vx;
|
290
333
|
|
291
|
-
const
|
334
|
+
const dfloat d = x[ib].d;
|
292
335
|
|
293
|
-
|
294
|
-
|
336
|
+
v.x = x[ib].qs[iqs + 0];
|
337
|
+
v.y = x[ib].qs[iqs + 1];
|
295
338
|
|
296
|
-
|
297
|
-
|
339
|
+
#ifdef GGML_CUDA_DMMV_F16
|
340
|
+
v = __hmul2(v, {d, d});
|
341
|
+
#else
|
342
|
+
v.x *= d;
|
343
|
+
v.y *= d;
|
344
|
+
#endif // GGML_CUDA_DMMV_F16
|
298
345
|
}
|
299
346
|
|
300
347
|
//================================== k-quants
|
@@ -321,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
|
321
368
|
|
322
369
|
}
|
323
370
|
|
324
|
-
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
325
|
-
|
326
|
-
const block_q2_K * x = (const block_q2_K *) vx;
|
327
|
-
|
328
|
-
// if n is 0, we want to do the lower 128, else the upper 128,
|
329
|
-
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
|
330
|
-
// y[l+16], y[l+48], y[l+80], y[l+112]
|
331
|
-
int n = iqs/128; // 0 or 1
|
332
|
-
int r = iqs - 128*n; // 0...120 in steps of 8
|
333
|
-
int l = r/8; // 0...15 in steps of 1
|
334
|
-
|
335
|
-
const float * y = yy + 128*n + l;
|
336
|
-
const uint8_t * q = x[ib].qs + 32*n + l;
|
337
|
-
const uint8_t * s = x[ib].scales + 8*n;
|
338
|
-
|
339
|
-
const float dall = x[ib].d;
|
340
|
-
const float dmin = x[ib].dmin;
|
341
|
-
|
342
|
-
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
|
343
|
-
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
|
344
|
-
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
|
345
|
-
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
|
346
|
-
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
|
347
|
-
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
|
348
|
-
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
|
349
|
-
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
|
350
|
-
|
351
|
-
result = sum;
|
352
|
-
|
353
|
-
}
|
354
|
-
|
355
371
|
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
356
372
|
|
357
373
|
int r = threadIdx.x/4;
|
@@ -383,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
|
383
399
|
|
384
400
|
}
|
385
401
|
|
386
|
-
static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
387
|
-
|
388
|
-
const block_q3_K * x = (const block_q3_K *) vx;
|
389
|
-
|
390
|
-
const uint32_t kmask1 = 0x03030303;
|
391
|
-
const uint32_t kmask2 = 0x0f0f0f0f;
|
392
|
-
|
393
|
-
uint32_t aux[3];
|
394
|
-
uint32_t utmp[4];
|
395
|
-
|
396
|
-
// if n is 0, we want to do the lower 128, else the upper 128,
|
397
|
-
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
|
398
|
-
// y[l+16], y[l+48], y[l+80], y[l+112]
|
399
|
-
int n = iqs/128; // 0 or 1
|
400
|
-
int r = iqs - 128*n; // 0...120 in steps of 8
|
401
|
-
int l = r/8; // 0...15 in steps of 1
|
402
|
-
|
403
|
-
const float * y = yy + 128*n + l;
|
404
|
-
const uint8_t * q = x[ib].qs + 32*n + l;
|
405
|
-
const uint8_t * hm = x[ib].hmask + l;
|
406
|
-
const int8_t * s = (const int8_t *)utmp + 8*n;
|
407
|
-
|
408
|
-
memcpy(aux, x[ib].scales, 12);
|
409
|
-
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
|
410
|
-
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
|
411
|
-
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
|
412
|
-
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
|
413
|
-
|
414
|
-
const float dall = x[ib].d;
|
415
|
-
|
416
|
-
const uint8_t m = 1 << (4*n);
|
417
|
-
|
418
|
-
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
|
419
|
-
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
|
420
|
-
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
|
421
|
-
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
|
422
|
-
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
|
423
|
-
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
|
424
|
-
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
|
425
|
-
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
|
426
|
-
|
427
|
-
result = sum * dall;
|
428
|
-
|
429
|
-
}
|
430
|
-
|
431
402
|
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
432
403
|
if (j < 4) {
|
433
404
|
d = q[j] & 63; m = q[j + 4] & 63;
|
@@ -474,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
|
474
445
|
}
|
475
446
|
}
|
476
447
|
|
477
|
-
static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
478
|
-
|
479
|
-
const block_q4_K * x = (const block_q4_K *) vx;
|
480
|
-
|
481
|
-
// iqs is in 0...248 in steps of 8 =>
|
482
|
-
const int j = iqs / 64; // j is in 0...3
|
483
|
-
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
|
484
|
-
const int is = 2*j; // is is in 0...6 in steps of 2
|
485
|
-
|
486
|
-
const float * y = yy + 64*j + ir;
|
487
|
-
const uint8_t * q = x[ib].qs + 32*j + ir;
|
488
|
-
|
489
|
-
const float dall = x[ib].d;
|
490
|
-
const float dmin = x[ib].dmin;
|
491
|
-
|
492
|
-
uint8_t sc, m;
|
493
|
-
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
|
494
|
-
const float d1 = dall * sc;
|
495
|
-
const float m1 = dmin * m;
|
496
|
-
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
|
497
|
-
const float d2 = dall * sc;
|
498
|
-
const float m2 = dmin * m;
|
499
|
-
|
500
|
-
float sum = 0;
|
501
|
-
for (int k = 0; k < 4; ++k) {
|
502
|
-
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
|
503
|
-
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
|
504
|
-
}
|
505
|
-
result = sum;
|
506
|
-
|
507
|
-
}
|
508
|
-
|
509
448
|
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
510
449
|
const block_q5_K * x = (const block_q5_K *) vx;
|
511
450
|
|
@@ -539,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
|
539
478
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
540
479
|
}
|
541
480
|
|
542
|
-
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
543
|
-
|
544
|
-
const block_q5_K * x = (const block_q5_K *) vx;
|
545
|
-
|
546
|
-
// iqs is in 0...248 in steps of 8 =>
|
547
|
-
const int j = iqs / 64; // j is in 0...3
|
548
|
-
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
|
549
|
-
const int is = 2*j; // is is in 0...6 in steps of 2
|
550
|
-
|
551
|
-
const float * y = yy + 64*j + ir;
|
552
|
-
const uint8_t * ql = x[ib].qs + 32*j + ir;
|
553
|
-
const uint8_t * qh = x[ib].qh + ir;
|
554
|
-
|
555
|
-
const float dall = x[ib].d;
|
556
|
-
const float dmin = x[ib].dmin;
|
557
|
-
|
558
|
-
uint8_t sc, m;
|
559
|
-
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
|
560
|
-
const float d1 = dall * sc;
|
561
|
-
const float m1 = dmin * m;
|
562
|
-
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
|
563
|
-
const float d2 = dall * sc;
|
564
|
-
const float m2 = dmin * m;
|
565
|
-
|
566
|
-
uint8_t hm = 1 << is;
|
567
|
-
float sum = 0;
|
568
|
-
for (int k = 0; k < 4; ++k) {
|
569
|
-
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
|
570
|
-
}
|
571
|
-
hm <<= 1;
|
572
|
-
for (int k = 0; k < 4; ++k) {
|
573
|
-
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
|
574
|
-
}
|
575
|
-
result = sum;
|
576
|
-
|
577
|
-
}
|
578
|
-
|
579
481
|
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
580
482
|
const block_q6_K * x = (const block_q6_K *) vx;
|
581
483
|
|
@@ -601,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
|
601
503
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
602
504
|
}
|
603
505
|
|
604
|
-
static
|
506
|
+
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
605
507
|
|
606
|
-
|
508
|
+
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
509
|
+
|
510
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
511
|
+
if (row > nrows) return;
|
512
|
+
|
513
|
+
const int num_blocks_per_row = ncols / QK_K;
|
514
|
+
const int ib0 = row*num_blocks_per_row;
|
515
|
+
|
516
|
+
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
517
|
+
|
518
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
519
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
520
|
+
|
521
|
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
522
|
+
|
523
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
524
|
+
const int in = tid - step*im; // 0...15 or 0...7
|
525
|
+
|
526
|
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
527
|
+
const int q_offset = 32*im + l0;
|
528
|
+
const int s_offset = 8*im;
|
529
|
+
const int y_offset = 128*im + l0;
|
530
|
+
|
531
|
+
float tmp = 0; // partial sum for thread in warp
|
532
|
+
|
533
|
+
uint32_t aux[4];
|
534
|
+
const uint8_t * d = (const uint8_t *)aux;
|
535
|
+
const uint8_t * m = (const uint8_t *)(aux + 2);
|
536
|
+
|
537
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
538
|
+
|
539
|
+
const float * y = yy + i * QK_K + y_offset;
|
540
|
+
const uint8_t * q = x[i].qs + q_offset;
|
541
|
+
|
542
|
+
const float dall = x[i].d;
|
543
|
+
const float dmin = x[i].dmin;
|
544
|
+
|
545
|
+
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
546
|
+
aux[0] = a[0] & 0x0f0f0f0f;
|
547
|
+
aux[1] = a[1] & 0x0f0f0f0f;
|
548
|
+
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
|
549
|
+
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
|
550
|
+
|
551
|
+
float sum1 = 0, sum2 = 0;
|
552
|
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
553
|
+
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
|
554
|
+
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
|
555
|
+
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
|
556
|
+
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
|
557
|
+
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
|
558
|
+
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
|
559
|
+
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
|
560
|
+
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
|
561
|
+
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
|
562
|
+
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
|
563
|
+
|
564
|
+
}
|
565
|
+
tmp += dall * sum1 - dmin * sum2;
|
566
|
+
|
567
|
+
}
|
568
|
+
|
569
|
+
// sum up partial sums and write back result
|
570
|
+
__syncthreads();
|
571
|
+
#pragma unroll
|
572
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
573
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
574
|
+
}
|
575
|
+
|
576
|
+
if (tid == 0) {
|
577
|
+
dst[row] = tmp;
|
578
|
+
}
|
579
|
+
}
|
580
|
+
|
581
|
+
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
582
|
+
|
583
|
+
const uint16_t kmask1 = 0x0303;
|
584
|
+
const uint16_t kmask2 = 0x0f0f;
|
585
|
+
|
586
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
587
|
+
if (row > nrows) return;
|
588
|
+
|
589
|
+
const int num_blocks_per_row = ncols / QK_K;
|
590
|
+
const int ib0 = row*num_blocks_per_row;
|
591
|
+
|
592
|
+
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
593
|
+
|
594
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
595
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
596
|
+
|
597
|
+
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
598
|
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
599
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
600
|
+
const int in = tid - step*im; // 0....15 or 0...7
|
601
|
+
|
602
|
+
const uint8_t m = 1 << (4*im);
|
603
|
+
|
604
|
+
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
605
|
+
const int q_offset = 32*im + l0;
|
606
|
+
const int y_offset = 128*im + l0;
|
607
|
+
|
608
|
+
uint16_t utmp[4];
|
609
|
+
const int8_t * s = (const int8_t *)utmp;
|
610
|
+
|
611
|
+
const uint16_t s_shift = 4*im;
|
612
|
+
|
613
|
+
float tmp = 0; // partial sum for thread in warp
|
614
|
+
|
615
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
616
|
+
|
617
|
+
const float * y = yy + i * QK_K + y_offset;
|
618
|
+
const uint8_t * q = x[i].qs + q_offset;
|
619
|
+
const uint8_t * h = x[i].hmask + l0;
|
620
|
+
|
621
|
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
622
|
+
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
|
623
|
+
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
|
624
|
+
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
|
625
|
+
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
|
626
|
+
|
627
|
+
const float d = x[i].d;
|
628
|
+
|
629
|
+
float sum = 0;
|
630
|
+
for (int l = 0; l < n; ++l) {
|
631
|
+
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
|
632
|
+
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
|
633
|
+
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
|
634
|
+
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
|
635
|
+
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
|
636
|
+
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
|
637
|
+
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
|
638
|
+
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
|
639
|
+
}
|
640
|
+
tmp += d * sum;
|
641
|
+
|
642
|
+
}
|
643
|
+
|
644
|
+
// sum up partial sums and write back result
|
645
|
+
__syncthreads();
|
646
|
+
#pragma unroll
|
647
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
648
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
649
|
+
}
|
650
|
+
|
651
|
+
if (tid == 0) {
|
652
|
+
dst[row] = tmp;
|
653
|
+
}
|
654
|
+
}
|
655
|
+
|
656
|
+
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
657
|
+
|
658
|
+
const uint16_t kmask1 = 0x3f3f;
|
659
|
+
const uint16_t kmask2 = 0x0f0f;
|
660
|
+
const uint16_t kmask3 = 0xc0c0;
|
661
|
+
|
662
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
663
|
+
if (row > nrows) return;
|
664
|
+
const int num_blocks_per_row = ncols / QK_K;
|
665
|
+
const int ib0 = row*num_blocks_per_row;
|
666
|
+
|
667
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
668
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
669
|
+
|
670
|
+
const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
|
671
|
+
|
672
|
+
const int il = tid/step; // 0...3
|
673
|
+
const int ir = tid - step*il; // 0...7 or 0...3
|
674
|
+
const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
|
675
|
+
|
676
|
+
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
677
|
+
const int in = il%2;
|
678
|
+
|
679
|
+
const int l0 = n*(2*ir + in);
|
680
|
+
const int q_offset = 32*im + l0;
|
681
|
+
const int y_offset = 64*im + l0;
|
682
|
+
|
683
|
+
uint16_t aux[4];
|
684
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
685
|
+
|
686
|
+
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
687
|
+
|
688
|
+
float tmp = 0; // partial sum for thread in warp
|
689
|
+
|
690
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
691
|
+
|
692
|
+
const uint8_t * q1 = x[i].qs + q_offset;
|
693
|
+
const uint8_t * q2 = q1 + 64;
|
694
|
+
const float * y1 = yy + i*QK_K + y_offset;
|
695
|
+
const float * y2 = y1 + 128;
|
696
|
+
|
697
|
+
const float dall = x[i].d;
|
698
|
+
const float dmin = x[i].dmin;
|
699
|
+
|
700
|
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
701
|
+
aux[0] = a[im+0] & kmask1;
|
702
|
+
aux[1] = a[im+2] & kmask1;
|
703
|
+
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
704
|
+
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
705
|
+
|
706
|
+
float4 s = {0.f, 0.f, 0.f, 0.f};
|
707
|
+
float smin = 0;
|
708
|
+
for (int l = 0; l < n; ++l) {
|
709
|
+
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
|
710
|
+
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
|
711
|
+
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
712
|
+
}
|
713
|
+
tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
|
714
|
+
|
715
|
+
}
|
716
|
+
|
717
|
+
// sum up partial sums and write back result
|
718
|
+
__syncthreads();
|
719
|
+
#pragma unroll
|
720
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
721
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
722
|
+
}
|
723
|
+
|
724
|
+
if (tid == 0) {
|
725
|
+
dst[row] = tmp;
|
726
|
+
}
|
727
|
+
}
|
728
|
+
|
729
|
+
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
|
730
|
+
|
731
|
+
const uint16_t kmask1 = 0x3f3f;
|
732
|
+
const uint16_t kmask2 = 0x0f0f;
|
733
|
+
const uint16_t kmask3 = 0xc0c0;
|
734
|
+
|
735
|
+
//const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
736
|
+
const int row = blockIdx.x;
|
737
|
+
const int num_blocks_per_row = ncols / QK_K;
|
738
|
+
const int ib0 = row*num_blocks_per_row;
|
739
|
+
|
740
|
+
const int tid = threadIdx.x/2; // 0...15
|
741
|
+
const int ix = threadIdx.x%2;
|
742
|
+
|
743
|
+
const int il = tid/4; // 0...3
|
744
|
+
const int ir = tid - 4*il;// 0...3
|
745
|
+
const int n = 2;
|
607
746
|
|
608
|
-
const int
|
609
|
-
const int
|
610
|
-
const int is = 8*ip;
|
747
|
+
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
748
|
+
const int in = il%2;
|
611
749
|
|
612
|
-
const
|
750
|
+
const int l0 = n*(2*ir + in);
|
751
|
+
const int q_offset = 32*im + l0;
|
752
|
+
const int y_offset = 64*im + l0;
|
613
753
|
|
614
|
-
const
|
754
|
+
const uint8_t hm1 = 1 << (2*im);
|
755
|
+
const uint8_t hm2 = hm1 << 4;
|
615
756
|
|
616
|
-
|
617
|
-
const uint8_t *
|
618
|
-
const int8_t * sc = x[ib].scales + is;
|
757
|
+
uint16_t aux[4];
|
758
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
619
759
|
|
620
|
-
|
621
|
-
+ y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
|
622
|
-
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
|
623
|
-
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
|
624
|
-
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
|
625
|
-
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
|
626
|
-
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
|
627
|
-
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
|
760
|
+
const block_q5_K * x = (const block_q5_K *)vx + ib0;
|
628
761
|
|
762
|
+
float tmp = 0; // partial sum for thread in warp
|
763
|
+
|
764
|
+
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
765
|
+
|
766
|
+
const uint8_t * ql1 = x[i].qs + q_offset;
|
767
|
+
const uint8_t * ql2 = ql1 + 64;
|
768
|
+
const uint8_t * qh = x[i].qh + l0;
|
769
|
+
const float * y1 = yy + i*QK_K + y_offset;
|
770
|
+
const float * y2 = y1 + 128;
|
771
|
+
|
772
|
+
const float dall = x[i].d;
|
773
|
+
const float dmin = x[i].dmin;
|
774
|
+
|
775
|
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
776
|
+
aux[0] = a[im+0] & kmask1;
|
777
|
+
aux[1] = a[im+2] & kmask1;
|
778
|
+
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
779
|
+
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
780
|
+
|
781
|
+
float4 sum = {0.f, 0.f, 0.f, 0.f};
|
782
|
+
float smin = 0;
|
783
|
+
for (int l = 0; l < n; ++l) {
|
784
|
+
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
|
785
|
+
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
|
786
|
+
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
|
787
|
+
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
|
788
|
+
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
|
789
|
+
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
|
790
|
+
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
|
791
|
+
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
|
792
|
+
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
|
793
|
+
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
794
|
+
}
|
795
|
+
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
796
|
+
|
797
|
+
}
|
798
|
+
|
799
|
+
// sum up partial sums and write back result
|
800
|
+
__syncthreads();
|
801
|
+
#pragma unroll
|
802
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
803
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
804
|
+
}
|
805
|
+
|
806
|
+
if (tid == 0) {
|
807
|
+
dst[row] = tmp;
|
808
|
+
}
|
629
809
|
}
|
630
810
|
|
631
|
-
static
|
811
|
+
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
812
|
+
|
813
|
+
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
814
|
+
|
815
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
816
|
+
if (row > nrows) return;
|
817
|
+
|
818
|
+
const int num_blocks_per_row = ncols / QK_K;
|
819
|
+
const int ib0 = row*num_blocks_per_row;
|
820
|
+
|
821
|
+
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
822
|
+
|
823
|
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
824
|
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
825
|
+
|
826
|
+
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
827
|
+
|
828
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
829
|
+
const int in = tid - step*im; // 0...15 or 0...7
|
830
|
+
|
831
|
+
#if K_QUANTS_PER_ITERATION == 1
|
832
|
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
833
|
+
const int is = 0;
|
834
|
+
#else
|
835
|
+
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
836
|
+
const int is = in / 4;
|
837
|
+
#endif
|
838
|
+
const int ql_offset = 64*im + l0;
|
839
|
+
const int qh_offset = 32*im + l0;
|
840
|
+
const int s_offset = 8*im + is;
|
841
|
+
const int y_offset = 128*im + l0;
|
842
|
+
|
843
|
+
float tmp = 0; // partial sum for thread in warp
|
844
|
+
|
845
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
846
|
+
|
847
|
+
const float * y = yy + i * QK_K + y_offset;
|
848
|
+
const uint8_t * ql = x[i].ql + ql_offset;
|
849
|
+
const uint8_t * qh = x[i].qh + qh_offset;
|
850
|
+
const int8_t * s = x[i].scales + s_offset;
|
851
|
+
|
852
|
+
const float d = x[i].d;
|
853
|
+
|
854
|
+
#if K_QUANTS_PER_ITERATION == 1
|
855
|
+
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
856
|
+
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
857
|
+
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
858
|
+
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
|
859
|
+
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
|
860
|
+
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
|
861
|
+
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
862
|
+
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
863
|
+
tmp += sum;
|
864
|
+
#else
|
865
|
+
float sum = 0;
|
866
|
+
for (int l = 0; l < 4; ++l) {
|
867
|
+
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
868
|
+
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
|
869
|
+
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
|
870
|
+
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
871
|
+
}
|
872
|
+
tmp += sum;
|
873
|
+
#endif
|
874
|
+
|
875
|
+
}
|
876
|
+
|
877
|
+
// sum up partial sums and write back result
|
878
|
+
__syncthreads();
|
879
|
+
#pragma unroll
|
880
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
881
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
882
|
+
}
|
883
|
+
|
884
|
+
if (tid == 0) {
|
885
|
+
dst[row] = tmp;
|
886
|
+
}
|
887
|
+
}
|
888
|
+
|
889
|
+
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
632
890
|
const half * x = (const half *) vx;
|
633
891
|
|
634
|
-
|
635
|
-
|
892
|
+
// automatic half -> float type cast if dfloat == float
|
893
|
+
v.x = x[ib + iqs + 0];
|
894
|
+
v.y = x[ib + iqs + 1];
|
636
895
|
}
|
637
896
|
|
638
897
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
@@ -649,23 +908,35 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
|
|
649
908
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
650
909
|
|
651
910
|
// dequantize
|
652
|
-
|
653
|
-
|
654
|
-
|
911
|
+
dfloat2 v;
|
912
|
+
dequantize_kernel(vx, ib, iqs, v);
|
913
|
+
|
914
|
+
y[iybs + iqs + 0] = v.x;
|
915
|
+
y[iybs + iqs + y_offset] = v.y;
|
655
916
|
}
|
656
917
|
|
657
918
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
658
|
-
static __global__ void dequantize_mul_mat_vec(const void * vx, const
|
919
|
+
static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
|
659
920
|
// qk = quantized weights per x block
|
660
921
|
// qr = number of quantized weights per data value in x block
|
661
|
-
const int row = blockIdx.
|
922
|
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
923
|
+
|
924
|
+
if (row >= nrows) {
|
925
|
+
return;
|
926
|
+
}
|
927
|
+
|
662
928
|
const int tid = threadIdx.x;
|
663
929
|
|
664
930
|
const int iter_stride = 2*GGML_CUDA_DMMV_X;
|
665
931
|
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
|
666
932
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
667
933
|
|
668
|
-
|
934
|
+
// partial sum for each thread
|
935
|
+
#ifdef GGML_CUDA_DMMV_F16
|
936
|
+
half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
|
937
|
+
#else
|
938
|
+
float tmp = 0.0f;
|
939
|
+
#endif // GGML_CUDA_DMMV_F16
|
669
940
|
|
670
941
|
for (int i = 0; i < ncols; i += iter_stride) {
|
671
942
|
const int col = i + vals_per_iter*tid;
|
@@ -679,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
|
679
950
|
// process 2 vals per j iter
|
680
951
|
|
681
952
|
// dequantize
|
682
|
-
float v0, v1;
|
683
|
-
dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
|
684
953
|
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
|
954
|
+
dfloat2 v;
|
955
|
+
dequantize_kernel(vx, ib, iqs + j/qr, v);
|
685
956
|
|
686
957
|
// matrix multiplication
|
687
|
-
tmp += v0 * y[iybs + iqs + j/qr + 0];
|
688
|
-
tmp += v1 * y[iybs + iqs + j/qr + y_offset];
|
689
958
|
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
959
|
+
#ifdef GGML_CUDA_DMMV_F16
|
960
|
+
tmp += __hmul2(v, {
|
961
|
+
y[iybs + iqs + j/qr + 0],
|
962
|
+
y[iybs + iqs + j/qr + y_offset]
|
963
|
+
});
|
964
|
+
#else
|
965
|
+
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
966
|
+
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
967
|
+
#endif // GGML_CUDA_DMMV_F16
|
690
968
|
}
|
691
969
|
}
|
692
970
|
|
@@ -698,64 +976,232 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
|
698
976
|
}
|
699
977
|
|
700
978
|
if (tid == 0) {
|
979
|
+
#ifdef GGML_CUDA_DMMV_F16
|
980
|
+
dst[row] = tmp.x + tmp.y;
|
981
|
+
#else
|
701
982
|
dst[row] = tmp;
|
983
|
+
#endif // GGML_CUDA_DMMV_F16
|
702
984
|
}
|
703
985
|
}
|
704
986
|
|
705
|
-
|
706
|
-
|
707
|
-
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
708
|
-
const int tid = threadIdx.x;
|
987
|
+
static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
|
988
|
+
const half * x = (half *) vx;
|
709
989
|
|
710
|
-
const int
|
711
|
-
const int
|
712
|
-
const int num_blocks_per_row = ncols / QK_K;
|
713
|
-
const int ib0 = row*num_blocks_per_row;
|
990
|
+
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
991
|
+
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
714
992
|
|
715
|
-
|
993
|
+
const int nrows_y = ncols_x;
|
994
|
+
const int nrows_dst = nrows_x;
|
995
|
+
const int row_dst = row_x;
|
716
996
|
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
const int
|
721
|
-
|
997
|
+
float tmp = 0.0f;
|
998
|
+
|
999
|
+
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
1000
|
+
const int col_x = col_x0 + threadIdx.x;
|
1001
|
+
|
1002
|
+
if (col_x >= ncols_x) {
|
1003
|
+
break;
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
// x is transposed and permuted
|
1007
|
+
const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
|
1008
|
+
const float xi = __half2float(x[ix]);
|
1009
|
+
|
1010
|
+
const int row_y = col_x;
|
1011
|
+
|
1012
|
+
|
1013
|
+
// y is not transposed but permuted
|
1014
|
+
const int iy = channel*nrows_y + row_y;
|
1015
|
+
|
1016
|
+
tmp += xi * y[iy];
|
1017
|
+
}
|
1018
|
+
|
1019
|
+
// dst is not transposed and not permuted
|
1020
|
+
const int idst = channel*nrows_dst + row_dst;
|
1021
|
+
|
1022
|
+
// sum up partial sums and write back result
|
1023
|
+
__syncthreads();
|
1024
|
+
#pragma unroll
|
1025
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
1026
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
1027
|
+
}
|
1028
|
+
|
1029
|
+
if (threadIdx.x == 0) {
|
1030
|
+
dst[idst] = tmp;
|
1031
|
+
}
|
1032
|
+
}
|
1033
|
+
|
1034
|
+
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
1035
|
+
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
1036
|
+
const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
|
1037
|
+
|
1038
|
+
const half * x = (half *) vx;
|
1039
|
+
|
1040
|
+
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
1041
|
+
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
1042
|
+
|
1043
|
+
const int nrows_y = ncols_x;
|
1044
|
+
const int nrows_dst = nrows_x;
|
1045
|
+
const int row_dst = row_x;
|
1046
|
+
|
1047
|
+
const int idst = channel*nrows_dst + row_dst;
|
1048
|
+
|
1049
|
+
float tmp = 0.0f;
|
1050
|
+
|
1051
|
+
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
1052
|
+
const int col_x = col_x0 + threadIdx.x;
|
1053
|
+
|
1054
|
+
if (col_x >= ncols_x) {
|
1055
|
+
break;
|
1056
|
+
}
|
1057
|
+
|
1058
|
+
const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
|
1059
|
+
const float xi = __half2float(x[ix]);
|
1060
|
+
|
1061
|
+
const int row_y = col_x;
|
1062
|
+
|
1063
|
+
const int iy = channel*nrows_y + row_y;
|
1064
|
+
|
1065
|
+
tmp += xi * y[iy];
|
1066
|
+
}
|
1067
|
+
|
1068
|
+
// sum up partial sums and write back result
|
1069
|
+
__syncthreads();
|
1070
|
+
#pragma unroll
|
1071
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
1072
|
+
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
1073
|
+
}
|
1074
|
+
|
1075
|
+
if (threadIdx.x == 0) {
|
1076
|
+
dst[idst] = tmp;
|
1077
|
+
}
|
1078
|
+
}
|
1079
|
+
|
1080
|
+
static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
1081
|
+
const float * xi = (float *) cxi;
|
1082
|
+
float * dsti = (float *) cdsti;
|
1083
|
+
|
1084
|
+
*dsti = *xi;
|
1085
|
+
}
|
1086
|
+
|
1087
|
+
static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
|
1088
|
+
const float * xi = (float *) cxi;
|
1089
|
+
half * dsti = (half *) cdsti;
|
1090
|
+
|
1091
|
+
*dsti = __float2half(*xi);
|
1092
|
+
}
|
1093
|
+
|
1094
|
+
template <cpy_kernel_t cpy_1>
|
1095
|
+
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
1096
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
1097
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
1098
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
1099
|
+
|
1100
|
+
if (i >= ne) {
|
1101
|
+
return;
|
1102
|
+
}
|
1103
|
+
|
1104
|
+
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
1105
|
+
// then combine those indices with the corresponding byte offsets to get the total offsets
|
1106
|
+
const int i02 = i / (ne00*ne01);
|
1107
|
+
const int i01 = (i - i02*ne01*ne00) / ne00;
|
1108
|
+
const int i00 = i - i02*ne01*ne00 - i01*ne00;
|
1109
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
1110
|
+
|
1111
|
+
const int i12 = i / (ne10*ne11);
|
1112
|
+
const int i11 = (i - i12*ne10*ne11) / ne10;
|
1113
|
+
const int i10 = i - i12*ne10*ne11 - i11*ne10;
|
1114
|
+
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
1115
|
+
|
1116
|
+
cpy_1(cx + x_offset, cdst + dst_offset);
|
1117
|
+
}
|
1118
|
+
|
1119
|
+
// rope == RoPE == rotary positional embedding
|
1120
|
+
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
|
1121
|
+
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
1122
|
+
|
1123
|
+
if (col >= ncols) {
|
1124
|
+
return;
|
1125
|
+
}
|
1126
|
+
|
1127
|
+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
1128
|
+
const int i = row*ncols + col;
|
1129
|
+
|
1130
|
+
const float theta = p*powf(theta_scale, col/2);
|
1131
|
+
const float sin_theta = sinf(theta);
|
1132
|
+
const float cos_theta = cosf(theta);
|
1133
|
+
|
1134
|
+
const float x0 = x[i + 0];
|
1135
|
+
const float x1 = x[i + 1];
|
1136
|
+
|
1137
|
+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
1138
|
+
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
1139
|
+
}
|
1140
|
+
|
1141
|
+
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
1142
|
+
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
1143
|
+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
1144
|
+
|
1145
|
+
if (col >= ncols) {
|
1146
|
+
return;
|
1147
|
+
}
|
1148
|
+
|
1149
|
+
const int i = row*ncols + col;
|
1150
|
+
// dst[i] = col > n_past + row ? -INFINITY : x[i];
|
1151
|
+
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
|
1152
|
+
}
|
1153
|
+
|
1154
|
+
// the CUDA soft max implementation differs from the CPU implementation
|
1155
|
+
// instead of doubles floats are used
|
1156
|
+
// values are also not normalized to the maximum value by subtracting it in the exponential function
|
1157
|
+
// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
|
1158
|
+
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
|
1159
|
+
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
1160
|
+
const int block_size = blockDim.x;
|
1161
|
+
const int tid = threadIdx.x;
|
1162
|
+
|
1163
|
+
float tmp = 0.0;
|
1164
|
+
|
1165
|
+
for (int block_start = 0; block_start < ncols; block_start += block_size) {
|
1166
|
+
const int col = block_start + tid;
|
722
1167
|
|
723
|
-
|
724
|
-
|
725
|
-
|
1168
|
+
if (col >= ncols) {
|
1169
|
+
break;
|
1170
|
+
}
|
1171
|
+
|
1172
|
+
const int i = row*ncols + col;
|
1173
|
+
const float val = expf(x[i]);
|
1174
|
+
tmp += val;
|
1175
|
+
dst[i] = val;
|
726
1176
|
}
|
727
1177
|
|
728
|
-
// sum up partial sums
|
1178
|
+
// sum up partial sums
|
729
1179
|
__syncthreads();
|
730
1180
|
#pragma unroll
|
731
1181
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
732
1182
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
733
1183
|
}
|
734
1184
|
|
735
|
-
|
736
|
-
|
1185
|
+
for (int block_start = 0; block_start < ncols; block_start += block_size) {
|
1186
|
+
const int col = block_start + tid;
|
1187
|
+
|
1188
|
+
if (col >= ncols) {
|
1189
|
+
break;
|
1190
|
+
}
|
1191
|
+
|
1192
|
+
const int i = row*ncols + col;
|
1193
|
+
dst[i] /= tmp;
|
737
1194
|
}
|
738
1195
|
}
|
739
1196
|
|
740
|
-
static __global__ void
|
741
|
-
const int
|
1197
|
+
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
|
1198
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
742
1199
|
|
743
|
-
if (
|
1200
|
+
if (i >= k) {
|
744
1201
|
return;
|
745
1202
|
}
|
746
1203
|
|
747
|
-
|
748
|
-
const int i = row*ncols + col;
|
749
|
-
|
750
|
-
const float theta = p*powf(theta_scale, col/2);
|
751
|
-
const float sin_theta = sinf(theta);
|
752
|
-
const float cos_theta = cosf(theta);
|
753
|
-
|
754
|
-
const float x0 = x[i + 0];
|
755
|
-
const float x1 = x[i + 1];
|
756
|
-
|
757
|
-
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
758
|
-
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
1204
|
+
dst[i] = scale * x[i];
|
759
1205
|
}
|
760
1206
|
|
761
1207
|
static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
|
@@ -829,75 +1275,91 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
|
|
829
1275
|
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
830
1276
|
}
|
831
1277
|
|
832
|
-
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const
|
1278
|
+
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
833
1279
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
834
|
-
|
1280
|
+
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1281
|
+
const dim3 block_nums(1, block_num_y, 1);
|
835
1282
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
836
1283
|
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
837
|
-
<<<
|
1284
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
838
1285
|
}
|
839
1286
|
|
840
|
-
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const
|
1287
|
+
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
841
1288
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
842
|
-
|
1289
|
+
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1290
|
+
const dim3 block_nums(1, block_num_y, 1);
|
843
1291
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
844
1292
|
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
845
|
-
<<<
|
1293
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
846
1294
|
}
|
847
1295
|
|
848
|
-
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const
|
1296
|
+
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
849
1297
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
850
|
-
|
1298
|
+
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1299
|
+
const dim3 block_nums(1, block_num_y, 1);
|
851
1300
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
852
1301
|
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
853
|
-
<<<
|
1302
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
854
1303
|
}
|
855
1304
|
|
856
|
-
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const
|
1305
|
+
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
857
1306
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
858
|
-
|
1307
|
+
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1308
|
+
const dim3 block_nums(1, block_num_y, 1);
|
859
1309
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
860
1310
|
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
861
|
-
<<<
|
1311
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
862
1312
|
}
|
863
1313
|
|
864
|
-
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const
|
1314
|
+
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
865
1315
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
866
|
-
|
1316
|
+
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1317
|
+
const dim3 block_nums(1, block_num_y, 1);
|
867
1318
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
868
1319
|
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
869
|
-
<<<
|
1320
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
870
1321
|
}
|
871
1322
|
|
872
1323
|
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
873
1324
|
GGML_ASSERT(ncols % QK_K == 0);
|
874
|
-
const int ny = 2;
|
1325
|
+
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
1326
|
+
const int block_num_y = (nrows + ny - 1) / ny;
|
1327
|
+
const dim3 block_nums(1, block_num_y, 1);
|
875
1328
|
const dim3 block_dims(32, ny, 1);
|
876
|
-
|
1329
|
+
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
877
1330
|
}
|
878
1331
|
|
879
1332
|
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
880
1333
|
GGML_ASSERT(ncols % QK_K == 0);
|
881
|
-
const
|
882
|
-
|
1334
|
+
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
1335
|
+
const int block_num_y = (nrows + ny - 1) / ny;
|
1336
|
+
const dim3 block_nums(1, block_num_y, 1);
|
1337
|
+
const dim3 block_dims(32, ny, 1);
|
1338
|
+
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
883
1339
|
}
|
884
1340
|
|
885
1341
|
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
886
1342
|
GGML_ASSERT(ncols % QK_K == 0);
|
887
|
-
const
|
888
|
-
|
1343
|
+
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
1344
|
+
const int block_num_y = (nrows + ny - 1) / ny;
|
1345
|
+
const dim3 block_nums(1, block_num_y, 1);
|
1346
|
+
const dim3 block_dims(32, ny, 1);
|
1347
|
+
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
889
1348
|
}
|
890
1349
|
|
891
1350
|
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
892
1351
|
GGML_ASSERT(ncols % QK_K == 0);
|
893
|
-
const dim3 block_dims(32,
|
894
|
-
|
1352
|
+
const dim3 block_dims(32, 1, 1);
|
1353
|
+
dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
895
1354
|
}
|
896
1355
|
|
897
1356
|
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
898
1357
|
GGML_ASSERT(ncols % QK_K == 0);
|
899
|
-
const
|
900
|
-
|
1358
|
+
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
1359
|
+
const int block_num_y = (nrows + ny - 1) / ny;
|
1360
|
+
const dim3 block_nums(1, block_num_y, 1);
|
1361
|
+
const dim3 block_dims(32, ny, 1);
|
1362
|
+
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
901
1363
|
}
|
902
1364
|
|
903
1365
|
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
@@ -905,12 +1367,13 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
|
|
905
1367
|
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
906
1368
|
}
|
907
1369
|
|
908
|
-
static void convert_mul_mat_vec_f16_cuda(const void * vx, const
|
1370
|
+
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
909
1371
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
910
|
-
|
1372
|
+
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1373
|
+
const dim3 block_nums(1, block_num_y, 1);
|
911
1374
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
912
1375
|
dequantize_mul_mat_vec<1, 1, convert_f16>
|
913
|
-
<<<
|
1376
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
914
1377
|
}
|
915
1378
|
|
916
1379
|
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
@@ -942,6 +1405,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
942
1405
|
}
|
943
1406
|
}
|
944
1407
|
|
1408
|
+
static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
|
1409
|
+
const dim3 block_nums(1, nrows_x, nchannels_x);
|
1410
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
1411
|
+
mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
|
1412
|
+
}
|
1413
|
+
|
1414
|
+
static void ggml_mul_mat_vec_nc_f16_f32_cuda(
|
1415
|
+
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
|
1416
|
+
const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
|
1417
|
+
|
1418
|
+
const dim3 block_nums(1, nrows_x, nchannels_x);
|
1419
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
1420
|
+
mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
|
1421
|
+
(vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
|
1422
|
+
}
|
1423
|
+
|
1424
|
+
static void ggml_cpy_f32_f32_cuda(
|
1425
|
+
const char * cx, char * cdst, const int ne,
|
1426
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
1427
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
1428
|
+
|
1429
|
+
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
1430
|
+
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
1431
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
1432
|
+
}
|
1433
|
+
|
1434
|
+
static void ggml_cpy_f32_f16_cuda(
|
1435
|
+
const char * cx, char * cdst, const int ne,
|
1436
|
+
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
1437
|
+
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
1438
|
+
|
1439
|
+
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
1440
|
+
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
1441
|
+
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
1442
|
+
}
|
1443
|
+
|
1444
|
+
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
|
1445
|
+
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
|
1446
|
+
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
|
1447
|
+
}
|
1448
|
+
|
945
1449
|
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
|
946
1450
|
GGML_ASSERT(nrows % 2 == 0);
|
947
1451
|
const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
@@ -950,6 +1454,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
|
|
950
1454
|
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
|
951
1455
|
}
|
952
1456
|
|
1457
|
+
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
|
1458
|
+
const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
|
1459
|
+
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
|
1460
|
+
const dim3 block_nums(block_num_x, nrows_x, 1);
|
1461
|
+
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
1462
|
+
}
|
1463
|
+
|
1464
|
+
static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
|
1465
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
1466
|
+
const dim3 block_nums(1, nrows_x, 1);
|
1467
|
+
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
|
1468
|
+
}
|
1469
|
+
|
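Note: `soft_max_f32_cuda` above launches one block per row; the per-row result it is expected to produce is the standard softmax. A hedged host-side reference (not part of the package, written with the usual max-subtraction for numerical stability) that can be used to sanity-check the GPU output:

```cuda
#include <algorithm>
#include <cmath>

// reference softmax over one row of ncols floats (host code, verification only)
static void softmax_row_reference(const float * x, float * dst, const int ncols) {
    float max_val = x[0];
    for (int i = 1; i < ncols; ++i) {
        max_val = std::max(max_val, x[i]);
    }
    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        dst[i] = std::exp(x[i] - max_val); // subtract the row max for stability
        sum += dst[i];
    }
    for (int i = 0; i < ncols; ++i) {
        dst[i] /= sum;
    }
}
```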
953
1470
|
// buffer pool for cuda
|
954
1471
|
#define MAX_CUDA_BUFFERS 256
|
955
1472
|
|
@@ -1018,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
|
|
1018
1535
|
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
1019
1536
|
static size_t g_scratch_offset = 0;
|
1020
1537
|
|
1021
|
-
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
|
1022
|
-
#define GGML_CUDA_MAX_EVENTS 64
|
1023
|
-
|
1024
1538
|
static int g_device_count = -1;
|
1025
1539
|
static int g_main_device = 0;
|
1026
1540
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
1027
1541
|
|
1028
1542
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1029
1543
|
|
1030
|
-
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES]
|
1031
|
-
|
1032
|
-
static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
|
1033
|
-
static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
|
1544
|
+
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
1034
1545
|
|
1035
1546
|
void ggml_init_cublas() {
|
1036
1547
|
static bool initialized = false;
|
@@ -1054,15 +1565,8 @@ void ggml_init_cublas() {
|
|
1054
1565
|
for (int id = 0; id < g_device_count; ++id) {
|
1055
1566
|
CUDA_CHECK(cudaSetDevice(id));
|
1056
1567
|
|
1057
|
-
// create
|
1058
|
-
|
1059
|
-
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
|
1060
|
-
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
|
1061
|
-
}
|
1062
|
-
// create events
|
1063
|
-
for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
|
1064
|
-
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
|
1065
|
-
}
|
1568
|
+
// create main stream
|
1569
|
+
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
|
1066
1570
|
|
1067
1571
|
// create cublas handle
|
1068
1572
|
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
|
@@ -1105,6 +1609,9 @@ void * ggml_cuda_host_malloc(size_t size) {
|
|
1105
1609
|
void * ptr = nullptr;
|
1106
1610
|
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
1107
1611
|
if (err != cudaSuccess) {
|
1612
|
+
// The allocation error can be bypassed. A null ptr will be assigned out of this function.
|
1613
|
+
// This can fix the OOM error in WSL.
|
1614
|
+
cudaGetLastError();
|
1108
1615
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
1109
1616
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
1110
1617
|
return nullptr;
|
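Note: because the pinned-memory allocation above is now allowed to fail softly (returning `nullptr` instead of aborting, which works around the OOM behaviour seen under WSL), callers have to be prepared for an unpinned fallback. A hedged sketch of that call pattern — the helper names are hypothetical; only `ggml_cuda_host_malloc`/`ggml_cuda_host_free` are part of the package:

```cuda
#include <cstdlib>
#include "ggml-cuda.h"

// allocate a host staging buffer, preferring pinned memory but tolerating nullptr
static void * example_alloc_staging(size_t n_bytes, bool * is_pinned) {
    void * buf = ggml_cuda_host_malloc(n_bytes);
    *is_pinned = (buf != nullptr);
    if (!*is_pinned) {
        buf = malloc(n_bytes); // unpinned fallback: copies still work, just slower
    }
    return buf;
}

static void example_free_staging(void * buf, bool is_pinned) {
    if (is_pinned) {
        ggml_cuda_host_free(buf);
    } else {
        free(buf);
    }
}
```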
@@ -1117,10 +1624,25 @@ void ggml_cuda_host_free(void * ptr) {
|
|
1117
1624
|
CUDA_CHECK(cudaFreeHost(ptr));
|
1118
1625
|
}
|
1119
1626
|
|
1120
|
-
static cudaError_t
|
1627
|
+
static cudaError_t ggml_cuda_cpy_tensor_2d(
|
1121
1628
|
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
|
1122
1629
|
|
1123
|
-
|
1630
|
+
cudaMemcpyKind kind;
|
1631
|
+
char * src_ptr;
|
1632
|
+
if (src->backend == GGML_BACKEND_CPU) {
|
1633
|
+
kind = cudaMemcpyHostToDevice;
|
1634
|
+
src_ptr = (char *) src->data;
|
1635
|
+
} else if (src->backend == GGML_BACKEND_GPU) {
|
1636
|
+
kind = cudaMemcpyDeviceToDevice;
|
1637
|
+
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
1638
|
+
int id;
|
1639
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
1640
|
+
src_ptr = (char *) extra->data_device[id];
|
1641
|
+
} else {
|
1642
|
+
GGML_ASSERT(false);
|
1643
|
+
}
|
1644
|
+
char * dst_ptr = (char *) dst;
|
1645
|
+
|
1124
1646
|
const int64_t ne0 = src->ne[0];
|
1125
1647
|
const int64_t nb0 = src->nb[0];
|
1126
1648
|
const int64_t nb1 = src->nb[1];
|
@@ -1131,17 +1653,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(
|
|
1131
1653
|
const int64_t bs = ggml_blck_size(type);
|
1132
1654
|
int64_t i1_diff = i1_high - i1_low;
|
1133
1655
|
|
1134
|
-
const
|
1656
|
+
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
|
1135
1657
|
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
1136
|
-
return cudaMemcpyAsync(
|
1658
|
+
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
|
1137
1659
|
} else if (nb0 == ts) {
|
1138
|
-
return cudaMemcpy2DAsync(
|
1660
|
+
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
|
1139
1661
|
} else {
|
1140
1662
|
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
|
1141
1663
|
const void * rx = (const void *) ((const char *) x + i1*nb1);
|
1142
|
-
void * rd = (void *) (
|
1664
|
+
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
|
1143
1665
|
// pretend the row is a matrix with cols=1
|
1144
|
-
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0,
|
1666
|
+
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
|
1145
1667
|
if (r != cudaSuccess) return r;
|
1146
1668
|
}
|
1147
1669
|
return cudaSuccess;
|
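Note: the three branches in `ggml_cuda_cpy_tensor_2d` above pick the cheapest copy that matches the tensor layout — a single `cudaMemcpyAsync` for fully contiguous data, a single `cudaMemcpy2DAsync` when only the rows are padded, and a per-row loop otherwise. A small stand-alone sketch of the middle case (names hypothetical, host-to-device shown):

```cuda
#include <cuda_runtime.h>

// copy n_rows rows of row_bytes each from a pitched source buffer into a
// tightly packed destination with one call
static cudaError_t example_copy_padded_rows(
        void * dst, const void * src, size_t row_bytes, size_t src_pitch,
        size_t n_rows, cudaStream_t stream) {
    // dst pitch == row_bytes, so the rows end up contiguous on the device
    return cudaMemcpy2DAsync(dst, row_bytes, src, src_pitch,
                             row_bytes, n_rows, cudaMemcpyHostToDevice, stream);
}
```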
@@ -1260,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1260
1782
|
const int64_t ne00 = src0->ne[0];
|
1261
1783
|
const int64_t nrows = i01_high - i01_low;
|
1262
1784
|
|
1785
|
+
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
1786
|
+
#ifdef GGML_CUDA_DMMV_F16
|
1787
|
+
size_t ash;
|
1788
|
+
dfloat * src1_dfloat = nullptr; // dfloat == half
|
1789
|
+
|
1790
|
+
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
1791
|
+
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
1792
|
+
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
1793
|
+
|
1794
|
+
if (src1_convert_f16) {
|
1795
|
+
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
1796
|
+
ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
1797
|
+
ne00, 1, sizeof(float), 0, 0,
|
1798
|
+
ne00, 1, sizeof(half), 0, 0, cudaStream_main);
|
1799
|
+
}
|
1800
|
+
#else
|
1801
|
+
dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
|
1802
|
+
#endif // GGML_CUDA_DMMV_F16
|
1803
|
+
|
1263
1804
|
switch (src0->type) {
|
1264
1805
|
case GGML_TYPE_Q4_0:
|
1265
|
-
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i,
|
1806
|
+
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1266
1807
|
break;
|
1267
1808
|
case GGML_TYPE_Q4_1:
|
1268
|
-
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i,
|
1809
|
+
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1269
1810
|
break;
|
1270
1811
|
case GGML_TYPE_Q5_0:
|
1271
|
-
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i,
|
1812
|
+
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1272
1813
|
break;
|
1273
1814
|
case GGML_TYPE_Q5_1:
|
1274
|
-
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i,
|
1815
|
+
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1275
1816
|
break;
|
1276
1817
|
case GGML_TYPE_Q8_0:
|
1277
|
-
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i,
|
1818
|
+
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1278
1819
|
break;
|
1279
1820
|
case GGML_TYPE_Q2_K:
|
1280
1821
|
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
@@ -1292,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1292
1833
|
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1293
1834
|
break;
|
1294
1835
|
case GGML_TYPE_F16:
|
1295
|
-
convert_mul_mat_vec_f16_cuda(src0_ddq_i,
|
1836
|
+
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1296
1837
|
break;
|
1297
1838
|
default:
|
1298
1839
|
GGML_ASSERT(false);
|
@@ -1300,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1300
1841
|
}
|
1301
1842
|
CUDA_CHECK(cudaGetLastError());
|
1302
1843
|
|
1844
|
+
#ifdef GGML_CUDA_DMMV_F16
|
1845
|
+
if (src1_convert_f16) {
|
1846
|
+
ggml_cuda_pool_free(src1_dfloat, ash);
|
1847
|
+
}
|
1848
|
+
#endif // GGML_CUDA_DMMV_F16
|
1849
|
+
|
1303
1850
|
(void) src1;
|
1304
1851
|
(void) dst;
|
1305
1852
|
(void) src0_ddf_i;
|
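Note: the `GGML_CUDA_DMMV_F16` path above converts `src1` to half so the dot-product kernels can use packed half2 arithmetic. A minimal illustration of why that can pay off on GPUs with fast fp16 — the device helpers are hypothetical and assume a compute capability that supports `__hfma2` (5.3 or newer):

```cuda
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// two fused multiply-adds in single precision: two FFMA instructions
__device__ float2 example_fma2_f32(float2 a, float2 b, float2 c) {
    return make_float2(a.x*b.x + c.x, a.y*b.y + c.y);
}

// the same work in half precision: a single HFMA2 instruction on supported GPUs
__device__ half2 example_fma2_f16(half2 a, half2 b, half2 c) {
    return __hfma2(a, b, c);
}
```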
@@ -1377,8 +1924,81 @@ inline void ggml_cuda_op_rope(
|
|
1377
1924
|
(void) i1;
|
1378
1925
|
}
|
1379
1926
|
|
1927
|
+
inline void ggml_cuda_op_diag_mask_inf(
|
1928
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
1929
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
1930
|
+
cudaStream_t & cudaStream_main){
|
1931
|
+
|
1932
|
+
GGML_ASSERT(src0_ddf_i != nullptr);
|
1933
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
1934
|
+
|
1935
|
+
const int64_t ne00 = src0->ne[0];
|
1936
|
+
const int64_t ne01 = src0->ne[1];
|
1937
|
+
const int64_t i01_diff = i01_high - i01_low;
|
1938
|
+
|
1939
|
+
const int n_past = ((int32_t *) src1->data)[0];
|
1940
|
+
|
1941
|
+
// compute
|
1942
|
+
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
|
1943
|
+
CUDA_CHECK(cudaGetLastError());
|
1944
|
+
|
1945
|
+
(void) dst;
|
1946
|
+
(void) src0_ddq_i;
|
1947
|
+
(void) src1_ddf_i;
|
1948
|
+
(void) i02;
|
1949
|
+
(void) i1;
|
1950
|
+
}
|
1951
|
+
|
1952
|
+
inline void ggml_cuda_op_soft_max(
|
1953
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
1954
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
1955
|
+
cudaStream_t & cudaStream_main){
|
1956
|
+
|
1957
|
+
GGML_ASSERT(src0_ddf_i != nullptr);
|
1958
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
1959
|
+
|
1960
|
+
const int64_t ne00 = src0->ne[0];
|
1961
|
+
const int64_t i01_diff = i01_high - i01_low;
|
1962
|
+
|
1963
|
+
// compute
|
1964
|
+
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
1965
|
+
CUDA_CHECK(cudaGetLastError());
|
1966
|
+
|
1967
|
+
(void) src1;
|
1968
|
+
(void) dst;
|
1969
|
+
(void) src0_ddq_i;
|
1970
|
+
(void) src1_ddf_i;
|
1971
|
+
(void) i02;
|
1972
|
+
(void) i1;
|
1973
|
+
}
|
1974
|
+
|
1975
|
+
inline void ggml_cuda_op_scale(
|
1976
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
1977
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
1978
|
+
cudaStream_t & cudaStream_main){
|
1979
|
+
|
1980
|
+
GGML_ASSERT(src0_ddf_i != nullptr);
|
1981
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
1982
|
+
|
1983
|
+
const float scale = ((float *) src1->data)[0];
|
1984
|
+
|
1985
|
+
const int64_t ne00 = src0->ne[0];
|
1986
|
+
const int64_t i01_diff = i01_high - i01_low;
|
1987
|
+
|
1988
|
+
// compute
|
1989
|
+
scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
|
1990
|
+
CUDA_CHECK(cudaGetLastError());
|
1991
|
+
|
1992
|
+
(void) src1;
|
1993
|
+
(void) dst;
|
1994
|
+
(void) src0_ddq_i;
|
1995
|
+
(void) src1_ddf_i;
|
1996
|
+
(void) i02;
|
1997
|
+
(void) i1;
|
1998
|
+
}
|
1999
|
+
|
1380
2000
|
static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
1381
|
-
ggml_cuda_op_t op, bool src0_needs_f32) {
|
2001
|
+
ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
|
1382
2002
|
const int64_t ne00 = src0->ne[0];
|
1383
2003
|
const int64_t ne01 = src0->ne[1];
|
1384
2004
|
const int64_t ne02 = src0->ne[2];
|
@@ -1401,21 +2021,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1401
2021
|
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
1402
2022
|
|
1403
2023
|
// strides for iteration over dims 3 and 2
|
1404
|
-
const int64_t
|
1405
|
-
const int64_t
|
1406
|
-
const int64_t
|
1407
|
-
const int64_t
|
2024
|
+
const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
|
2025
|
+
const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
|
2026
|
+
const int64_t src0_stride = ne00 * ne01 * stride_mod;
|
2027
|
+
const int64_t src1_stride = ne10 * ne11 * stride_mod;
|
2028
|
+
const int64_t dst_stride = ne0 * ne1 * stride_mod;
|
1408
2029
|
|
1409
2030
|
const size_t src0_ts = ggml_type_size(src0->type);
|
1410
2031
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
1411
2032
|
|
1412
|
-
struct ggml_tensor_extra_gpu * src0_extra =
|
2033
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
1413
2034
|
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
1414
|
-
struct ggml_tensor_extra_gpu * dst_extra
|
2035
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
1415
2036
|
|
1416
2037
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
2038
|
+
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
1417
2039
|
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
|
1418
2040
|
|
2041
|
+
const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
|
2042
|
+
const bool src1_stays_on_host = use_src1 && (
|
2043
|
+
dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
|
2044
|
+
|
1419
2045
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
1420
2046
|
|
1421
2047
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
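Note: the new `flatten_rows` flag collapses dimensions 2 and 3 into the row dimension so that element-wise operators make a single pass over the tensor instead of iterating once per 2D slice. A tiny stand-alone program (with a hypothetical shape) that reproduces the bookkeeping above:

```cuda
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical tensor shape: ne00 x ne01 per slice, ne02*ne03 slices
    const int64_t ne00 = 8, ne01 = 4, ne02 = 3, ne03 = 2;
    for (int flatten_rows = 0; flatten_rows <= 1; ++flatten_rows) {
        const int64_t num_iters  = flatten_rows ? 1 : ne02*ne03;
        const int64_t stride_mod = flatten_rows ? ne02*ne03 : 1;
        printf("flatten_rows=%d -> num_iters=%lld, src0_stride=%lld elements\n",
               flatten_rows, (long long) num_iters,
               (long long) (ne00*ne01*stride_mod));
    }
    return 0; // prints 6 iterations of 32 elements vs. 1 iteration of 192 elements
}
```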
@@ -1424,13 +2050,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1424
2050
|
char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
|
1425
2051
|
float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
|
1426
2052
|
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1427
|
-
float *
|
2053
|
+
float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1428
2054
|
|
1429
2055
|
// asq = actual size quantized, asf = actual size float
|
1430
2056
|
size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
1431
2057
|
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
1432
2058
|
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
1433
|
-
size_t
|
2059
|
+
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
2060
|
+
|
2061
|
+
// if multiple GPUs are used they need to wait for the main GPU to finish
|
2062
|
+
if (split && g_device_count > 1) {
|
2063
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2064
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
2065
|
+
}
|
1434
2066
|
|
1435
2067
|
for (int id = 0; id < g_device_count; ++id) {
|
1436
2068
|
if (!split && id != g_main_device) {
|
@@ -1443,9 +2075,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1443
2075
|
int64_t row_low, row_high;
|
1444
2076
|
if (split) {
|
1445
2077
|
row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
|
1446
|
-
row_low -= row_low % GGML_CUDA_DMMV_Y;
|
1447
2078
|
row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
|
1448
|
-
row_high -= row_high % GGML_CUDA_DMMV_Y;
|
1449
2079
|
} else {
|
1450
2080
|
row_low = 0;
|
1451
2081
|
row_high = nrows0;
|
@@ -1458,7 +2088,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1458
2088
|
|
1459
2089
|
cudaSetDevice(id);
|
1460
2090
|
|
1461
|
-
if (src0_on_device) {
|
2091
|
+
if (src0_on_device && src0_is_contiguous) {
|
1462
2092
|
if (src0_is_f32) {
|
1463
2093
|
src0_ddf[id] = (float *) src0_extra->data_device[id];
|
1464
2094
|
} else {
|
@@ -1476,8 +2106,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1476
2106
|
src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
|
1477
2107
|
}
|
1478
2108
|
|
1479
|
-
if (use_src1) {
|
1480
|
-
if (src1_on_device) {
|
2109
|
+
if (use_src1 && !src1_stays_on_host) {
|
2110
|
+
if (src1_on_device && src1_is_contiguous) {
|
1481
2111
|
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
1482
2112
|
} else {
|
1483
2113
|
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
|
@@ -1490,26 +2120,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1490
2120
|
dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
|
1491
2121
|
}
|
1492
2122
|
|
1493
|
-
|
2123
|
+
const int64_t i03_max = flatten_rows ? 1 : ne03;
|
2124
|
+
const int64_t i02_max = flatten_rows ? 1 : ne02;
|
2125
|
+
const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
|
2126
|
+
|
2127
|
+
for (int64_t i03 = 0; i03 < i03_max; i03++) {
|
1494
2128
|
const int64_t i13 = i03 % ne13;
|
1495
|
-
for (int64_t i02 = 0; i02 <
|
2129
|
+
for (int64_t i02 = 0; i02 < i02_max; i02++) {
|
1496
2130
|
const int64_t i12 = i02 % ne12;
|
1497
2131
|
|
1498
2132
|
const int64_t i0 = i03*ne02 + i02;
|
1499
|
-
|
1500
|
-
|
2133
|
+
|
2134
|
+
// i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
|
2135
|
+
const int64_t i0_offset_low = row_low/rows_per_iter;
|
2136
|
+
const int64_t i0_offset_high = row_high/rows_per_iter;
|
1501
2137
|
|
1502
2138
|
int64_t i01_low = 0;
|
1503
|
-
int64_t i01_high =
|
2139
|
+
int64_t i01_high = rows_per_iter;
|
1504
2140
|
if (split) {
|
1505
2141
|
if (i0 < i0_offset_low || i0 > i0_offset_high) {
|
1506
2142
|
continue;
|
1507
2143
|
}
|
1508
2144
|
if (i0 == i0_offset_low) {
|
1509
|
-
i01_low = row_low %
|
2145
|
+
i01_low = row_low % rows_per_iter;
|
1510
2146
|
}
|
1511
2147
|
if (i0 == i0_offset_high) {
|
1512
|
-
i01_high = row_high %
|
2148
|
+
i01_high = row_high % rows_per_iter;
|
1513
2149
|
}
|
1514
2150
|
}
|
1515
2151
|
|
@@ -1518,7 +2154,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1518
2154
|
// Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
|
1519
2155
|
// The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
|
1520
2156
|
GGML_ASSERT(i01_low == 0 || g_device_count > 1);
|
1521
|
-
GGML_ASSERT(i01_high ==
|
2157
|
+
GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
|
1522
2158
|
|
1523
2159
|
const int64_t i01_diff = i01_high - i01_low;
|
1524
2160
|
if (i01_diff == 0) {
|
@@ -1526,24 +2162,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1526
2162
|
}
|
1527
2163
|
const int64_t i11 = i13*ne12 + i12;
|
1528
2164
|
|
1529
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id]
|
1530
|
-
cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
|
1531
|
-
cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
|
2165
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
1532
2166
|
|
1533
2167
|
// for split tensors the data begins at i0 == i0_offset_low
|
1534
2168
|
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
|
1535
2169
|
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
|
1536
2170
|
float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
|
1537
|
-
float * dst_ddf_i
|
2171
|
+
float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
|
1538
2172
|
|
1539
2173
|
// for split tensors the data pointer needs to be rounded down
|
1540
2174
|
// to the bin edge for i03, i02 bins beyond the first
|
1541
2175
|
if (i0 - i0_offset_low > 0) {
|
2176
|
+
GGML_ASSERT(!flatten_rows);
|
1542
2177
|
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
|
1543
2178
|
src0_ddf_i -= (row_low % ne01)*ne00;
|
1544
|
-
|
1545
|
-
if (i0 - i0_offset_low > 0) {
|
1546
|
-
dst_ddf_i -= (row_low % ne0)*ne1;
|
2179
|
+
dst_ddf_i -= (row_low % ne0)*ne1;
|
1547
2180
|
}
|
1548
2181
|
|
1549
2182
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
@@ -1553,38 +2186,41 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1553
2186
|
}
|
1554
2187
|
|
1555
2188
|
// copy src0, src1 to device if necessary
|
1556
|
-
if (use_src1) {
|
2189
|
+
if (use_src1 && !src1_stays_on_host) {
|
1557
2190
|
if (src1->backend == GGML_BACKEND_CPU) {
|
1558
|
-
|
1559
|
-
|
2191
|
+
GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
|
2192
|
+
int64_t nrows1 = flatten_rows ? nrows0 : ne11;
|
2193
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
|
2194
|
+
} else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
1560
2195
|
if (id != g_main_device) {
|
2196
|
+
GGML_ASSERT(!flatten_rows);
|
1561
2197
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
1562
2198
|
src1_ddf_i_source += i11*src1_stride;
|
1563
2199
|
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
|
1564
|
-
cudaMemcpyDeviceToDevice,
|
2200
|
+
cudaMemcpyDeviceToDevice, cudaStream_main));
|
1565
2201
|
}
|
2202
|
+
} else if (src1_on_device && !src1_is_contiguous) {
|
2203
|
+
GGML_ASSERT(!split);
|
2204
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
|
1566
2205
|
} else {
|
1567
2206
|
GGML_ASSERT(false);
|
1568
2207
|
}
|
1569
2208
|
}
|
1570
|
-
|
1571
|
-
if (!src0_on_device) {
|
2209
|
+
|
2210
|
+
if (!src0_on_device || !src0_is_contiguous) {
|
1572
2211
|
if (src0_is_f32) {
|
1573
|
-
CUDA_CHECK(
|
2212
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
|
1574
2213
|
} else {
|
1575
|
-
CUDA_CHECK(
|
2214
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
|
1576
2215
|
}
|
1577
2216
|
}
|
1578
2217
|
|
1579
|
-
// convert src0 to f32 if it
|
2218
|
+
// convert src0 to f32 if it is necessary for the ggml_cuda_op
|
1580
2219
|
if (src0_needs_f32 && !src0_is_f32) {
|
1581
2220
|
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
|
1582
2221
|
CUDA_CHECK(cudaGetLastError());
|
1583
2222
|
}
|
1584
2223
|
|
1585
|
-
// wait with main stream until src1 memcpy is done
|
1586
|
-
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
|
1587
|
-
|
1588
2224
|
// do the computation
|
1589
2225
|
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
|
1590
2226
|
|
@@ -1622,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1622
2258
|
|
1623
2259
|
// wait until each device is finished, then free their buffers
|
1624
2260
|
for (int id = 0; id < g_device_count; ++id) {
|
2261
|
+
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
|
2262
|
+
continue;
|
2263
|
+
}
|
2264
|
+
|
1625
2265
|
CUDA_CHECK(cudaSetDevice(id));
|
1626
2266
|
CUDA_CHECK(cudaDeviceSynchronize());
|
2267
|
+
|
1627
2268
|
if (src0_asq[id] > 0) {
|
1628
2269
|
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
|
1629
2270
|
}
|
@@ -1641,39 +2282,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1641
2282
|
|
1642
2283
|
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1643
2284
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1644
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
|
2285
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
|
1645
2286
|
}
|
1646
2287
|
|
1647
2288
|
void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1648
2289
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1649
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
|
2290
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
|
1650
2291
|
}
|
1651
2292
|
|
1652
2293
|
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1653
2294
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1654
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
|
2295
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
|
1655
2296
|
}
|
1656
2297
|
|
1657
2298
|
void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1658
2299
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1659
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
|
2300
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
|
1660
2301
|
}
|
1661
2302
|
|
1662
2303
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
1663
|
-
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
|
1664
2304
|
const int64_t ne10 = src1->ne[0];
|
1665
2305
|
|
1666
2306
|
const int64_t ne0 = dst->ne[0];
|
1667
2307
|
const int64_t ne1 = dst->ne[1];
|
1668
2308
|
|
1669
|
-
// if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
|
1670
|
-
// fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
|
1671
|
-
// src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
1672
|
-
// src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
1673
|
-
// dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
|
1674
|
-
// return false;
|
1675
|
-
// }
|
1676
|
-
|
1677
2309
|
// TODO: find the optimal values for these
|
1678
2310
|
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
1679
2311
|
src1->type == GGML_TYPE_F32 &&
|
@@ -1685,23 +2317,152 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
|
|
1685
2317
|
return false;
|
1686
2318
|
}
|
1687
2319
|
|
2320
|
+
void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
2321
|
+
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
2322
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
2323
|
+
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
2324
|
+
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
2325
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
2326
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
2327
|
+
|
2328
|
+
const int64_t ne00 = src0->ne[0];
|
2329
|
+
const int64_t ne01 = src0->ne[1];
|
2330
|
+
const int64_t ne02 = src0->ne[2];
|
2331
|
+
|
2332
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2333
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2334
|
+
|
2335
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2336
|
+
void * src0_ddq = src0_extra->data_device[g_main_device];
|
2337
|
+
|
2338
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
2339
|
+
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
2340
|
+
|
2341
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
2342
|
+
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
2343
|
+
|
2344
|
+
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
|
2345
|
+
}
|
2346
|
+
|
2347
|
+
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
2348
|
+
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
|
2349
|
+
GGML_ASSERT(!ggml_is_permuted(src0));
|
2350
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
2351
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
2352
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
2353
|
+
|
2354
|
+
const int64_t ne00 = src0->ne[0];
|
2355
|
+
const int64_t ne01 = src0->ne[1];
|
2356
|
+
const int64_t ne02 = src0->ne[2];
|
2357
|
+
|
2358
|
+
const int64_t nb01 = src0->nb[1];
|
2359
|
+
const int64_t nb02 = src0->nb[2];
|
2360
|
+
|
2361
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2362
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2363
|
+
|
2364
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2365
|
+
void * src0_ddq = src0_extra->data_device[g_main_device];
|
2366
|
+
|
2367
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
2368
|
+
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
2369
|
+
|
2370
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
2371
|
+
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
2372
|
+
|
2373
|
+
const int row_stride_x = nb01 / sizeof(half);
|
2374
|
+
const int channel_stride_x = nb02 / sizeof(half);
|
2375
|
+
|
2376
|
+
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
|
2377
|
+
}
|
2378
|
+
|
1688
2379
|
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1689
|
-
|
1690
|
-
|
2380
|
+
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
2381
|
+
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
|
2382
|
+
|
2383
|
+
if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
2384
|
+
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
2385
|
+
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
2386
|
+
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
2387
|
+
} else if (src0->type == GGML_TYPE_F32) {
|
2388
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
1691
2389
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
1692
|
-
if (src1->ne[1] == 1) {
|
1693
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
2390
|
+
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
|
2391
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
|
1694
2392
|
} else {
|
1695
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
|
2393
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
1696
2394
|
}
|
1697
2395
|
} else {
|
1698
2396
|
GGML_ASSERT(false);
|
1699
2397
|
}
|
1700
2398
|
}
|
1701
2399
|
|
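Note: condensed, the dispatch in `ggml_cuda_mul_mat` above boils down to a handful of checks. The helper below only restates those conditions for readability and is not part of the package; the final branch additionally requires `src0` to be quantized or f16 (otherwise the real function asserts), and `GGML_CUDA_DMMV_X`/`GGML_CUDA_DMMV_Y` are the constants defined earlier in this file:

```cuda
#include "ggml.h"

enum class example_mulmat_path { vec_p021, vec_nc, cublas, dmmv };

// reference-only restatement of the branch order in ggml_cuda_mul_mat
static example_mulmat_path example_choose_path(
        const ggml_tensor * src0, const ggml_tensor * src1, bool all_on_device) {
    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        return example_mulmat_path::vec_p021;   // permuted f16 attention case
    }
    if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
        return example_mulmat_path::vec_nc;     // non-contiguous f16, single column
    }
    if (src0->type == GGML_TYPE_F32) {
        return example_mulmat_path::cublas;
    }
    if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
        return example_mulmat_path::dmmv;       // dequantize + mat-vec kernels
    }
    return example_mulmat_path::cublas;         // dequantize src0 to f32, then cuBLAS
}
```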
2400
|
+
void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2401
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2402
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2406
|
+
const int64_t ne = ggml_nelements(src0);
|
2407
|
+
GGML_ASSERT(ne == ggml_nelements(src1));
|
2408
|
+
|
2409
|
+
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
2410
|
+
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
2411
|
+
|
2412
|
+
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
2413
|
+
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
2414
|
+
|
2415
|
+
const int64_t ne00 = src0->ne[0];
|
2416
|
+
const int64_t ne01 = src0->ne[1];
|
2417
|
+
GGML_ASSERT(src0->ne[3] == 1);
|
2418
|
+
|
2419
|
+
const int64_t nb00 = src0->nb[0];
|
2420
|
+
const int64_t nb01 = src0->nb[1];
|
2421
|
+
const int64_t nb02 = src0->nb[2];
|
2422
|
+
|
2423
|
+
const int64_t ne10 = src1->ne[0];
|
2424
|
+
const int64_t ne11 = src1->ne[1];
|
2425
|
+
GGML_ASSERT(src1->ne[3] == 1);
|
2426
|
+
|
2427
|
+
const int64_t nb10 = src1->nb[0];
|
2428
|
+
const int64_t nb11 = src1->nb[1];
|
2429
|
+
const int64_t nb12 = src1->nb[2];
|
2430
|
+
|
2431
|
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2432
|
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2433
|
+
|
2434
|
+
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2435
|
+
const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
2436
|
+
|
2437
|
+
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
2438
|
+
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
2439
|
+
|
2440
|
+
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
2441
|
+
ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
2442
|
+
ne10, ne11, nb10, nb11, nb12, cudaStream_main);
|
2443
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
2444
|
+
ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
2445
|
+
ne10, ne11, nb10, nb11, nb12, cudaStream_main);
|
2446
|
+
} else {
|
2447
|
+
GGML_ASSERT(false);
|
2448
|
+
}
|
2449
|
+
|
2450
|
+
(void) dst;
|
2451
|
+
}
|
2452
|
+
|
2453
|
+
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2454
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2455
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
|
2456
|
+
}
|
2457
|
+
|
2458
|
+
void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2459
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
2460
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
|
2461
|
+
}
|
2462
|
+
|
1702
2463
|
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
1703
2464
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
1704
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
|
2465
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
|
1705
2466
|
}
|
1706
2467
|
|
1707
2468
|
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
@@ -1710,16 +2471,14 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
|
1710
2471
|
(void) dst;
|
1711
2472
|
}
|
1712
2473
|
|
1713
|
-
void
|
1714
|
-
FILE * fp = fopen(fname, "rb");
|
2474
|
+
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
1715
2475
|
int nrows = ggml_nrows(tensor);
|
1716
2476
|
const size_t nb1 = tensor->nb[1];
|
1717
2477
|
ggml_backend backend = tensor->backend;
|
1718
2478
|
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
2479
|
+
memset(extra, 0, sizeof(*extra));
|
1719
2480
|
|
1720
2481
|
for (int id = 0; id < g_device_count; ++id) {
|
1721
|
-
extra->data_device[id] = nullptr;
|
1722
|
-
|
1723
2482
|
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
|
1724
2483
|
continue;
|
1725
2484
|
}
|
@@ -1732,10 +2491,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
|
|
1732
2491
|
row_high = nrows;
|
1733
2492
|
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
1734
2493
|
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
1735
|
-
row_low -= row_low % GGML_CUDA_DMMV_Y;
|
1736
2494
|
row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
|
1737
|
-
row_high -= row_high % GGML_CUDA_DMMV_Y;
|
1738
|
-
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
1739
2495
|
} else {
|
1740
2496
|
GGML_ASSERT(false);
|
1741
2497
|
}
|
@@ -1745,35 +2501,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
|
|
1745
2501
|
|
1746
2502
|
int64_t nrows_split = row_high - row_low;
|
1747
2503
|
|
1748
|
-
const size_t offset_split =
|
2504
|
+
const size_t offset_split = row_low*nb1;
|
1749
2505
|
const size_t size = ggml_nbytes_split(tensor, nrows_split);
|
1750
2506
|
|
1751
2507
|
void * buf;
|
1752
2508
|
CUDA_CHECK(cudaMalloc(&buf, size));
|
1753
|
-
void * buf_host =
|
1754
|
-
|
1755
|
-
#ifdef _WIN32
|
1756
|
-
int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
|
1757
|
-
#else
|
1758
|
-
int ret = fseek(fp, (long) offset_split, SEEK_SET);
|
1759
|
-
#endif
|
1760
|
-
GGML_ASSERT(ret == 0); // same
|
1761
|
-
|
1762
|
-
size_t ret2 = fread(buf_host, size, 1, fp);
|
1763
|
-
if (ret2 != 1) {
|
1764
|
-
fprintf(stderr, "unexpectedly reached end of file");
|
1765
|
-
exit(1);
|
1766
|
-
}
|
2509
|
+
void * buf_host = (char*)data + offset_split;
|
1767
2510
|
|
1768
2511
|
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
|
1769
|
-
cudaDeviceSynchronize();
|
1770
2512
|
|
1771
|
-
free(buf_host);
|
1772
2513
|
extra->data_device[id] = buf;
|
1773
2514
|
}
|
1774
2515
|
|
1775
2516
|
tensor->extra = extra;
|
1776
|
-
fclose(fp);
|
1777
2517
|
}
|
1778
2518
|
|
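Note: with this change `ggml_cuda_transform_tensor` no longer opens the model file itself; the caller reads (or mmaps) the tensor bytes and passes a host pointer. A hedged usage sketch — the surrounding loading code and function name are hypothetical, only the transform call is the package's API:

```cuda
#include "ggml.h"
#include "ggml-cuda.h"

// upload one tensor whose bytes have already been read into host_data
static void example_upload_tensor(struct ggml_tensor * t, void * host_data) {
    // the backend must be set before the call; GGML_BACKEND_GPU_SPLIT would
    // instead distribute the rows across all devices per g_tensor_split
    t->backend = GGML_BACKEND_GPU;
    ggml_cuda_transform_tensor(host_data, t); // copies to VRAM and fills t->extra
}
```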
1779
2519
|
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
@@ -1795,47 +2535,78 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
|
1795
2535
|
delete extra;
|
1796
2536
|
}
|
1797
2537
|
|
1798
|
-
void
|
1799
|
-
if (
|
1800
|
-
|
2538
|
+
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
|
2539
|
+
if (scratch && g_scratch_size == 0) {
|
2540
|
+
return;
|
1801
2541
|
}
|
1802
2542
|
|
1803
|
-
|
1804
|
-
|
1805
|
-
|
1806
|
-
|
2543
|
+
// recursively assign CUDA buffers until a compute tensor is found
|
2544
|
+
if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
|
2545
|
+
const ggml_op src0_op = tensor->src0->op;
|
2546
|
+
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
|
2547
|
+
ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
|
2548
|
+
}
|
2549
|
+
}
|
2550
|
+
if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
|
2551
|
+
ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
|
1807
2552
|
}
|
1808
2553
|
|
1809
2554
|
tensor->backend = GGML_BACKEND_GPU;
|
1810
2555
|
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
1811
2556
|
|
1812
|
-
bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data
|
2557
|
+
const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
|
2558
|
+
tensor->op == GGML_OP_VIEW;
|
2559
|
+
const size_t size = ggml_nbytes(tensor);
|
1813
2560
|
|
1814
2561
|
CUDA_CHECK(cudaSetDevice(g_main_device));
|
1815
2562
|
if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
|
1816
2563
|
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
|
1817
|
-
|
1818
|
-
|
1819
|
-
|
2564
|
+
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
2565
|
+
size_t offset = 0;
|
2566
|
+
if (tensor->op == GGML_OP_VIEW) {
|
2567
|
+
memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
|
2568
|
+
}
|
2569
|
+
extra->data_device[g_main_device] = src0_ddc + offset;
|
2570
|
+
} else if (tensor->op == GGML_OP_CPY) {
|
2571
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
|
2572
|
+
void * src1_ddv = src1_extra->data_device[g_main_device];
|
2573
|
+
extra->data_device[g_main_device] = src1_ddv;
|
2574
|
+
} else if (scratch) {
|
2575
|
+
GGML_ASSERT(size <= g_scratch_size);
|
2576
|
+
if (g_scratch_offset + size > g_scratch_size) {
|
2577
|
+
g_scratch_offset = 0;
|
2578
|
+
}
|
2579
|
+
|
1820
2580
|
char * data = (char *) g_scratch_buffer;
|
1821
2581
|
if (data == nullptr) {
|
1822
2582
|
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
1823
2583
|
g_scratch_buffer = data;
|
1824
2584
|
}
|
1825
2585
|
extra->data_device[g_main_device] = data + g_scratch_offset;
|
1826
|
-
}
|
1827
2586
|
|
1828
|
-
|
1829
|
-
|
1830
|
-
|
1831
|
-
|
2587
|
+
g_scratch_offset += size;
|
2588
|
+
|
2589
|
+
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
2590
|
+
} else { // allocate new buffers outside of scratch
|
2591
|
+
void * data;
|
2592
|
+
CUDA_CHECK(cudaMalloc(&data, size));
|
2593
|
+
CUDA_CHECK(cudaMemset(data, 0, size));
|
2594
|
+
extra->data_device[g_main_device] = data;
|
2595
|
+
}
|
1832
2596
|
|
1833
|
-
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
1834
2597
|
tensor->extra = extra;
|
1835
2598
|
}
|
1836
2599
|
|
2600
|
+
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
2601
|
+
ggml_cuda_assign_buffers_impl(tensor, true);
|
2602
|
+
}
|
2603
|
+
|
2604
|
+
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
2605
|
+
ggml_cuda_assign_buffers_impl(tensor, false);
|
2606
|
+
}
|
2607
|
+
|
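Note: a hedged sketch of how these new entry points appear intended to be combined, assuming they are exported through ggml-cuda.h as the header diff suggests: intermediate results of the compute graph go through the shared scratch buffer (which wraps around when full), while tensors that must keep their contents get dedicated allocations, and the scratch VRAM can be released afterwards. Tensor names and the scratch size are illustrative only:

```cuda
#include "ggml.h"
#include "ggml-cuda.h"

static void example_offload(struct ggml_tensor * intermediate, struct ggml_tensor * persistent) {
    ggml_cuda_set_scratch_size(512u*1024*1024);       // shared scratch, reused between tensors
    ggml_cuda_assign_buffers(intermediate);           // lives in scratch, may be overwritten later
    ggml_cuda_assign_buffers_no_scratch(persistent);  // dedicated cudaMalloc allocation
    // ... build and evaluate the graph ...
    ggml_cuda_free_scratch();                         // give the scratch VRAM back
}
```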
1837
2608
|
void ggml_cuda_set_main_device(int main_device) {
|
1838
|
-
if (main_device
|
2609
|
+
if (main_device >= g_device_count) {
|
1839
2610
|
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
1840
2611
|
main_device, g_device_count, g_main_device);
|
1841
2612
|
return;
|
@@ -1852,6 +2623,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
|
|
1852
2623
|
g_scratch_size = scratch_size;
|
1853
2624
|
}
|
1854
2625
|
|
2626
|
+
void ggml_cuda_free_scratch() {
|
2627
|
+
if (g_scratch_buffer == nullptr) {
|
2628
|
+
return;
|
2629
|
+
}
|
2630
|
+
|
2631
|
+
CUDA_CHECK(cudaFree(g_scratch_buffer));
|
2632
|
+
g_scratch_buffer = nullptr;
|
2633
|
+
}
|
2634
|
+
|
1855
2635
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
1856
2636
|
ggml_cuda_func_t func;
|
1857
2637
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
@@ -1889,12 +2669,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
1889
2669
|
}
|
1890
2670
|
func = ggml_cuda_mul_mat;
|
1891
2671
|
break;
|
2672
|
+
case GGML_OP_SCALE:
|
2673
|
+
if (!any_on_device) {
|
2674
|
+
return false;
|
2675
|
+
}
|
2676
|
+
func = ggml_cuda_scale;
|
2677
|
+
break;
|
2678
|
+
case GGML_OP_CPY:
|
2679
|
+
if (!any_on_device) {
|
2680
|
+
return false;
|
2681
|
+
}
|
2682
|
+
func = ggml_cuda_cpy;
|
2683
|
+
break;
|
1892
2684
|
case GGML_OP_RESHAPE:
|
2685
|
+
case GGML_OP_VIEW:
|
2686
|
+
case GGML_OP_PERMUTE:
|
2687
|
+
case GGML_OP_TRANSPOSE:
|
1893
2688
|
if (!any_on_device) {
|
1894
2689
|
return false;
|
1895
2690
|
}
|
1896
2691
|
func = ggml_cuda_nop;
|
1897
2692
|
break;
|
2693
|
+
case GGML_OP_DIAG_MASK_INF:
|
2694
|
+
if (!any_on_device) {
|
2695
|
+
return false;
|
2696
|
+
}
|
2697
|
+
func = ggml_cuda_diag_mask_inf;
|
2698
|
+
break;
|
2699
|
+
case GGML_OP_SOFT_MAX:
|
2700
|
+
if (!any_on_device) {
|
2701
|
+
return false;
|
2702
|
+
}
|
2703
|
+
func = ggml_cuda_soft_max;
|
2704
|
+
break;
|
1898
2705
|
case GGML_OP_ROPE:
|
1899
2706
|
if (!any_on_device) {
|
1900
2707
|
return false;
|