llama_cpp 0.3.2 → 0.3.4
This diff shows the changes between publicly available package versions as they were released to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
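All hunks reproduced below come from data/ext/llama_cpp/src/ggml-cuda.cu, by far the largest change in this release: it adds __restrict__ qualifiers, GELU and norm kernels, GLM-style RoPE, and new __dp4a-based vec_dot kernels for the k-quant formats. As a rough, standalone illustration (not part of the diff; the helper name and layout here are hypothetical), the sketch below shows the byte-wise dot-product pattern the new vec_dot_*_q8_1 functions build on: __dp4a treats each 32-bit int as four packed signed 8-bit lanes, multiplies them lane-wise, and adds the result to an accumulator, and it is only available from compute capability 6.1, which is why the diff introduces MIN_CC_DP4A = 610.

    // Hypothetical helper, not from the diff: dot product of n int8 values.
    // Assumes a and b are 4-byte aligned and n is a multiple of 4 on the fast path.
    __device__ int dot_bytes(const int8_t * a, const int8_t * b, int n) {
        int sum = 0;
    #if __CUDA_ARCH__ >= 610 // __dp4a requires compute capability 6.1+
        for (int i = 0; i < n; i += 4) {
            const int va = *reinterpret_cast<const int *>(a + i); // 4 packed int8 lanes
            const int vb = *reinterpret_cast<const int *>(b + i);
            sum = __dp4a(va, vb, sum); // sum += a[i]*b[i] + ... + a[i+3]*b[i+3]
        }
    #else
        for (int i = 0; i < n; ++i) {
            sum += a[i] * b[i]; // scalar fallback for older devices
        }
    #endif
        return sum;
    }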
@@ -13,6 +13,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -59,8 +61,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(
 
 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half d; // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d; // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half d; // delta
     int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 
 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d; // delta
     half s; // unquantized sum
@@ -131,7 +133,7 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
 
-typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
 
 //================================= k-quants
 
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, co
 #define K_SCALE_SIZE 12
 #endif
 
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
 
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif
 
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
 
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2]; // quants, lower 4 bits
     uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,9 +220,11 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -238,13 +252,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >=
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }
 
 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -265,6 +279,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }
 
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -274,16 +301,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-
+    const float eps = 1e-6f;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
@@ -295,10 +352,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     const float mean = tmp / ncols;
-    const float scale =
+    const float scale = rsqrtf(mean + eps);
 
-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
@@ -407,7 +463,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 
 //================================== k-quants
 
-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +496,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
 
 }
 
-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +560,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif
 
-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;
 
     const int i = blockIdx.x;
@@ -544,7 +600,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;
 
     const int i = blockIdx.x;
@@ -590,7 +646,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = blockIdx.x;
@@ -634,7 +690,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -742,7 +798,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -846,7 +902,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -949,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
 
     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -1171,7 +1227,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -1180,10 +1236,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i /
-    const int iqs = i %
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
 
-    const float xi = x[i];
+    const float xi = i < ndata ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
@@ -1207,7 +1263,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
     if (i >= k) {
@@ -1227,8 +1283,9 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1249,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1274,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;
@@ -1309,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1362,11 +1423,224 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
@@ -1385,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
 
-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
 
         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
 
@@ -1404,7 +1678,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1745,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1792,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 }
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {
 
     const half * x = (const half *) vx;
@@ -1623,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1688,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int
-    const int num_blocks = (
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst,
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1703,20 +2011,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1873,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1882,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1891,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1900,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1909,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1917,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2009,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2051,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.
-
-
-            b.
-            b.size =
-
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
         }
     }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     void * ptr;
-
-
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
     return ptr;
 }
 
@@ -2140,6 +2545,9 @@ void ggml_init_cublas() {
 }
 
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
@@ -2236,16 +2644,19 @@ inline void ggml_cuda_op_add(
 
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);
 
-    const int64_t
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i,
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i,
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2264,27 +2675,41 @@ inline void ggml_cuda_op_mul(
 
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
-
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 
-
-
-
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) i02;
+}
 
-
-
-
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
 
+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
+    (void) src1_ddf_i;
     (void) i02;
+    (void) i1;
 }
 
 inline void ggml_cuda_op_silu(
@@ -2309,6 +2734,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }
 
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2349,22 +2796,30 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-
+    bool mul_mat_vec_q_implemented =
+        src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
-
-
-
-
-
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
+
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
|
|
2382
2837
|
case GGML_TYPE_Q8_0:
|
2383
2838
|
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2384
2839
|
break;
|
2840
|
+
case GGML_TYPE_Q2_K:
|
2841
|
+
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2842
|
+
break;
|
2843
|
+
case GGML_TYPE_Q3_K:
|
2844
|
+
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2845
|
+
break;
|
2846
|
+
case GGML_TYPE_Q4_K:
|
2847
|
+
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2848
|
+
break;
|
2849
|
+
case GGML_TYPE_Q5_K:
|
2850
|
+
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2851
|
+
break;
|
2852
|
+
case GGML_TYPE_Q6_K:
|
2853
|
+
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2854
|
+
break;
|
2385
2855
|
default:
|
2386
2856
|
GGML_ASSERT(false);
|
2387
2857
|
break;
|
@@ -2516,13 +2986,26 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
-
+    const int n_ctx = ((int32_t *) src1->data)[3];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale;
+    memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;

-
-    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+    bool is_glm = mode & 4;

     // compute
-
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }

     (void) dst;
     (void) src0_ddq_i;
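The new parameters feed the usual RoPE recurrence: the base position is `p = (n_past + i02) * freq_scale` (when mode bit 0 is unset), and each successive element pair is rotated by an angle that shrinks by `theta_scale = freq_base^(-2/n_dims)`. A small host-side sketch of how those angles evolve, using assumed example values (only `theta_scale` and `p` mirror the hunk above):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Example values; freq_base = 10000 and freq_scale = 1 are the conventional defaults.
    const int   n_dims     = 128;
    const float freq_base  = 10000.0f;
    const float freq_scale = 1.0f;
    const int   n_past     = 5;
    const int   i02        = 0;

    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const float p = (n_past + i02) * freq_scale;

    float theta = p;
    for (int i = 0; i < 3; ++i) {
        // Pair (2*i, 2*i + 1) of the row is rotated by this angle.
        printf("theta[%d] = %f\n", i, theta);
        theta *= theta_scale;
    }
    return 0;
}
```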
@@ -2925,11 +3408,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }

+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }

+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3085,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }

+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3108,7 +3606,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3639,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;

         const size_t offset_split = row_low*nb1;
-
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }

-
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-
+        char * buf_host = (char*)data + offset_split;

-
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+
+
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

         extra->data_device[id] = buf;

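The padding added here is `(MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) * type_size / block_size` bytes, i.e. just enough extra quant blocks to complete the last row to a multiple of 256 elements, and those bytes are zeroed so padded reads never see uninitialized data. A hedged arithmetic sketch with an example row length and ggml's usual Q4_0 block layout (32 elements in 18 bytes) assumed:

```cpp
#include <cstdio>

int main() {
    const long long MATRIX_ROW_PADDING = 256;
    const long long ne0       = 4097; // example row length, deliberately misaligned
    const long long type_size = 18;   // bytes per Q4_0 block (assumed)
    const long long blck_size = 32;   // elements per Q4_0 block (assumed)

    long long pad_elems = 0;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        pad_elems = MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING;
    }
    const long long pad_bytes = pad_elems * type_size / blck_size;
    // 255 padding elements -> 143 extra bytes, which the cudaMemset above zeroes.
    printf("pad %lld elements -> %lld extra bytes\n", pad_elems, pad_bytes);
    return 0;
}
```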
@@ -3177,43 +3692,60 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }

+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }

     // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->
-    const ggml_op src0_op = tensor->
-    if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-    ggml_cuda_assign_buffers_impl(tensor->
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
-    if (tensor->op == GGML_OP_CPY && tensor->
-    ggml_cuda_assign_buffers_impl(tensor->
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
     }

     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;

-    const bool inplace = (tensor->
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
         force_inplace;
     const size_t size = ggml_nbytes(tensor);

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && (tensor->
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-    memcpy(&offset, tensor->
+            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
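`ggml_cuda_alloc_temp_tensor_extra()` replaces per-tensor `new`/`memset` with a fixed pool of `GGML_MAX_NODES` extras cycled through with a modulo index, so view/copy/scratch tensors reuse slots instead of allocating one extra per graph node. The same idea in a minimal, self-contained form (all names here are illustrative):

```cpp
#include <cstring>

struct extra_slot { void * data_device[1]; };

static const size_t kMaxSlots = 4096; // stands in for GGML_MAX_NODES
static extra_slot * g_slots   = nullptr;
static size_t       g_slot_i  = 0;

// Hands out the next slot of a lazily allocated ring buffer, zeroed;
// slots are recycled once the index wraps around.
static extra_slot * alloc_temp_slot() {
    if (g_slots == nullptr) {
        g_slots = new extra_slot[kMaxSlots];
    }
    extra_slot * slot = &g_slots[g_slot_i];
    g_slot_i = (g_slot_i + 1) % kMaxSlots;
    std::memset(slot, 0, sizeof(*slot));
    return slot;
}

int main() {
    extra_slot * a = alloc_temp_slot();
    extra_slot * b = alloc_temp_slot();
    return (a != b) ? 0 : 1; // two calls yield distinct, zeroed slots
}
```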
@@ -3226,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = data + g_scratch_offset;

         g_scratch_offset += size;
@@ -3235,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         void * data;
         CUDA_CHECK(cudaMalloc(&data, size));
         CUDA_CHECK(cudaMemset(data, 0, size));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
         extra->data_device[g_main_device] = data;
     }

@@ -3283,10 +3818,16 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->
-        || (tensor->
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;
@@ -3299,12 +3840,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_GELU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_gelu;
+            break;
         case GGML_OP_SILU:
             if (!any_on_device) {
                 return false;
             }
             func = ggml_cuda_silu;
             break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_norm;
+            break;
         case GGML_OP_RMS_NORM:
             if (!any_on_device) {
                 return false;
@@ -3312,7 +3865,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
             func = ggml_cuda_mul_mat;
@@ -3329,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -3366,6 +3925,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return true;
     }
-    func(tensor->
+    func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
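Taken together, the `ggml_cuda_compute_forward` hunks keep one dispatch shape for every newly supported op: select a `ggml_cuda_func_t` from `tensor->op`, return `false` when neither the tensor nor its sources are on the GPU, and otherwise call the wrapper with `tensor->src[0]`, `tensor->src[1]`, and `tensor`. A stripped-down sketch of that control flow with simplified, purely illustrative types:

```cpp
#include <cstdio>

enum fake_op { OP_DUP, OP_GELU, OP_NORM, OP_OTHER };

struct fake_tensor {
    fake_op       op;
    bool          on_gpu;
    fake_tensor * src[2];
};

typedef void (*cuda_func_t)(const fake_tensor *, const fake_tensor *, fake_tensor *);

static void run_dup (const fake_tensor *, const fake_tensor *, fake_tensor *) { std::puts("dup");  }
static void run_gelu(const fake_tensor *, const fake_tensor *, fake_tensor *) { std::puts("gelu"); }
static void run_norm(const fake_tensor *, const fake_tensor *, fake_tensor *) { std::puts("norm"); }

// Mirrors the dispatch shape above: pick a wrapper per op, skip work when no
// operand lives on the device, then invoke it with (src0, src1, dst).
static bool compute_forward(fake_tensor * t) {
    const bool any_on_device = t->on_gpu
        || (t->src[0] != nullptr && t->src[0]->on_gpu)
        || (t->src[1] != nullptr && t->src[1]->on_gpu);

    cuda_func_t func = nullptr;
    switch (t->op) {
        case OP_DUP:  func = run_dup;  break;
        case OP_GELU: func = run_gelu; break;
        case OP_NORM: func = run_norm; break;
        default:      return false;
    }
    if (!any_on_device) {
        return false;
    }
    func(t->src[0], t->src[1], t);
    return true;
}

int main() {
    fake_tensor src = { OP_OTHER, true,  { nullptr, nullptr } };
    fake_tensor t   = { OP_GELU,  false, { &src, nullptr } };
    return compute_forward(&t) ? 0 : 1;
}
```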