llama_cpp 0.3.2 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -13,6 +13,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"

+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -59,8 +61,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16

 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(

 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half d; // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
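Note: the QI* constants count how many 32-bit ints of quantized data one thread handles per block: a block holds QK values, each int packs 4 bytes, and each byte packs QR quantized values, so QI = QK / (4 * QR). A small host-side sketch of that arithmetic (illustrative only, not part of the diff):

    /* For q4_0: 32 values per block, 2 values per byte -> 16 bytes -> 4 ints,
       matching the literal 4 that the macro replaces. */
    #include <assert.h>
    int main(void) {
        const int QK4_0 = 32, QR4_0 = 2;
        assert(QK4_0 / (4 * QR4_0) == 4);
        const int QK8_0 = 32, QR8_0 = 1;
        assert(QK8_0 / (4 * QR8_0) == 8);
        return 0;
    }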
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d; // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half d; // delta
     int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d; // delta
     half s; // unquantized sum
@@ -131,7 +133,7 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

-typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);

 //================================= k-quants

@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, co
 #define K_SCALE_SIZE 12
 #endif

+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif

+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif

+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2]; // quants, lower 4 bits
     uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,9 +220,11 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -238,13 +252,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };

-static __global__ void add_f32(const float * x, const float * y, float * dst, const int
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >=
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }

 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
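Note: add_f32 now takes separate element counts kx and ky and indexes the second operand as y[i%ky], so a smaller src1 is broadcast (repeated) across src0, the same trick mul_f32 already used. A CPU-side sketch of the indexing, assuming kx is a multiple of ky (illustrative only):

    /* Reference loop equivalent to the kernel's addressing. */
    void add_f32_ref(const float * x, const float * y, float * dst, int kx, int ky) {
        for (int i = 0; i < kx; ++i) {
            dst[i] = x[i] + y[i % ky]; // src1 wraps around, i.e. is broadcast
        }
    }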
@@ -265,6 +279,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }

+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

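Note: the new gelu_f32 kernel is the usual tanh approximation of GELU, 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))), matching ggml's CPU implementation. A scalar reference for comparison (illustrative only):

    #include <math.h>
    /* Per-element tanh-approximation GELU, as applied by the kernel above. */
    static float gelu_ref(float x) {
        const float c = 0.79788456080286535587989211986876f; // sqrt(2/pi)
        return 0.5f * x * (1.0f + tanhf(c * x * (1.0f + 0.044715f * x * x)));
    }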
@@ -274,16 +301,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    const float eps = 1e-
+    const float eps = 1e-6f;

     float tmp = 0.0f; // partial sum for thread in warp

-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
@@ -295,10 +352,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }

     const float mean = tmp / ncols;
-    const float scale =
+    const float scale = rsqrtf(mean + eps);

-    for (int
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
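Note: norm_f32 assigns one warp (32 threads) per row: each lane accumulates strided partial sums of x and x*x, then the __shfl_xor_sync butterfly folds the 32 partials together so every lane ends up with the row totals, from which the mean and the variance (E[x^2] - E[x]^2) follow without any shared memory. A stripped-down sketch of just the butterfly sum (illustrative only):

    /* After 5 XOR-shuffle steps every lane of the warp holds the full sum. */
    __device__ float warp_sum(float v) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            v += __shfl_xor_sync(0xffffffff, v, mask, 32);
        }
        return v;
    }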
@@ -407,7 +463,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

 //================================== k-quants

-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +496,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

 }

-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +560,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif

-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;

     const int i = blockIdx.x;
@@ -544,7 +600,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
 #endif
 }

-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;

     const int i = blockIdx.x;
@@ -590,7 +646,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 #endif
 }

-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = blockIdx.x;
@@ -634,7 +690,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
 #endif
 }

-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -742,7 +798,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -846,7 +902,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -949,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     }
 }

-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -1171,7 +1227,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

-static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= k) {
@@ -1180,10 +1236,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {

     block_q8_1 * y = (block_q8_1 *) vy;

-    const int ib = i /
-    const int iqs = i %
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index

-    const float xi = x[i];
+    const float xi = i < ndata ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;

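Note: quantize_q8_1 now distinguishes ndata (the number of valid input elements) from k (the launch size, rounded up to MATRIX_ROW_PADDING): threads past ndata quantize a zero instead of reading out of bounds, so the padded tail of the row becomes harmless zero blocks. The round-up itself is the usual ceiling to a multiple of 256 (illustrative only):

    /* k is ndata rounded up to a multiple of MATRIX_ROW_PADDING (256). */
    static int padded_size(int ndata) {
        return ((ndata + 255) / 256) * 256; // e.g. padded_size(4097) == 4352
    }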
@@ -1207,7 +1263,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

     if (i >= k) {
@@ -1227,8 +1283,9 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }

-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

     int vi;
@@ -1249,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
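Note: __dp4a(a, b, c) treats its two int operands as four packed signed 8-bit lanes, multiplies them pairwise and adds the four products to c in a single instruction; it requires compute capability 6.1, which is what the MIN_CC_DP4A 610 guard checks. A plain-C reference of the signed variant (illustrative only):

    #include <stdint.h>
    /* CPU equivalent of the signed __dp4a(a, b, c). */
    static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
        for (int k = 0; k < 4; ++k) {
            c += (int32_t)(int8_t)(a >> (8 * k)) * (int32_t)(int8_t)(b >> (8 * k));
        }
        return c;
    }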
@@ -1274,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

     int qs;
@@ -1309,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

     int vi;
@@ -1362,11 +1423,224 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;

     if (row >= nrows) {
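Note: the new *_K dot kernels all follow the q4_1 pattern: a super-block element decodes (roughly) as d*sc*q - dmin*m, so the dot product against a q8_1 block splits into one __dp4a accumulation of the quants weighted by the 4-bit scale (sumf_d) and one of an all-ones vector weighted by the 4-bit min (sumf_m), combined at the end as d*sumf_d - dmin*sumf_m. A simplified scalar sketch for one group, ignoring the q8 block scale that the kernels also fold in (illustrative only):

    #include <stdint.h>
    /* sum_j x_j*y_j ~= d*sc*sum_j(q_j*y_j) - dmin*m*sum_j(y_j) */
    static float kquant_group_dot(float d, float dmin, int sc, int m,
                                  const int8_t * q, const int8_t * y, int n) {
        int sum_qy = 0, sum_y = 0;
        for (int j = 0; j < n; ++j) {
            sum_qy += q[j] * y[j];
            sum_y  += y[j];
        }
        return d * (float)(sc * sum_qy) - dmin * (float)(m * sum_y);
    }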
@@ -1385,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx

         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int

@@ -1404,7 +1678,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
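Note: the iby fix matters once block sizes differ: a k-quant x block covers qk = 256 values while a q8_1 block covers 32, so each x block lines up with qk/QK8_1 = 8 consecutive y blocks and the y index has to be scaled by that factor (for the 32-wide formats the factor is 1 and nothing changes). Roughly (illustrative only):

    /* y block that starts at the same element offset as x block ibx_local. */
    static int aligned_y_block(int ibx_local, int qk, int qk8_1) {
        return ibx_local * (qk / qk8_1); // e.g. 3 * (256/32) == 24
    }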
@@ -1471,7 +1745,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 }

-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;

     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1792,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 }

 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {

     const half * x = (const half *) vx;
@@ -1623,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
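Note: rope_glm_f32 is the ChatGLM flavour of RoPE: each row is treated as four quarters, the first pair of quarters is rotated by the per-token angle p*theta^col and the second pair by a separate block angle block_p*theta^col (the caller derives p and block_p from the position and n_ctx). Each pair gets the standard 2-D rotation (illustrative only):

    #include <math.h>
    /* Plane rotation applied to one (x0, x1) pair. */
    static void rotate_pair(float x0, float x1, float theta, float * o0, float * o1) {
        const float s = sinf(theta), c = cosf(theta);
        *o0 = x0 * c - x1 * s;
        *o1 = x0 * s + x1 * c;
    }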
@@ -1688,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }

-static void add_f32_cuda(const float * x, const float * y, float * dst, const int
-    const int num_blocks = (
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst,
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }

 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1703,20 +2011,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }

+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }

-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
 }

 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1873,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }

 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1882,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1891,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1900,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1909,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1917,6 +2236,51 @@
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
|
2009
2373
|
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
|
2010
2374
|
}
|
2011
2375
|
|
2376
|
+
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
2377
|
+
GGML_ASSERT(nrows % 4 == 0);
|
2378
|
+
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
2379
|
+
const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
|
2380
|
+
const dim3 block_nums(num_blocks_x, nrows, 1);
|
2381
|
+
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
|
2382
|
+
}
|
2383
|
+
|
2012
2384
|
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
|
2013
2385
|
const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
|
2014
2386
|
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
|
@@ -2051,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.
-
-
-        b.
-        b.size =
-
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
         }
     }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     void * ptr;
-
-
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
     return ptr;
 }

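Note: ggml_cuda_pool_malloc previously handed back the first pooled buffer that fit; it now scans all pooled buffers for the best (smallest-waste) fit, short-circuits on an exact match, and only falls back to cudaMalloc with roughly 5% head-room rounded up to 256 bytes so slightly larger follow-up requests can reuse the same allocation. The sizing rule in isolation (illustrative only):

    #include <stddef.h>
    /* 5% look-ahead, rounded up to a multiple of 256 bytes. */
    static size_t look_ahead(size_t size) {
        size_t s = (size_t)(1.05 * size);
        return 256 * ((s + 255) / 256);
    }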
@@ -2140,6 +2545,9 @@ void ggml_init_cublas() {
 }

 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
@@ -2236,16 +2644,19 @@ inline void ggml_cuda_op_add(

     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);

-    const int64_t
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;

+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i,
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i,
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2264,27 +2675,41 @@ inline void ggml_cuda_op_mul(

     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i
+    GGML_ASSERT(dst_ddf_i != nullptr);

     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];

-
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);

-
-
-
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) i02;
+}

-
-
-
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);

+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
+    (void) src1_ddf_i;
     (void) i02;
+    (void) i1;
 }

 inline void ggml_cuda_op_silu(
@@ -2309,6 +2734,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }

+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2349,22 +2796,30 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));

-
+    bool mul_mat_vec_q_implemented =
+        src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
-
-
-
-
-
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
+
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif

     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);

         switch (src0->type) {
             case GGML_TYPE_Q4_0:
|
|
2382
2837
|
case GGML_TYPE_Q8_0:
|
2383
2838
|
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2384
2839
|
break;
|
2840
|
+
case GGML_TYPE_Q2_K:
|
2841
|
+
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2842
|
+
break;
|
2843
|
+
case GGML_TYPE_Q3_K:
|
2844
|
+
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2845
|
+
break;
|
2846
|
+
case GGML_TYPE_Q4_K:
|
2847
|
+
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2848
|
+
break;
|
2849
|
+
case GGML_TYPE_Q5_K:
|
2850
|
+
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2851
|
+
break;
|
2852
|
+
case GGML_TYPE_Q6_K:
|
2853
|
+
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
2854
|
+
break;
|
2385
2855
|
default:
|
2386
2856
|
GGML_ASSERT(false);
|
2387
2857
|
break;
|
@@ -2516,13 +2986,26 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
-
+    const int n_ctx = ((int32_t *) src1->data)[3];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale;
+    memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 
-
-    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+    bool is_glm = mode & 4;
 
     // compute
-
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }
 
     (void) dst;
     (void) src0_ddq_i;
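The reworked RoPE path reads two extra parameters, freq_base and freq_scale, from src1 and folds them into the rotation: the per-pair angle step becomes freq_base^(-2/n_dims) and the token position is multiplied by freq_scale (linear position scaling for extended context), while mode & 4 selects the separate GLM-style kernel. A small sketch of how those two knobs change the angles, with example values that are not taken from the patch:

    #include <cmath>
    #include <cstdio>

    int main() {
        // Example values only: 10000.0f is the conventional RoPE base;
        // freq_scale = 0.5f corresponds to stretching the context by 2x.
        const float freq_base  = 10000.0f;
        const float freq_scale = 0.5f;
        const int   n_dims     = 128;
        const int   n_past     = 100;
        const int   i02        = 3; // row index used as part of the position, as in the kernel

        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float p = (n_past + i02) * freq_scale; // scaled position for (mode & 1) == 0

        // Each consecutive pair of dimensions rotates by a geometrically shrinking angle.
        float theta = p;
        for (int pair = 0; pair < 4; ++pair) {
            printf("pair %d: theta = %f\n", pair, theta);
            theta *= theta_scale;
        }
        return 0;
    }

Raising freq_base slows the decay of theta_scale, while lowering freq_scale compresses positions; both are ways of fitting a longer context into the same range of rotation angles.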
@@ -2925,11 +3408,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }
 
+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }
 
+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
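ggml_cuda_gelu and ggml_cuda_norm are thin wrappers in the same pattern as the existing SiLU and RMS-norm entry points: assert F32 in and out, then hand the per-row kernel to ggml_cuda_op. For orientation only, a host-side sketch of the tanh-based GELU approximation commonly used for F32 GELU kernels; the constants below are the usual ones, not copied from this patch:

    #include <cmath>
    #include <cstdio>

    // gelu(x) ~= 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
    static float gelu_approx(float x) {
        const float GELU_COEF_A    = 0.044715f;
        const float SQRT_2_OVER_PI = 0.7978845608028654f;
        return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
    }

    int main() {
        for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f}) {
            printf("gelu(%5.2f) = %8.5f\n", x, gelu_approx(x));
        }
        return 0;
    }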
@@ -3085,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3108,7 +3606,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3639,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;
 
         const size_t offset_split = row_low*nb1;
-
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
 
-
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-
+        char * buf_host = (char*)data + offset_split;
 
-
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+
+
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
 
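The padding added to the last row is computed in bytes: the number of missing elements, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING, is converted with type_size/block_size, and the extra bytes are zeroed so that any blocks read past the logical end of the row contribute nothing. A rough worked example, assuming a Q4_0 tensor whose 32-element blocks occupy 18 bytes (a 16-bit scale plus 32 packed 4-bit quants, the standard GGML Q4_0 layout); the row length is hypothetical:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t MATRIX_ROW_PADDING = 256;  // pad rows to a multiple of 256 elements
        const int64_t ne0       = 4000;          // hypothetical row length, a multiple of the block size
        const int64_t blck_size = 32;            // elements per q4_0 block
        const int64_t type_size = 18;            // bytes per q4_0 block

        int64_t pad_elems = 0;
        int64_t pad_bytes = 0;
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            pad_elems = MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING;
            pad_bytes = pad_elems * type_size / blck_size;
        }
        // 4000 % 256 = 160, so 96 padding elements -> 96*18/32 = 54 extra bytes to zero on the device.
        printf("pad %lld elements -> %lld extra bytes\n", (long long) pad_elems, (long long) pad_bytes);
        return 0;
    }

Because the 256-element padding is a multiple of the 32-element block size, the division is exact and the padded region is a whole number of quantized blocks.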
@@ -3177,43 +3692,60 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
 
     // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
-        const ggml_op src0_op = tensor->src0->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
-    if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;
 
-    const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
        GGML_ASSERT(size <= g_scratch_size);
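ggml_cuda_alloc_temp_tensor_extra replaces the single up-front new/memset with a fixed array of GGML_MAX_NODES entries handed out round-robin for the view, copy and scratch cases, while plain allocations (the final else branch below) still get their own heap entry. A minimal sketch of the same pattern with a generic payload; the names and the pool size here are illustrative, not taken from ggml:

    #include <cstring>
    #include <cstdio>

    struct extra_t { void * data_device[16]; }; // stand-in for ggml_tensor_extra_gpu

    static const size_t POOL_SIZE   = 4096;     // stand-in for GGML_MAX_NODES
    static extra_t *    g_pool      = nullptr;
    static size_t       g_pool_next = 0;

    // Hand out the next slot of a lazily allocated ring. A slot is reused after
    // POOL_SIZE further allocations, which is fine as long as an entry is only
    // needed for a bounded window, such as one evaluation of the graph.
    static extra_t * alloc_temp_extra() {
        if (g_pool == nullptr) {
            g_pool = new extra_t[POOL_SIZE];
        }
        extra_t * e = &g_pool[g_pool_next];
        g_pool_next = (g_pool_next + 1) % POOL_SIZE;
        memset(e, 0, sizeof(*e));
        return e;
    }

    int main() {
        extra_t * a = alloc_temp_extra();
        extra_t * b = alloc_temp_extra();
        printf("slot 0 at %p, slot 1 at %p\n", (void *) a, (void *) b);
        return 0;
    }

Since the pooled entries are never freed individually, the short-lived extras created for views and copies no longer leak or require bookkeeping beyond the single array.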
@@ -3226,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = data + g_scratch_offset;
 
         g_scratch_offset += size;
@@ -3235,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         void * data;
         CUDA_CHECK(cudaMalloc(&data, size));
         CUDA_CHECK(cudaMemset(data, 0, size));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
         extra->data_device[g_main_device] = data;
     }
 
@@ -3283,10 +3818,16 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;
@@ -3299,12 +3840,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_GELU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_gelu;
+            break;
         case GGML_OP_SILU:
             if (!any_on_device) {
                 return false;
             }
             func = ggml_cuda_silu;
             break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_norm;
+            break;
         case GGML_OP_RMS_NORM:
             if (!any_on_device) {
                 return false;
@@ -3312,7 +3865,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
             func = ggml_cuda_mul_mat;
@@ -3329,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -3366,6 +3925,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return true;
     }
-    func(tensor->src0, tensor->src1, tensor);
+    func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
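Taken together, the changes to ggml_cuda_compute_forward keep the same shape: pick a ggml_cuda_func_t in the switch, bail out when nothing involved is resident on the GPU, then invoke the handler with the tensor's sources, now read from the src[] array instead of the old src0/src1 fields. A condensed sketch of that dispatch pattern with stand-in types (not the actual ggml API):

    #include <cstdio>

    struct tensor_t;
    typedef void (*cuda_func_t)(const tensor_t * src0, const tensor_t * src1, tensor_t * dst);

    enum op_t { OP_DUP, OP_ADD, OP_MUL_MAT, OP_NONE };
    struct tensor_t { op_t op; tensor_t * src[2]; };

    static void op_dup(const tensor_t *, const tensor_t *, tensor_t *) { puts("dup");     }
    static void op_add(const tensor_t *, const tensor_t *, tensor_t *) { puts("add");     }
    static void op_mm (const tensor_t *, const tensor_t *, tensor_t *) { puts("mul_mat"); }

    // Select a handler for the op, refuse the work if nothing is on the GPU,
    // then call it with the tensor's sources taken from src[].
    static bool compute_forward(tensor_t * t, bool any_on_device) {
        cuda_func_t func = nullptr;
        switch (t->op) {
            case OP_DUP:     func = op_dup; break;
            case OP_ADD:     func = op_add; break;
            case OP_MUL_MAT: func = op_mm;  break;
            default:         return false;
        }
        if (!any_on_device) {
            return false;
        }
        func(t->src[0], t->src[1], t);
        return true;
    }

    int main() {
        tensor_t a = { OP_NONE,    { nullptr, nullptr } };
        tensor_t b = { OP_NONE,    { nullptr, nullptr } };
        tensor_t c = { OP_MUL_MAT, { &a, &b } };
        return compute_forward(&c, /*any_on_device=*/true) ? 0 : 1;
    }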