llama_cpp 0.3.3 → 0.3.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +439 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +759 -136
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +250 -111
- data/ext/llama_cpp/src/ggml-metal.metal +614 -483
- data/ext/llama_cpp/src/ggml.c +793 -1032
- data/ext/llama_cpp/src/ggml.h +95 -18
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +626 -166
- data/ext/llama_cpp/src/llama.h +94 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +36 -1
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -13,6 +13,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(
 
 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
 half d; // delta
 uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
 half d; // delta
 half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
 half d; // delta
 uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
 half d; // delta
 half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
 half d; // delta
 int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 
 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
 half d; // delta
 half s; // unquantized sum
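Note: the QI* constants above are now derived rather than hard-coded. QI appears to be the number of 32-bit integers holding one block's quants, i.e. QK / (4 * QR), since QR quantized values share each byte. A quick sanity check (illustrative only, not part of the diff):
// QK4_0 = 32 values per block, QR4_0 = 2 values per byte -> 16 bytes = 4 ints
static_assert(32 / (4 * 2) == 4, "q4_0: 4 ints of quants per block");
// QK8_0 = 32 values, QR8_0 = 1 -> 32 bytes = 8 ints
static_assert(32 / (4 * 1) == 8, "q8_0: 8 ints of quants per block");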
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif
 
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
 uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
 uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
 uint8_t hmask[QK_K/8]; // quants - high bit
 uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
 
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
 half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif
 
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
 half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
 
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
 uint8_t ql[QK_K/2]; // quants, lower 4 bits
 uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,7 +220,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
-#define MATRIX_ROW_PADDING
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
@@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
 cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
 const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-if (i >=
+if (i >= kx) {
 return;
 }
-dst[i] = x[i] + y[i];
+dst[i] = x[i] + y[i%ky];
 }
 
 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
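Note: add_f32 now takes separate element counts kx and ky and indexes y with i % ky, so a smaller src1 can be broadcast (repeated) across src0. A rough host-side sketch of the same indexing, for illustration only:
// illustration: broadcast-add, y of length ky repeats across x of length kx
void add_broadcast_ref(const float * x, const float * y, float * dst, int kx, int ky) {
    for (int i = 0; i < kx; ++i) {
        dst[i] = x[i] + y[i % ky]; // same wrap-around indexing as the kernel
    }
}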
@@ -320,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
 }
 }
 
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
 const int row = blockIdx.x*blockDim.y + threadIdx.y;
 const int tid = threadIdx.x;
 
-const float eps = 1e-6f;
-
 float tmp = 0.0f; // partial sum for thread in warp
 
 for (int col = tid; col < ncols; col += WARP_SIZE) {
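Note: the RMS-norm epsilon is no longer hard-coded to 1e-6f; it is passed in as a kernel argument (ggml_cuda_op_rms_norm below reads it from dst->op_params). The per-row computation itself is unchanged and, for reference, amounts to:
// dst[row*ncols + i] = x[row*ncols + i] / sqrtf(sum_j(x[j]*x[j]) / ncols + eps)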
@@ -923,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 uint16_t aux[4];
 const uint8_t * sc = (const uint8_t *)aux;
 
+#if K_QUANTS_PER_ITERATION == 2
+uint32_t q32[4];
+const uint8_t * q4 = (const uint8_t *)q32;
+#else
+uint16_t q16[4];
+const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
 float tmp = 0; // partial sum for thread in warp
 
 for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-const uint8_t * q1 = x[i].qs + q_offset;
-const uint8_t * q2 = q1 + 64;
 const float * y1 = yy + i*QK_K + y_offset;
 const float * y2 = y1 + 128;
 
@@ -941,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
 aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
+#if K_QUANTS_PER_ITERATION == 2
+const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+const uint32_t * q2 = q1 + 16;
+
+q32[0] = q1[0] & 0x0f0f0f0f;
+q32[1] = q1[0] & 0xf0f0f0f0;
+q32[2] = q2[0] & 0x0f0f0f0f;
+q32[3] = q2[0] & 0xf0f0f0f0;
+
 float4 s = {0.f, 0.f, 0.f, 0.f};
 float smin = 0;
-for (int l = 0; l <
-s.x += y1[l] *
-s.z += y2[l] *
+for (int l = 0; l < 4; ++l) {
+s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
 smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
 }
-tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+const uint16_t * q2 = q1 + 32;
+
+q16[0] = q1[0] & 0x0f0f;
+q16[1] = q1[0] & 0xf0f0;
+q16[2] = q2[0] & 0x0f0f;
+q16[3] = q2[0] & 0xf0f0;
+
+float4 s = {0.f, 0.f, 0.f, 0.f};
+float smin = 0;
+for (int l = 0; l < 2; ++l) {
+s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+}
+tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
 
 }
 #else
@@ -1028,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 uint16_t aux[4];
 const uint8_t * sc = (const uint8_t *)aux;
 
+uint16_t q16[8];
+const uint8_t * q4 = (const uint8_t *)q16;
+
 for (int i = ix; i < num_blocks_per_row; i += 2) {
 
 const uint8_t * ql1 = x[i].qs + q_offset;
-const uint8_t * ql2 = ql1 + 64;
 const uint8_t * qh = x[i].qh + l0;
 const float * y1 = yy + i*QK_K + y_offset;
 const float * y2 = y1 + 128;
@@ -1047,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
 float4 sum = {0.f, 0.f, 0.f, 0.f};
 float smin = 0;
+const uint16_t * q1 = (const uint16_t *)ql1;
+const uint16_t * q2 = q1 + 32;
+q16[0] = q1[0] & 0x0f0f;
+q16[1] = q1[8] & 0x0f0f;
+q16[2] = (q1[0] >> 4) & 0x0f0f;
+q16[3] = (q1[8] >> 4) & 0x0f0f;
+q16[4] = q2[0] & 0x0f0f;
+q16[5] = q2[8] & 0x0f0f;
+q16[6] = (q2[0] >> 4) & 0x0f0f;
+q16[7] = (q2[8] >> 4) & 0x0f0f;
 for (int l = 0; l < n; ++l) {
-sum.x += y1[l+ 0] * (
-+ y1[l+16] * (
-sum.y += y1[l+32] * (
-+ y1[l+48] * (
-sum.z += y2[l+ 0] * (
-+ y2[l+16] * (
-sum.w += y2[l+32] * (
-+ y2[l+48] * (
+sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
++ y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
++ y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
++ y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
++ y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
 smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
 + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
 }
@@ -1271,8 +1326,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
 y[iybs + iqs + y_offset] = v.y;
 }
 
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
 int vi;
@@ -1293,11 +1349,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
 return sumi*d;
 #else
 return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
 const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1375,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
 return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
 return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
 int qs;
@@ -1353,11 +1411,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
 return sumi*d;
 #else
 return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
 const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1446,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
 return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
 return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
 int vi;
@@ -1406,7 +1466,342 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
 return sumi*d;
 #else
 return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+const int bq8_offset = QR2_K * (iqs / QI8_1);
+const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+float sumf_d = 0.0f;
+float sumf_m = 0.0f;
+
+const float d = bq2_K->d;
+const float dmin = bq2_K->dmin;
+
+const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+for (int i = 0; i < QR2_K; ++i) {
+const int sc = bq2_K->scales[scale_offset + 2*i];
+
+const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+const float d8i = bq8i->d;
+
+const int vi = (v >> (2*i)) & 0x03030303;
+const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+}
+
+return d*sumf_d - dmin*sumf_m;
+#else
+return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+float sumf = 0.0f;
+
+const float d = bq3_K->d;
+
+int vl;
+memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+int vh;
+memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+vh >>= bq8_offset;
+
+for (int i = 0; i < QR3_K; ++i) {
+const int isc = scale_offset + 2*i;
+
+const int isc_low = isc % (QK_K/32);
+const int sc_shift_low = 4 * (isc / (QK_K/32));
+const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+const int isc_high = isc % (QK_K/64);
+const int sc_shift_high = 2 * (isc / (QK_K/64));
+const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+const int sc = (sc_low | sc_high) - 32;
+
+const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+const float d8i = bq8i->d;
+
+const int vil = (vl >> (2*i)) & 0x03030303;
+
+const int vih = ((vh >> i) << 2) & 0x04040404;
+
+const int vi = __vsubss4(vil, vih);
+
+sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+}
+
+return d*sumf;
+#else
+return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+float sumf_d = 0.0f;
+float sumf_m = 0.0f;
+
+#ifndef GGML_QKK_64
+
+// iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
+const float d = bq4_K->d;
+const float dmin = bq4_K->dmin;
+
+// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+const int v1 = q4[0];
+const int v2 = q4[4];
+
+const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+uint16_t aux[2];
+const int j = bq8_offset/2;
+if (j < 2) {
+aux[0] = scales[j+0] & 0x3f3f;
+aux[1] = scales[j+2] & 0x3f3f;
+} else {
+aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+}
+const uint8_t * sc = (const uint8_t *)aux;
+const uint8_t * m = sc + 2;
+
+for (int i = 0; i < QR4_K; ++i) {
+
+const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+const float d8i = bq8i->d;
+const int * q8 = (const int *)bq8i->qs + (iqs%4);
+const int ui1 = q8[0];
+const int ui2 = q8[4];
+
+const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
+
+const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+sumf_d += d8i * (dot1 * sc[i]);
+sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+}
+
+return d*sumf_d - dmin*sumf_m;
+
+#else
+
+uint16_t aux16[2];
+const uint8_t * s = (const uint8_t *)aux16;
+
+const uint16_t * a = (const uint16_t *)bq4_K->scales;
+aux16[0] = a[0] & 0x0f0f;
+aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+const float dall = bq4_K->d[0];
+const float dmin = bq4_K->d[1];
+
+const float d8_1 = bq8_1[0].d;
+const float d8_2 = bq8_1[1].d;
+
+const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+const int * q4 = (const int *)bq4_K->qs + iqs;
+const int v1 = q4[0];
+const int v2 = q4[4];
+
+const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+return dall * sumf_d - dmin * sumf_m;
+
+#endif
+
+#else
+return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+#ifndef GGML_QKK_64
+
+const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
+
+float sumf_d = 0.0f;
+float sumf_m = 0.0f;
+
+const float d = bq5_K->d;
+const float dmin = bq5_K->dmin;
+
+const int vl1 = ql[0];
+const int vl2 = ql[4];
+
+const int vh1 = qh[0] >> bq8_offset;
+const int vh2 = qh[4] >> bq8_offset;
+
+const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+uint16_t aux[2];
+const int j = bq8_offset/2;
+if (j < 2) {
+aux[0] = scales[j+0] & 0x3f3f;
+aux[1] = scales[j+2] & 0x3f3f;
+} else {
+aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+}
+const uint8_t * sc = (const uint8_t *)aux;
+const uint8_t * m = sc + 2;
+
+for (int i = 0; i < QR5_K; ++i) {
+
+const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+const float d8i = bq8i->d;
+const int * q8 = (const int *)bq8i->qs + (iqs%4);
+const int ui1 = q8[0];
+const int ui2 = q8[4];
+
+const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+const int vi1 = vil1 | vih1;
+const int vi2 = vil2 | vih2;
+
+const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+sumf_d += d8i * (dot1 * sc[i]);
+sumf_m += d8i * (dot2 * m[i]);
+
+}
+
+return d*sumf_d - dmin*sumf_m;
+
+#else
+
+const int8_t * s = bq5_K->scales;
+
+const float d = bq5_K->d;
+
+const float d8_1 = bq8_1[0].d;
+const float d8_2 = bq8_1[1].d;
+
+const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+const int * ql = (const int *)bq5_K->qs + iqs;
+const int vl1 = ql[0];
+const int vl2 = ql[4];
+
+const int step = 4 * iqs; // 0, 4, 8, 12
+const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+const int in = step%8; // 0, 4, 0, 4
+const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
++ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+return d * sumf_d;
+
+#endif
+
+#else
+return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+float sumf = 0.0f;
+
+const float d = bq6_K->d;
+
+int vl;
+memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+int vh;
+memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+for (int i = 0; i < QR6_K; ++i) {
+const int sc = bq6_K->scales[scale_offset + 4*i];
+
+const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+const float d8i = bq8i->d;
+
+const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+}
+
+return d*sumf;
+#else
+return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
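Note: the functions added in the hunk above give the k-quant types (q2_K ... q6_K) the same __dp4a-based dot product against q8_1 that the non-k types already had, which is why they are guarded by MIN_CC_DP4A. __dp4a(a, b, c) multiplies the four packed bytes of a and b pairwise and adds the result to c. A minimal scalar sketch of that intrinsic, for illustration only:
// reference for the signed-byte variant of __dp4a used here
int dp4a_ref(int a, int b, int c) {
    const int8_t * pa = (const int8_t *) &a;
    const int8_t * pb = (const int8_t *) &b;
    return c + pa[0]*pb[0] + pa[1]*pb[1] + pa[2]*pb[2] + pa[3]*pb[3];
}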
@@ -1429,7 +1824,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
 for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
 const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
 
-const int iby = i + threadIdx.x / qi; // y block index
+const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
 
 const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
 
@@ -1515,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
 }
 }
 
-static __global__ void mul_mat_p021_f16_f32(
+static __global__ void mul_mat_p021_f16_f32(
+const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
 const half * x = (const half *) vx;
 
 const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
 const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+const int channel_x = channel / (nchannels_y / nchannels_x);
 
 const int nrows_y = ncols_x;
 const int nrows_dst = nrows_x;
@@ -1535,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
 }
 
 // x is transposed and permuted
-const int ix = row_x*nchannels_x*ncols_x +
+const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
 const float xi = __half2float(x[ix]);
 
 const int row_y = col_x;
@@ -1563,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-const int row_stride_x, const int channel_stride_x) {
+const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
 
 const half * x = (const half *) vx;
 
 const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
 const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+const int channel_x = channel / channel_x_divisor;
 
 const int nrows_y = ncols_x;
 const int nrows_dst = nrows_x;
@@ -1585,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 break;
 }
 
-const int ix =
+const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
 const float xi = __half2float(x[ix]);
 
 const int row_y = col_x;
@@ -1667,6 +2067,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
 dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+const int col = blockDim.x*blockIdx.x + threadIdx.x;
+const int half_n_dims = ncols/4;
+
+if (col >= half_n_dims) {
+return;
+}
+
+const int row = blockDim.y*blockIdx.y + threadIdx.y;
+const int i = row*ncols + col;
+
+const float col_theta_scale = powf(theta_scale, col);
+
+const float theta = p*col_theta_scale;
+const float sin_theta = sinf(theta);
+const float cos_theta = cosf(theta);
+
+const float x0 = x[i + 0];
+const float x1 = x[i + half_n_dims];
+
+dst[i + 0] = x0*cos_theta - x1*sin_theta;
+dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+const float block_theta = block_p*col_theta_scale;
+const float sin_block_theta = sinf(block_theta);
+const float cos_block_theta = cosf(block_theta);
+
+const float x2 = x[i + half_n_dims * 2];
+const float x3 = x[i + half_n_dims * 3];
+
+dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
 const int col = blockDim.x*blockIdx.x + threadIdx.x;
 const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1732,9 +2166,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
 dst[i] = scale * x[i];
 }
 
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int
-const int num_blocks = (
-add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst,
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1763,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
 norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
 GGML_ASSERT(ncols % WARP_SIZE == 0);
 const dim3 block_dims(WARP_SIZE, 1, 1);
-rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -1928,7 +2362,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-GGML_ASSERT(ncols %
+GGML_ASSERT(ncols % QK4_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(1, block_num_y, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1937,7 +2371,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-GGML_ASSERT(ncols %
+GGML_ASSERT(ncols % QK4_1 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(1, block_num_y, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1946,7 +2380,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-GGML_ASSERT(ncols %
+GGML_ASSERT(ncols % QK5_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(1, block_num_y, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1955,7 +2389,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-GGML_ASSERT(ncols %
+GGML_ASSERT(ncols % QK5_1 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(1, block_num_y, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1964,7 +2398,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-GGML_ASSERT(ncols %
+GGML_ASSERT(ncols % QK8_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(1, block_num_y, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1972,6 +2406,57 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+GGML_ASSERT(ncols % QK_K == 0);
+const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+const dim3 block_nums(1, block_num_y, 1);
+const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+GGML_ASSERT(ncols % QK_K == 0);
+const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+const dim3 block_nums(1, block_num_y, 1);
+const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+GGML_ASSERT(ncols % QK_K == 0);
+const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+const dim3 block_nums(1, block_num_y, 1);
+const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+// Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+// is better amortized.
+mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
+<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+GGML_ASSERT(ncols % QK_K == 0);
+const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+const dim3 block_nums(1, block_num_y, 1);
+const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+// Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+// is better amortized.
+mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
+<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+GGML_ASSERT(ncols % QK_K == 0);
+const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+const dim3 block_nums(1, block_num_y, 1);
+const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2015,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
 }
 }
 
-static void ggml_mul_mat_p021_f16_f32_cuda(
-const
+static void ggml_mul_mat_p021_f16_f32_cuda(
+const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+const dim3 block_nums(1, nrows_x, nchannels_y);
 const dim3 block_dims(WARP_SIZE, 1, 1);
-mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
 }
 
 static void ggml_mul_mat_vec_nc_f16_f32_cuda(
 const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
 
-const dim3 block_nums(1, nrows_x,
+const dim3 block_nums(1, nrows_x, nchannels_y);
 const dim3 block_dims(WARP_SIZE, 1, 1);
 mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -2064,6 +2552,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+GGML_ASSERT(nrows % 4 == 0);
+const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+const dim3 block_nums(num_blocks_x, nrows, 1);
+rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
 const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
 const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2106,20 +2602,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
 scoped_spin_lock lock(g_cuda_pool_lock);
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+int nnz = 0;
+size_t max_size = 0, tot_size = 0;
+#endif
+size_t best_diff = 1ull << 36;
+int ibest = -1;
 for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
 cuda_buffer& b = g_cuda_buffer_pool[id][i];
-if (b.
-
-
-b.
-b.size =
-
+if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+++nnz;
+tot_size += b.size;
+if (b.size > max_size) max_size = b.size;
+#endif
+if (b.size >= size) {
+size_t diff = b.size - size;
+if (diff < best_diff) {
+best_diff = diff;
+ibest = i;
+if (!best_diff) {
+void * ptr = b.ptr;
+*actual_size = b.size;
+b.ptr = nullptr;
+b.size = 0;
+return ptr;
+}
+}
+}
 }
 }
+if (ibest >= 0) {
+cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+void * ptr = b.ptr;
+*actual_size = b.size;
+b.ptr = nullptr;
+b.size = 0;
+return ptr;
+}
+#ifdef DEBUG_CUDA_MALLOC
+fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
 void * ptr;
-
-
+size_t look_ahead_size = (size_t) (1.05 * size);
+look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+*actual_size = look_ahead_size;
 return ptr;
 }
 
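Note: ggml_cuda_pool_malloc now picks the smallest pooled buffer that still fits the request (best fit) and, when nothing in the pool fits, over-allocates by 5% rounded up to a multiple of 256 bytes so the new buffer can be reused for slightly larger requests later. Worked example (arithmetic only): for a 1000-byte request, look_ahead_size = 1.05 * 1000 = 1050, then 256 * ((1050 + 255)/256) = 256 * 5 = 1280 bytes are actually allocated.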
@@ -2147,7 +2676,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2170,7 +2701,9 @@ void ggml_init_cublas() {
 g_tensor_split[id] = total_vram;
 total_vram += prop.totalGlobalMem;
 
+#ifndef GGML_CUDA_FORCE_DMMV
 g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
 }
 for (int id = 0; id < g_device_count; ++id) {
 g_tensor_split[id] /= total_vram;
@@ -2195,6 +2728,9 @@ void ggml_init_cublas() {
 }
 
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+if (tensor_split == nullptr) {
+return;
+}
 bool all_zero = true;
 for (int i = 0; i < g_device_count; ++i) {
 if (tensor_split[i] != 0.0f) {
@@ -2293,17 +2829,15 @@ inline void ggml_cuda_op_add(
 GGML_ASSERT(src1_ddf_i != nullptr);
 GGML_ASSERT(dst_ddf_i != nullptr);
 
-// TODO: support broadcasting
-GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
 const int64_t ne00 = src0->ne[0];
 const int64_t i01_diff = i01_high - i01_low;
 
-
+const int64_t ne10 = src1->ne[0];
+const int64_t ne11 = src1->ne[1];
 
 // compute
 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
 add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
 } else {
@@ -2327,23 +2861,17 @@ inline void ggml_cuda_op_mul(
 GGML_ASSERT(dst_ddf_i != nullptr);
 
 const int64_t ne00 = src0->ne[0];
+const int64_t i01_diff = i01_high - i01_low;
+
 const int64_t ne10 = src1->ne[0];
 const int64_t ne11 = src1->ne[1];
 
-
-const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
-// compute
-mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-}
+mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 
 (void) dst;
 (void) src0_ddq_i;
 (void) i02;
+(void) i1;
 }
 
 inline void ggml_cuda_op_gelu(
@@ -2423,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
 const int64_t ne00 = src0->ne[0];
 const int64_t i01_diff = i01_high - i01_low;
 
+float eps;
+memcpy(&eps, dst->op_params, sizeof(float));
+
 // compute
-rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
 
 (void) src1;
 (void) dst;
@@ -2452,18 +2983,27 @@ inline void ggml_cuda_op_mul_mat_vec(
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
 
-
+bool mul_mat_vec_q_implemented =
+src0->type == GGML_TYPE_Q4_0 ||
 src0->type == GGML_TYPE_Q4_1 ||
 src0->type == GGML_TYPE_Q5_0 ||
 src0->type == GGML_TYPE_Q5_1 ||
 src0->type == GGML_TYPE_Q8_0;
-
-
+#if QK_K == 256
+mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+src0->type == GGML_TYPE_Q2_K ||
+src0->type == GGML_TYPE_Q3_K ||
+src0->type == GGML_TYPE_Q4_K ||
+src0->type == GGML_TYPE_Q5_K ||
+src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
+
+const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif
 
 if (use_mul_mat_vec_q) {
-int64_t padded_row_size = ne00
-
+const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
 size_t as;
 void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
 quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
@@ -2484,6 +3024,21 @@ inline void ggml_cuda_op_mul_mat_vec(
 case GGML_TYPE_Q8_0:
 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
 break;
+case GGML_TYPE_Q2_K:
+mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+break;
+case GGML_TYPE_Q3_K:
+mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+break;
+case GGML_TYPE_Q4_K:
+mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+break;
+case GGML_TYPE_Q5_K:
+mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+break;
+case GGML_TYPE_Q6_K:
+mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+break;
 default:
 GGML_ASSERT(false);
 break;
@@ -2615,17 +3170,31 @@ inline void ggml_cuda_op_rope(
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *)
- const int n_dims = ((int32_t *)
- const int mode = ((int32_t *)
-
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
+ // RoPE alteration for extended context

-
-
+ float freq_base, freq_scale;
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
+
+ bool is_glm = mode & 4;

  // compute
-
+ if (is_glm) {
+ const float id_p = min(p, n_ctx - 2.f);
+ const float block_p = max(p - (n_ctx - 2.f), 0.f);
+ rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+ } else {
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+ }

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
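To make the new parameter handling easier to follow: the first four int32 op_params are n_past, n_dims, mode and n_ctx, followed by freq_base and freq_scale stored as raw float bits, and the GLM branch splits the scaled position into an in-context part (id_p) and an overflow part (block_p). A host-side sketch of that arithmetic with made-up example values (not taken from the diff):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    // assumed example values, chosen only for illustration
    const int   n_past = 3000, n_dims = 128, n_ctx = 2048;
    const float freq_base = 10000.0f, freq_scale = 1.0f;
    const int   i02 = 0; // row index within the batch

    // per-dimension rotation base and scaled position, as in ggml_cuda_op_rope
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
    const float p = (n_past + i02) * freq_scale;

    // GLM-style split: positions beyond n_ctx - 2 spill into block_p
    const float id_p    = std::min(p, n_ctx - 2.f);
    const float block_p = std::max(p - (n_ctx - 2.f), 0.f);

    printf("theta_scale=%g id_p=%g block_p=%g\n", theta_scale, id_p, block_p);
    return 0;
}
```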
@@ -2644,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
  const int64_t ne01 = src0->ne[1];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *)
+ const int n_past = ((int32_t *) dst->op_params)[0];

  // compute
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
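The kernel launched here writes -INFINITY above the shifted diagonal so attention cannot look past n_past plus the current row. A scalar sketch of that masking rule (the loop layout and sizes are illustrative, not the CUDA kernel):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int ncols = 6, nrows = 3, n_past = 2;
    std::vector<float> x(ncols * nrows, 1.0f);

    // column j of row i is masked once it lies beyond n_past + i
    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            if (j > n_past + i) {
                x[i * ncols + j] = -INFINITY;
            }
        }
    }

    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            printf("%6.1f ", x[i * ncols + j]);
        }
        printf("\n");
    }
    return 0;
}
```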
@@ -2716,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
  const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
  const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+ GGML_ASSERT(ne03 == ne13);

  const int64_t ne0 = dst->ne[0];
  const int64_t ne1 = dst->ne[1];
@@ -2727,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

  // strides for iteration over dims 3 and 2
- const int64_t
- const int64_t
+ const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+ const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+ const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
  const int64_t src0_stride = ne00 * ne01 * stride_mod;
  const int64_t src1_stride = ne10 * ne11 * stride_mod;
  const int64_t dst_stride = ne0 * ne1 * stride_mod;

+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
+ const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+ const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+ GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
  const size_t src0_ts = ggml_type_size(src0->type);
  const size_t src0_bs = ggml_blck_size(src0->type);

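The new num_iters_0/i02_max/i02_divisor bookkeeping lets src1 have more slices along dim 2 than src0, in which case each src0 slice is reused for ne12/ne02 consecutive iterations. A small sketch of that index mapping (names mirror the diff; the shapes are made up):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // assumed example: src0 has 4 slices along dim 2, src1 has 32 (broadcast case)
    const int64_t ne02 = 4, ne12 = 32;

    const int64_t i02_max     = ne02 >= ne12 ? ne02 : ne12;      // iterate over the larger count
    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;  // src1 slices sharing one src0 slice

    for (int64_t i02 = 0; i02 < i02_max; ++i02) {
        const int64_t src0_slice = i02 / i02_divisor; // src0 slice actually read
        if (i02 % i02_divisor == 0) {
            printf("copy src0 slice %lld once, reuse it for %lld iterations\n",
                   (long long) src0_slice, (long long) i02_divisor);
        }
    }
    return 0;
}
```

This is the same divide-and-modulo pattern that shows up again below when the source pointers are computed and when src0 is copied to the device only on every i02_divisor-th iteration.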
@@ -2749,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);

  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+ GGML_ASSERT(!(split && ne02 < ne12));

  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

@@ -2785,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
  } else {
  row_low = 0;
- row_high = nrows0;
+ row_high = nrows0*i02_divisor;
  }
  if (row_low == row_high) {
  continue;
@@ -2833,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
  }

- const int64_t i03_max = flatten_rows ? 1 : ne03;
- const int64_t i02_max = flatten_rows ? 1 : ne02;
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
  for (int64_t i03 = 0; i03 < i03_max; i03++) {
  const int64_t i13 = i03 % ne13;
  for (int64_t i02 = 0; i02 < i02_max; i02++) {
  const int64_t i12 = i02 % ne12;

- const int64_t i0 = i03*
+ const int64_t i0 = i03*i02_max + i02;

  // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
  const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -2876,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t i11 = i13*ne12 + i12;

  // for split tensors the data begins at i0 == i0_offset_low
- char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
- float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+ char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+ float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
- float * dst_ddf_i = dst_ddf[id] + (i0
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

  // for split tensors the data pointer needs to be rounded down
  // to the bin edge for i03, i02 bins beyond the first
@@ -2918,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  }
  }

- if (!src0_on_device || !src0_is_contiguous) {
+ if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
  if (src0_is_f32) {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  } else {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  }
  }

@@ -3076,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  CUDA_CHECK(cudaSetDevice(g_main_device));
  cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

@@ -3088,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
  }

  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3102,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

@@ -3120,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int row_stride_x = nb01 / sizeof(half);
  const int channel_stride_x = nb02 / sizeof(half);

- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
  }

  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
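Passing ne12 into the p021 and nc matrix-vector kernels lets one src0 channel serve several src1 channels; presumably the kernel divides the destination channel by the ratio ne12/ne02 to find the weight channel to read, following the usual broadcasting rule. A host-side sketch of that mapping (not the kernel code, and the shapes are made up):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // assumed shapes: 8 weight channels (ne02) broadcast over 32 activation channels (ne12)
    const int64_t ne02 = 8, ne12 = 32;
    const int64_t channel_ratio = ne12 / ne02;

    for (int64_t channel = 0; channel < ne12; channel += channel_ratio) {
        const int64_t channel_x = channel / channel_ratio; // channel of src0 actually used
        printf("dst channels %lld..%lld read src0 channel %lld\n",
               (long long) channel, (long long) (channel + channel_ratio - 1),
               (long long) channel_x);
    }
    return 0;
}
```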
@@ -3197,6 +3778,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  (void) dst;
  }

+ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_cpy(src0, dst, nullptr);
+ (void) src1;
+ }
+
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3256,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  size_t size = ggml_nbytes_split(tensor, nrows_split);
  const size_t original_size = size;

- // pad last row to a multiple of
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
  if (ne0 % MATRIX_ROW_PADDING != 0) {
  size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
  * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
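The padding here is expressed in bytes, so the extra elements are converted through the quantization type's block geometry. A quick sketch of that size calculation, under the assumption of a Q4_0-like type (18-byte blocks of 32 elements), MATRIX_ROW_PADDING of 512, and a made-up starting size:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    // assumed quantization geometry, roughly matching block_q4_0
    const size_t  type_size = 18;   // bytes per block
    const int64_t blck_size = 32;   // elements per block
    const int64_t padding   = 512;  // MATRIX_ROW_PADDING
    const int64_t ne0       = 4097; // last dim of the tensor (made-up value)
    size_t size = 100000;           // stand-in for ggml_nbytes_split(...)

    if (ne0 % padding != 0) {
        // extra elements needed to reach the next padding boundary, converted to bytes
        size += (padding - ne0 % padding) * type_size / blck_size;
    }
    printf("padded size = %zu bytes\n", size);
    return 0;
}
```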
@@ -3272,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  }


- CUDA_CHECK(cudaMemcpy(buf, buf_host,
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;

@@ -3306,6 +3892,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }

+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+ static size_t g_temp_tensor_extra_index = 0;
+
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (g_temp_tensor_extras == nullptr) {
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = g_temp_tensor_extra_index;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+
  void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
  if (scratch && g_scratch_size == 0) {
  return;
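ggml_cuda_alloc_temp_tensor_extra hands out entries from a fixed array of GGML_MAX_NODES extras in round-robin order instead of new-ing one per view, copy or scratch tensor. A self-contained sketch of the same ring-index pattern (the struct and capacity below are stand-ins, not the ggml types):

```cpp
#include <cstdio>
#include <cstring>

// stand-in for ggml_tensor_extra_gpu
struct extra_t {
    void * data_device[16];
};

static const int kMaxNodes = 4; // stand-in for GGML_MAX_NODES
static extra_t g_extras[kMaxNodes];
static size_t  g_extra_index = 0;

// Return the next slot in the ring, zeroed; entries are reused once the
// ring wraps, which is acceptable for per-graph temporaries.
static extra_t * alloc_temp_extra() {
    const size_t idx = g_extra_index;
    g_extra_index = (g_extra_index + 1) % kMaxNodes;
    extra_t * e = &g_extras[idx];
    memset(e, 0, sizeof(*e));
    return e;
}

int main() {
    for (int i = 0; i < 6; ++i) {
        printf("extra #%d -> slot %td\n", i, alloc_temp_extra() - g_extras);
    }
    return 0;
}
```

Because slots are recycled, a pointer into the ring is only meaningful until the ring wraps, which is presumably why a pool sized by GGML_MAX_NODES is enough for one graph evaluation.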
@@ -3314,7 +3916,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  // recursively assign CUDA buffers until a compute tensor is found
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
  ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
@@ -3323,8 +3925,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  }

  tensor->backend = GGML_BACKEND_GPU;
- struct ggml_tensor_extra_gpu * extra
- memset(extra, 0, sizeof(*extra));
+ struct ggml_tensor_extra_gpu * extra;

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
@@ -3337,12 +3938,14 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->
+ memcpy(&offset, tensor->op_params, sizeof(size_t));
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
  struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
  GGML_ASSERT(size <= g_scratch_size);
@@ -3355,6 +3958,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
  g_scratch_buffer = data;
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = data + g_scratch_offset;

  g_scratch_offset += size;
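The scratch path above is a simple bump allocator: the tensor's extra points at g_scratch_buffer + g_scratch_offset and the offset then advances by the tensor's size. A minimal sketch of that pattern, with a plain byte array standing in for the cudaMalloc'd scratch buffer:

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
    // stand-ins for the device scratch buffer and g_scratch_offset
    std::vector<unsigned char> scratch(1024);
    size_t scratch_offset = 0;

    auto bump_alloc = [&](size_t size) -> void * {
        assert(size <= scratch.size() - scratch_offset);
        void * ptr = scratch.data() + scratch_offset;
        scratch_offset += size; // the next tensor lands right after this one
        return ptr;
    };

    void * a = bump_alloc(256);
    void * b = bump_alloc(128);
    printf("offset of b relative to a: %td bytes\n",
           (unsigned char *) b - (unsigned char *) a);
    return 0;
}
```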
@@ -3364,6 +3968,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  void * data;
  CUDA_CHECK(cudaMalloc(&data, size));
  CUDA_CHECK(cudaMemset(data, 0, size));
+ extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));
  extra->data_device[g_main_device] = data;
  }

@@ -3416,30 +4022,41 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {
- case
+ case GGML_OP_DUP:
  if (!any_on_device) {
  return false;
  }
- func =
+ func = ggml_cuda_dup;
  break;
- case
- if (!any_on_device) {
- return false;
- }
- func = ggml_cuda_mul;
- break;
- case GGML_OP_GELU:
+ case GGML_OP_ADD:
  if (!any_on_device) {
  return false;
  }
- func =
+ func = ggml_cuda_add;
  break;
- case
+ case GGML_OP_MUL:
  if (!any_on_device) {
  return false;
  }
- func =
+ func = ggml_cuda_mul;
  break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
+ case GGML_UNARY_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_silu;
+ break;
+ default:
+ return false;
+ } break;
  case GGML_OP_NORM:
  if (!any_on_device) {
  return false;
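GELU and SILU are no longer top-level ops in this version of ggml; they are sub-cases of GGML_OP_UNARY, so the CUDA dispatcher now inspects ggml_get_unary_op(tensor) to pick the kernel. A stripped-down sketch of that two-level dispatch (the enums and function names below are stand-ins, not the ggml API):

```cpp
#include <cstdio>

// stand-ins for the ggml enums involved in the dispatch
enum op_t       { OP_ADD, OP_MUL, OP_UNARY };
enum unary_op_t { UNARY_GELU, UNARY_SILU, UNARY_OTHER };

struct node_t {
    op_t       op;
    unary_op_t unary_op; // only meaningful when op == OP_UNARY
};

static const char * pick_kernel(const node_t & n) {
    switch (n.op) {
        case OP_ADD: return "ggml_cuda_add";
        case OP_MUL: return "ggml_cuda_mul";
        case OP_UNARY:
            switch (n.unary_op) {          // second level, like ggml_get_unary_op(tensor)
                case UNARY_GELU: return "ggml_cuda_gelu";
                case UNARY_SILU: return "ggml_cuda_silu";
                default:         return nullptr; // unsupported unary op -> fall back to CPU
            }
        default: return nullptr;
    }
}

int main() {
    node_t n{OP_UNARY, UNARY_SILU};
    const char * k = pick_kernel(n);
    printf("%s\n", k ? k : "(cpu fallback)");
    return 0;
}
```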
@@ -3470,6 +4087,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_cpy;
  break;
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE: