llama_cpp 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +439 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +759 -136
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +250 -111
- data/ext/llama_cpp/src/ggml-metal.metal +614 -483
- data/ext/llama_cpp/src/ggml.c +793 -1032
- data/ext/llama_cpp/src/ggml.h +95 -18
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +626 -166
- data/ext/llama_cpp/src/llama.h +94 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +36 -1
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -13,6 +13,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(
 
 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half d; // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d; // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half d; // delta
     int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 
 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d; // delta
     half s; // unquantized sum
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif
 
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
 
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif
 
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
 
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2]; // quants, lower 4 bits
     uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,7 +220,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
-#define MATRIX_ROW_PADDING
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
@@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >=
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }
 
 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -320,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6f;
-
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += WARP_SIZE) {
@@ -923,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
     float tmp = 0; // partial sum for thread in warp
 
     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
@@ -941,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
         float4 s = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
-        for (int l = 0; l <
-            s.x += y1[l] *
-            s.z += y2[l] *
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
         }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
 
     }
 #else
@@ -1028,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
     for (int i = ix; i < num_blocks_per_row; i += 2) {
 
         const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * ql2 = ql1 + 64;
         const uint8_t * qh = x[i].qh + l0;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
@@ -1047,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
         float4 sum = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
         for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (
-                   + y1[l+16] * (
-            sum.y += y1[l+32] * (
-                   + y1[l+48] * (
-            sum.z += y2[l+ 0] * (
-                   + y2[l+16] * (
-            sum.w += y2[l+32] * (
-                   + y2[l+48] * (
+            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
             smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
         }
@@ -1271,8 +1326,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
     y[iybs + iqs + y_offset] = v.y;
 }
 
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1293,11 +1349,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1375,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;
@@ -1353,11 +1411,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1446,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1406,7 +1466,342 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#ifndef GGML_QKK_64
+
+    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];
+
+        const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+        const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->d[0];
+    const float dmin = bq4_K->d[1];
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + iqs;
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#endif
+
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+#ifndef GGML_QKK_64
+
+    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int vh1 = qh[0] >> bq8_offset;
+    const int vh2 = qh[4] >> bq8_offset;
+
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m = sc + 2;
+
+    for (int i = 0; i < QR5_K; ++i) {
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];
+
+        const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+        const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+        const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+        const int vi1 = vil1 | vih1;
+        const int vi2 = vil2 | vih2;
+
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]);
+
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * ql = (const int *)bq5_K->qs + iqs;
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * iqs; // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#endif
+
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1429,7 +1824,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
 
-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
 
         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
 
@@ -1515,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(
+static __global__ void mul_mat_p021_f16_f32(
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int channel_x = channel / (nchannels_y / nchannels_x);
 
     const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
@@ -1535,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
         }
 
         // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x +
+        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
         const float xi = __half2float(x[ix]);
 
         const int row_y = col_x;
@@ -1563,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x) {
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
 
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int channel_x = channel / channel_x_divisor;
 
     const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
@@ -1585,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
             break;
         }
 
-        const int ix =
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
         const float xi = __half2float(x[ix]);
 
         const int row_y = col_x;
@@ -1667,6 +2067,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1732,9 +2166,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int
-    const int num_blocks = (
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst,
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1763,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -1928,7 +2362,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1937,7 +2371,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1946,7 +2380,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1955,7 +2389,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1964,7 +2398,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1972,6 +2406,57 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2015,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
-static void ggml_mul_mat_p021_f16_f32_cuda(
-    const
+static void ggml_mul_mat_p021_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
 }
 
 static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
 
-    const dim3 block_nums(1, nrows_x,
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -2064,6 +2552,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2106,20 +2602,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.
-
-
-            b.
-            b.size =
-
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
         }
     }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     void * ptr;
-
-
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
     return ptr;
 }
 
@@ -2147,7 +2676,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2170,7 +2701,9 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -2195,6 +2728,9 @@ void ggml_init_cublas() {
 }
 
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
@@ -2293,17 +2829,15 @@ inline void ggml_cuda_op_add(
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
-    // TODO: support broadcasting
-    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
-
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
 
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
@@ -2327,23 +2861,17 @@ inline void ggml_cuda_op_mul(
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
-
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
-        // compute
-        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 
     (void) dst;
     (void) src0_ddq_i;
     (void) i02;
+    (void) i1;
 }
 
 inline void ggml_cuda_op_gelu(
@@ -2423,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
     // compute
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
 
     (void) src1;
     (void) dst;
@@ -2452,18 +2983,27 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-
+    bool mul_mat_vec_q_implemented =
+        src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
-
-
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
+
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
-        int64_t padded_row_size = ne00
-
+        const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+            ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
         quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
@@ -2484,6 +3024,21 @@ inline void ggml_cuda_op_mul_mat_vec(
             case GGML_TYPE_Q8_0:
                 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                 break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
             default:
                 GGML_ASSERT(false);
                 break;
@@ -2615,17 +3170,31 @@ inline void ggml_cuda_op_rope(
|
|
2615
3170
|
const int64_t ne00 = src0->ne[0];
|
2616
3171
|
const int64_t i01_diff = i01_high - i01_low;
|
2617
3172
|
|
2618
|
-
const int n_past = ((int32_t *)
|
2619
|
-
const int n_dims = ((int32_t *)
|
2620
|
-
const int mode = ((int32_t *)
|
2621
|
-
|
3173
|
+
const int n_past = ((int32_t *) dst->op_params)[0];
|
3174
|
+
const int n_dims = ((int32_t *) dst->op_params)[1];
|
3175
|
+
const int mode = ((int32_t *) dst->op_params)[2];
|
3176
|
+
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
3177
|
+
// RoPE alteration for extended context
|
2622
3178
|
|
2623
|
-
|
2624
|
-
|
3179
|
+
float freq_base, freq_scale;
|
3180
|
+
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
3181
|
+
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
3182
|
+
|
3183
|
+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
3184
|
+
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
|
3185
|
+
|
3186
|
+
bool is_glm = mode & 4;
|
2625
3187
|
|
2626
3188
|
// compute
|
2627
|
-
|
3189
|
+
if (is_glm) {
|
3190
|
+
const float id_p = min(p, n_ctx - 2.f);
|
3191
|
+
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
3192
|
+
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
3193
|
+
} else {
|
3194
|
+
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
|
3195
|
+
}
|
2628
3196
|
|
3197
|
+
(void) src1;
|
2629
3198
|
(void) dst;
|
2630
3199
|
(void) src0_ddq_i;
|
2631
3200
|
(void) src1_ddf_i;
|
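The rope hunk above reads four int32 parameters and two floats out of the flat dst->op_params buffer and derives theta_scale = freq_base^(-2/n_dims). A small host-side sketch of that unpacking, not part of the package; the values and the plain op_params array below are made up for illustration (the layout is assumed from the memcpy calls above: n_past, n_dims, mode, n_ctx, then freq_base and freq_scale):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // [0]=n_past, [1]=n_dims, [2]=mode, [3]=n_ctx, [4..5]=freq_base/freq_scale as float bits
    int32_t op_params[6] = {32, 128, 0, 2048, 0, 0};
    const float freq_base_in = 10000.0f, freq_scale_in = 1.0f;
    memcpy(op_params + 4, &freq_base_in,  sizeof(float));
    memcpy(op_params + 5, &freq_scale_in, sizeof(float));

    float freq_base, freq_scale;
    memcpy(&freq_base,  op_params + 4, sizeof(float));
    memcpy(&freq_scale, op_params + 5, sizeof(float));

    const float theta_scale = powf(freq_base, -2.0f/op_params[1]);
    const float p = (op_params[0] + 0) * freq_scale; // mode bit 0 clear: p = (n_past + i02) * freq_scale, with i02 = 0 here
    printf("theta_scale = %f, p = %f\n", theta_scale, p);
    return 0;
}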
@@ -2644,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
     const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;

-    const int n_past = ((int32_t *)
+    const int n_past = ((int32_t *) dst->op_params)[0];

     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);

+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
     (void) src1_ddf_i;

@@ -2716,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
     const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
     const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);

     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];

@@ -2727,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

     // strides for iteration over dims 3 and 2
-    const int64_t
-    const int64_t
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+    const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
     const int64_t src0_stride = ne00 * ne01 * stride_mod;
     const int64_t src1_stride = ne10 * ne11 * stride_mod;
     const int64_t dst_stride = ne0 * ne1 * stride_mod;

+    const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+    const int64_t i03_max = flatten_rows ? 1 : ne03;
+    const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+    GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);

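The new i02_divisor bookkeeping above lets a src0 with fewer dim-2 slices be broadcast against a larger src1: each src0 slice is reused for i02_divisor consecutive iterations, which is why the indexing later in this diff divides i0 and i02 by i02_divisor. A standalone illustration, not part of the package, with made-up slice counts:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne12 = 8;                            // made-up dim-2 sizes for src0 and src1
    const int64_t i02_max     = ne02 >= ne12 ? ne02 : ne12;
    const int64_t i02_divisor = ne02 >= ne12 ? 1    : ne12 / ne02;

    for (int64_t i02 = 0; i02 < i02_max; i02++) {
        // dst/src1 slice i02 is computed from src0 slice i02/i02_divisor
        printf("i02 = %lld -> src0 slice %lld\n", (long long) i02, (long long) (i02 / i02_divisor));
    }
    return 0;
}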
@@ -2749,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);

     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));

     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);


@@ -2785,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
         } else {
             row_low = 0;
-            row_high = nrows0;
+            row_high = nrows0*i02_divisor;
         }
         if (row_low == row_high) {
             continue;

@@ -2833,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
            dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
        }

-        const int64_t i03_max = flatten_rows ? 1 : ne03;
-        const int64_t i02_max = flatten_rows ? 1 : ne02;
-        const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
        for (int64_t i03 = 0; i03 < i03_max; i03++) {
            const int64_t i13 = i03 % ne13;
            for (int64_t i02 = 0; i02 < i02_max; i02++) {
                const int64_t i12 = i02 % ne12;

-                const int64_t i0 = i03*
+                const int64_t i0 = i03*i02_max + i02;

                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
                const int64_t i0_offset_low = row_low/rows_per_iter;

@@ -2876,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                const int64_t i11 = i13*ne12 + i12;

                // for split tensors the data begins at i0 == i0_offset_low
-                char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
-                float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+                char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
                float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
-                float * dst_ddf_i = dst_ddf[id] + (i0
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

                // for split tensors the data pointer needs to be rounded down
                // to the bin edge for i03, i02 bins beyond the first

@@ -2918,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                    }
                }

-                if (!src0_on_device || !src0_is_contiguous) {
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
                    if (src0_is_f32) {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                    } else {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                    }
                }


@@ -3076,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

+    const int64_t ne12 = src1->ne[2];
+
    CUDA_CHECK(cudaSetDevice(g_main_device));
    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];


@@ -3088,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
 }

 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){

@@ -3102,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

+    const int64_t ne12 = src1->ne[2];
+
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];


@@ -3120,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
    const int row_stride_x = nb01 / sizeof(half);
    const int channel_stride_x = nb02 / sizeof(half);

-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
 }

 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ -3197,6 +3778,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
    (void) dst;
 }

+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);

@@ -3256,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

-        // pad last row to a multiple of
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);

@@ -3272,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
        }


-        CUDA_CHECK(cudaMemcpy(buf, buf_host,
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

        extra->data_device[id] = buf;


@@ -3306,6 +3892,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
    delete extra;
 }

+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
    if (scratch && g_scratch_size == 0) {
        return;
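ggml_cuda_alloc_temp_tensor_extra above hands out ggml_tensor_extra_gpu records round-robin from a fixed pool of GGML_MAX_NODES entries instead of a new/delete pair per tensor, so extras for views, copies, and scratch tensors are simply recycled on the next pass rather than freed individually. A stripped-down sketch of the same pattern, not part of the package, with a placeholder record type and pool size:

#include <cstddef>
#include <cstring>

struct extra_record { void * data_device[16]; };   // placeholder for ggml_tensor_extra_gpu

static const size_t   POOL_SIZE = 4096;            // placeholder for GGML_MAX_NODES
static extra_record * g_pool       = nullptr;
static size_t         g_pool_index = 0;

static extra_record * alloc_temp_extra() {
    if (g_pool == nullptr) {
        g_pool = new extra_record[POOL_SIZE];      // allocated once, reused for every graph
    }
    extra_record * extra = &g_pool[g_pool_index];
    g_pool_index = (g_pool_index + 1) % POOL_SIZE; // round-robin reuse of slots
    memset(extra, 0, sizeof(*extra));              // zeroed on every handout
    return extra;
}

int main() {
    extra_record * a = alloc_temp_extra();
    extra_record * b = alloc_temp_extra();
    return a == b ? 1 : 0;                         // distinct slots until the pool wraps
}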
@@ -3314,7 +3916,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
    // recursively assign CUDA buffers until a compute tensor is found
    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
        const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
        }
    }

@@ -3323,8 +3925,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
    }

    tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;

    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW ||

@@ -3337,12 +3938,14 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->
+            memcpy(&offset, tensor->op_params, sizeof(size_t));
        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = src0_ddc + offset;
    } else if (tensor->op == GGML_OP_CPY) {
        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
        void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = src1_ddv;
    } else if (scratch) {
        GGML_ASSERT(size <= g_scratch_size);

@@ -3355,6 +3958,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
            CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
            g_scratch_buffer = data;
        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = data + g_scratch_offset;

        g_scratch_offset += size;

@@ -3364,6 +3968,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
        void * data;
        CUDA_CHECK(cudaMalloc(&data, size));
        CUDA_CHECK(cudaMemset(data, 0, size));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
        extra->data_device[g_main_device] = data;
    }


@@ -3416,30 +4022,41 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

    switch (tensor->op) {
-        case
+        case GGML_OP_DUP:
            if (!any_on_device) {
                return false;
            }
-            func =
+            func = ggml_cuda_dup;
            break;
-        case
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_mul;
-            break;
-        case GGML_OP_GELU:
+        case GGML_OP_ADD:
            if (!any_on_device) {
                return false;
            }
-            func =
+            func = ggml_cuda_add;
            break;
-        case
+        case GGML_OP_MUL:
            if (!any_on_device) {
                return false;
            }
-            func =
+            func = ggml_cuda_mul;
            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_silu;
+                    break;
+                default:
+                    return false;
+            } break;
        case GGML_OP_NORM:
            if (!any_on_device) {
                return false;
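In the compute_forward hunk above, GELU and SILU are no longer top-level ops; they are reached through GGML_OP_UNARY plus ggml_get_unary_op(tensor), so the dispatch becomes a nested switch. A schematic of that shape, not part of the package, with placeholder enums and handler functions rather than the ggml API:

#include <cstddef>

enum op_t    { OP_DUP, OP_ADD, OP_MUL, OP_UNARY };     // placeholder op enum
enum unary_t { UNARY_GELU, UNARY_SILU, UNARY_OTHER };  // placeholder unary-op enum

typedef void (*handler_t)();
static void do_gelu() {}
static void do_silu() {}

// Returns the handler for (op, unary), or nullptr when the op is not handled,
// mirroring the nested-switch dispatch shape used above.
static handler_t pick_handler(op_t op, unary_t unary) {
    handler_t func = nullptr;
    switch (op) {
        case OP_UNARY:
            switch (unary) {
                case UNARY_GELU: func = do_gelu; break;
                case UNARY_SILU: func = do_silu; break;
                default:         return nullptr;   // unary op not covered by this sketch
            } break;
        default:
            return nullptr;                         // non-unary ops omitted from this sketch
    }
    return func;
}

int main() {
    return pick_handler(OP_UNARY, UNARY_GELU) == do_gelu ? 0 : 1;
}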
@@ -3470,6 +4087,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            }
            func = ggml_cuda_cpy;
            break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE: