llama_cpp 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +0 -5
- data/ext/llama_cpp/src/ggml-cuda.cu +1011 -655
- data/ext/llama_cpp/src/ggml-metal.m +57 -15
- data/ext/llama_cpp/src/ggml-metal.metal +271 -137
- data/ext/llama_cpp/src/ggml.c +7 -3
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +617 -141
- data/ext/llama_cpp/src/llama.h +8 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
@@ -63,18 +63,18 @@ kernel void kernel_mul_row(
|
|
63
63
|
}
|
64
64
|
|
65
65
|
kernel void kernel_scale(
|
66
|
-
device const
|
67
|
-
device
|
66
|
+
device const float4 * src0,
|
67
|
+
device float4 * dst,
|
68
68
|
constant float & scale,
|
69
69
|
uint tpig[[thread_position_in_grid]]) {
|
70
70
|
dst[tpig] = src0[tpig] * scale;
|
71
71
|
}
|
72
72
|
|
73
73
|
kernel void kernel_silu(
|
74
|
-
device const
|
75
|
-
device
|
74
|
+
device const float4 * src0,
|
75
|
+
device float4 * dst,
|
76
76
|
uint tpig[[thread_position_in_grid]]) {
|
77
|
-
|
77
|
+
device const float4 & x = src0[tpig];
|
78
78
|
dst[tpig] = x / (1.0f + exp(-x));
|
79
79
|
}
|
80
80
|
|
@@ -89,10 +89,10 @@ constant float GELU_COEF_A = 0.044715f;
|
|
89
89
|
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
90
90
|
|
91
91
|
kernel void kernel_gelu(
|
92
|
-
device const
|
93
|
-
device
|
92
|
+
device const float4 * src0,
|
93
|
+
device float4 * dst,
|
94
94
|
uint tpig[[thread_position_in_grid]]) {
|
95
|
-
|
95
|
+
device const float4 & x = src0[tpig];
|
96
96
|
|
97
97
|
// BEWARE !!!
|
98
98
|
// Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
|
@@ -107,7 +107,6 @@ kernel void kernel_soft_max(
|
|
107
107
|
constant int64_t & ne00,
|
108
108
|
constant int64_t & ne01,
|
109
109
|
constant int64_t & ne02,
|
110
|
-
threadgroup float * buf [[threadgroup(0)]],
|
111
110
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
112
111
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
113
112
|
uint3 ntg[[threads_per_threadgroup]]) {
|
@@ -119,61 +118,67 @@ kernel void kernel_soft_max(
|
|
119
118
|
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
120
119
|
|
121
120
|
// parallel max
|
122
|
-
|
123
|
-
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
124
|
-
|
125
|
-
}
|
126
|
-
|
127
|
-
// reduce
|
128
|
-
threadgroup_barrier(mem_flags::mem_threadgroup);
|
129
|
-
for (uint i = ntg[0]/2; i > 0; i /= 2) {
|
130
|
-
if (tpitg[0] < i) {
|
131
|
-
buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
|
132
|
-
}
|
133
|
-
threadgroup_barrier(mem_flags::mem_threadgroup);
|
121
|
+
float lmax = psrc0[tpitg[0]];
|
122
|
+
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
|
123
|
+
lmax = MAX(lmax, psrc0[i00]);
|
134
124
|
}
|
135
|
-
|
136
|
-
//// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
|
137
|
-
// the loop, and when that is done, buf[0] has the correct (synchronized) value
|
138
|
-
//if (tpitg[0] == 0) {
|
139
|
-
// buf[0] = buf[0];
|
140
|
-
//}
|
141
|
-
|
142
|
-
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
143
|
-
|
144
|
-
const float max = buf[0];
|
125
|
+
const float max = simd_max(lmax);
|
145
126
|
|
146
127
|
// parallel sum
|
147
|
-
|
128
|
+
float lsum = 0.0f;
|
148
129
|
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
149
130
|
const float exp_psrc0 = exp(psrc0[i00] - max);
|
150
|
-
|
131
|
+
lsum += exp_psrc0;
|
151
132
|
// Remember the result of exp here. exp is expensive, so we really do not
|
152
133
|
// wish to compute it twice.
|
153
134
|
pdst[i00] = exp_psrc0;
|
154
135
|
}
|
155
136
|
|
156
|
-
|
157
|
-
|
158
|
-
for (
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
137
|
+
const float sum = simd_sum(lsum);
|
138
|
+
|
139
|
+
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
140
|
+
pdst[i00] /= sum;
|
141
|
+
}
|
142
|
+
}
|
143
|
+
|
144
|
+
kernel void kernel_soft_max_4(
|
145
|
+
device const float * src0,
|
146
|
+
device float * dst,
|
147
|
+
constant int64_t & ne00,
|
148
|
+
constant int64_t & ne01,
|
149
|
+
constant int64_t & ne02,
|
150
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
151
|
+
uint3 tpitg[[thread_position_in_threadgroup]],
|
152
|
+
uint3 ntg[[threads_per_threadgroup]]) {
|
153
|
+
const int64_t i03 = tgpig[2];
|
154
|
+
const int64_t i02 = tgpig[1];
|
155
|
+
const int64_t i01 = tgpig[0];
|
156
|
+
|
157
|
+
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
158
|
+
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
159
|
+
|
160
|
+
// parallel max
|
161
|
+
float4 lmax4 = psrc4[tpitg[0]];
|
162
|
+
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
|
163
|
+
lmax4 = fmax(lmax4, psrc4[i00]);
|
163
164
|
}
|
165
|
+
float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
164
166
|
|
165
|
-
|
166
|
-
//// broadcast
|
167
|
-
//if (tpitg[0] == 0) {
|
168
|
-
// buf[0] = buf[0];
|
169
|
-
//}
|
167
|
+
const float max = simd_max(lmax);
|
170
168
|
|
171
|
-
//
|
169
|
+
// parallel sum
|
170
|
+
float4 lsum4 = 0.0f;
|
171
|
+
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
|
172
|
+
const float4 exp_psrc4 = exp(psrc4[i00] - max);
|
173
|
+
lsum4 += exp_psrc4;
|
174
|
+
pdst4[i00] = exp_psrc4;
|
175
|
+
}
|
176
|
+
float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
|
172
177
|
|
173
|
-
const float sum =
|
178
|
+
const float sum = simd_sum(lsum);
|
174
179
|
|
175
|
-
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
176
|
-
|
180
|
+
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
|
181
|
+
pdst4[i00] /= sum;
|
177
182
|
}
|
178
183
|
}
|
179
184
|
|
@@ -192,6 +197,33 @@ kernel void kernel_diag_mask_inf(
|
|
192
197
|
dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
|
193
198
|
} else {
|
194
199
|
dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
kernel void kernel_diag_mask_inf_8(
|
204
|
+
device const float4 * src0,
|
205
|
+
device float4 * dst,
|
206
|
+
constant int64_t & ne00,
|
207
|
+
constant int64_t & ne01,
|
208
|
+
constant int & n_past,
|
209
|
+
uint3 tpig[[thread_position_in_grid]]) {
|
210
|
+
|
211
|
+
const int64_t i = 2*tpig[0];
|
212
|
+
|
213
|
+
dst[i+0] = src0[i+0];
|
214
|
+
dst[i+1] = src0[i+1];
|
215
|
+
int64_t i4 = 4*i;
|
216
|
+
const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
|
217
|
+
const int64_t i01 = i4/(ne00); i4 -= i01*ne00;
|
218
|
+
const int64_t i00 = i4;
|
219
|
+
for (int k = 3; k >= 0; --k) {
|
220
|
+
if (i00 + 4 + k <= n_past + i01) {
|
221
|
+
break;
|
222
|
+
}
|
223
|
+
dst[i+1][k] = -INFINITY;
|
224
|
+
if (i00 + k > n_past + i01) {
|
225
|
+
dst[i][k] = -INFINITY;
|
226
|
+
}
|
195
227
|
}
|
196
228
|
}
|
197
229
|
|
@@ -616,6 +648,49 @@ kernel void kernel_mul_mat_f16_f32(
|
|
616
648
|
}
|
617
649
|
}
|
618
650
|
|
651
|
+
// Assumes row size (ne00) is a multiple of 4
|
652
|
+
kernel void kernel_mul_mat_f16_f32_l4(
|
653
|
+
device const char * src0,
|
654
|
+
device const char * src1,
|
655
|
+
device float * dst,
|
656
|
+
constant int64_t & ne00,
|
657
|
+
constant int64_t & ne01,
|
658
|
+
constant int64_t & ne02,
|
659
|
+
constant uint64_t & nb00,
|
660
|
+
constant uint64_t & nb01,
|
661
|
+
constant uint64_t & nb02,
|
662
|
+
constant int64_t & ne10,
|
663
|
+
constant int64_t & ne11,
|
664
|
+
constant int64_t & ne12,
|
665
|
+
constant uint64_t & nb10,
|
666
|
+
constant uint64_t & nb11,
|
667
|
+
constant uint64_t & nb12,
|
668
|
+
constant int64_t & ne0,
|
669
|
+
constant int64_t & ne1,
|
670
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
671
|
+
uint tiisg[[thread_index_in_simdgroup]]) {
|
672
|
+
|
673
|
+
const int nrows = ne11;
|
674
|
+
const int64_t r0 = tgpig.x;
|
675
|
+
const int64_t im = tgpig.z;
|
676
|
+
|
677
|
+
device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
678
|
+
|
679
|
+
for (int r1 = 0; r1 < nrows; ++r1) {
|
680
|
+
device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
|
681
|
+
|
682
|
+
float sumf = 0;
|
683
|
+
for (int i = tiisg; i < ne00/4; i += 32) {
|
684
|
+
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
685
|
+
}
|
686
|
+
|
687
|
+
float all_sum = simd_sum(sumf);
|
688
|
+
if (tiisg == 0) {
|
689
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
690
|
+
}
|
691
|
+
}
|
692
|
+
}
|
693
|
+
|
619
694
|
kernel void kernel_alibi_f32(
|
620
695
|
device const float * src0,
|
621
696
|
device float * dst,
|
@@ -1123,31 +1198,40 @@ kernel void kernel_mul_mat_q3_K_f32(
|
|
1123
1198
|
device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
|
1124
1199
|
device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1;
|
1125
1200
|
|
1126
|
-
float yl[
|
1201
|
+
float yl[32];
|
1127
1202
|
|
1128
|
-
const uint16_t kmask1 =
|
1203
|
+
const uint16_t kmask1 = 0x3030;
|
1129
1204
|
const uint16_t kmask2 = 0x0f0f;
|
1130
1205
|
|
1131
|
-
const int tid = tiisg/
|
1132
|
-
const int ix = tiisg%
|
1133
|
-
const int ip = tid/
|
1134
|
-
const int il = tid/2
|
1206
|
+
const int tid = tiisg/4;
|
1207
|
+
const int ix = tiisg%4;
|
1208
|
+
const int ip = tid/4; // 0 or 1
|
1209
|
+
const int il = 2*((tid%4)/2); // 0 or 2
|
1135
1210
|
const int ir = tid%2;
|
1136
1211
|
const int n = 8;
|
1137
1212
|
const int l0 = n*ir;
|
1138
1213
|
|
1139
|
-
|
1140
|
-
|
1214
|
+
// One would think that the Metal compiler would figure out that ip and il can only have
|
1215
|
+
// 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it
|
1216
|
+
// with these two tables.
|
1217
|
+
//
|
1218
|
+
// Possible masks for the high bit
|
1219
|
+
const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0
|
1220
|
+
{0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2
|
1221
|
+
{0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0
|
1222
|
+
{0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2
|
1223
|
+
|
1224
|
+
// Possible masks for the low 2 bits
|
1225
|
+
const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}};
|
1226
|
+
|
1227
|
+
const ushort4 hm = mm[2*ip + il/2];
|
1141
1228
|
|
1142
1229
|
const int shift = 2*il;
|
1143
|
-
const
|
1144
|
-
const
|
1145
|
-
const int32_t v1 = 4 << shift;
|
1146
|
-
const int32_t v2 = 1024 << shift;
|
1230
|
+
const float v1 = il == 0 ? 4.f : 64.f;
|
1231
|
+
const float v2 = 4.f * v1;
|
1147
1232
|
|
1148
1233
|
const uint16_t s_shift1 = 4*ip;
|
1149
|
-
const uint16_t s_shift2 = s_shift1 +
|
1150
|
-
const int ik = 4 + (il%2);
|
1234
|
+
const uint16_t s_shift2 = s_shift1 + il;
|
1151
1235
|
|
1152
1236
|
const int q_offset = 32*ip + l0;
|
1153
1237
|
const int y_offset = 128*ip + 32*il + l0;
|
@@ -1156,12 +1240,19 @@ kernel void kernel_mul_mat_q3_K_f32(
|
|
1156
1240
|
|
1157
1241
|
device const float * y1 = yy + ix*QK_K + y_offset;
|
1158
1242
|
|
1159
|
-
|
1160
|
-
|
1243
|
+
uint32_t scales32, aux32;
|
1244
|
+
thread uint16_t * scales16 = (thread uint16_t *)&scales32;
|
1245
|
+
thread const int8_t * scales = (thread const int8_t *)&scales32;
|
1246
|
+
|
1247
|
+
float sumf1[2] = {0.f};
|
1248
|
+
float sumf2[2] = {0.f};
|
1249
|
+
for (int i = ix; i < nb; i += 4) {
|
1161
1250
|
|
1162
1251
|
for (int l = 0; l < 8; ++l) {
|
1163
|
-
yl[l+0] = y1[l+ 0];
|
1164
|
-
yl[l+8] = y1[l+16];
|
1252
|
+
yl[l+ 0] = y1[l+ 0];
|
1253
|
+
yl[l+ 8] = y1[l+16];
|
1254
|
+
yl[l+16] = y1[l+32];
|
1255
|
+
yl[l+24] = y1[l+48];
|
1165
1256
|
}
|
1166
1257
|
|
1167
1258
|
device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
|
@@ -1172,27 +1263,43 @@ kernel void kernel_mul_mat_q3_K_f32(
|
|
1172
1263
|
for (int row = 0; row < 2; ++row) {
|
1173
1264
|
|
1174
1265
|
const float d_all = (float)dh[0];
|
1175
|
-
const char2 scales = as_type<char2>((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4)));
|
1176
1266
|
|
1177
|
-
|
1267
|
+
scales16[0] = a[4];
|
1268
|
+
scales16[1] = a[5];
|
1269
|
+
aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030;
|
1270
|
+
scales16[0] = a[il+0];
|
1271
|
+
scales16[1] = a[il+1];
|
1272
|
+
scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32;
|
1273
|
+
|
1274
|
+
float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0;
|
1178
1275
|
for (int l = 0; l < n; l += 2) {
|
1179
|
-
const
|
1180
|
-
s1 += yl[l+0] * (
|
1181
|
-
s2 += yl[l+1] * (
|
1276
|
+
const int32_t qs = q[l/2];
|
1277
|
+
s1 += yl[l+0] * (qs & qm[il/2][0]);
|
1278
|
+
s2 += yl[l+1] * (qs & qm[il/2][1]);
|
1279
|
+
s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]);
|
1280
|
+
s4 += yl[l+16] * (qs & qm[il/2][2]);
|
1281
|
+
s5 += yl[l+17] * (qs & qm[il/2][3]);
|
1282
|
+
s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]);
|
1182
1283
|
}
|
1183
|
-
float
|
1184
|
-
|
1185
|
-
|
1284
|
+
float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
|
1285
|
+
float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
|
1286
|
+
sumf1[row] += d1 * (scales[0] - 32);
|
1287
|
+
sumf2[row] += d2 * (scales[2] - 32);
|
1186
1288
|
|
1187
|
-
s1 = s2 = 0;
|
1289
|
+
s1 = s2 = s3 = s4 = s5 = s6 = 0;
|
1188
1290
|
for (int l = 0; l < n; l += 2) {
|
1189
|
-
const
|
1190
|
-
s1 += yl[l+8] * (
|
1191
|
-
s2 += yl[l+9] * (
|
1291
|
+
const int32_t qs = q[l/2+8];
|
1292
|
+
s1 += yl[l+8] * (qs & qm[il/2][0]);
|
1293
|
+
s2 += yl[l+9] * (qs & qm[il/2][1]);
|
1294
|
+
s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]);
|
1295
|
+
s4 += yl[l+24] * (qs & qm[il/2][2]);
|
1296
|
+
s5 += yl[l+25] * (qs & qm[il/2][3]);
|
1297
|
+
s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]);
|
1192
1298
|
}
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1299
|
+
d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
|
1300
|
+
d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
|
1301
|
+
sumf1[row] += d1 * (scales[1] - 32);
|
1302
|
+
sumf2[row] += d2 * (scales[3] - 32);
|
1196
1303
|
|
1197
1304
|
q += step;
|
1198
1305
|
h += step;
|
@@ -1201,17 +1308,20 @@ kernel void kernel_mul_mat_q3_K_f32(
|
|
1201
1308
|
|
1202
1309
|
}
|
1203
1310
|
|
1204
|
-
y1 +=
|
1311
|
+
y1 += 4 * QK_K;
|
1205
1312
|
|
1206
1313
|
}
|
1207
1314
|
|
1208
1315
|
for (int row = 0; row < 2; ++row) {
|
1209
|
-
const float sumf = (sumf1[row]
|
1210
|
-
|
1211
|
-
|
1212
|
-
|
1316
|
+
const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
|
1317
|
+
sumf1[row] = simd_sum(sumf);
|
1318
|
+
}
|
1319
|
+
if (tiisg == 0) {
|
1320
|
+
for (int row = 0; row < 2; ++row) {
|
1321
|
+
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
|
1213
1322
|
}
|
1214
1323
|
}
|
1324
|
+
|
1215
1325
|
}
|
1216
1326
|
#else
|
1217
1327
|
kernel void kernel_mul_mat_q3_K_f32(
|
@@ -1564,17 +1674,25 @@ kernel void kernel_mul_mat_q5_K_f32(
|
|
1564
1674
|
sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
|
1565
1675
|
sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
|
1566
1676
|
|
1567
|
-
float4
|
1677
|
+
float4 acc1 = {0.f};
|
1678
|
+
float4 acc2 = {0.f};
|
1568
1679
|
for (int l = 0; l < n; ++l) {
|
1569
1680
|
uint8_t h = qh[l];
|
1570
|
-
|
1571
|
-
|
1572
|
-
|
1573
|
-
|
1681
|
+
acc1[0] += yl[l+0] * (q1[l] & 0x0F);
|
1682
|
+
acc1[1] += yl[l+8] * (q1[l] & 0xF0);
|
1683
|
+
acc1[2] += yh[l+0] * (q2[l] & 0x0F);
|
1684
|
+
acc1[3] += yh[l+8] * (q2[l] & 0xF0);
|
1685
|
+
acc2[0] += h & hm1 ? yl[l+0] : 0.f;
|
1686
|
+
acc2[1] += h & hm2 ? yl[l+8] : 0.f;
|
1687
|
+
acc2[2] += h & hm3 ? yh[l+0] : 0.f;
|
1688
|
+
acc2[3] += h & hm4 ? yh[l+8] : 0.f;
|
1574
1689
|
}
|
1575
1690
|
const float dall = dh[0];
|
1576
1691
|
const float dmin = dh[1];
|
1577
|
-
sumf[row] += dall * (
|
1692
|
+
sumf[row] += dall * (sc8[0] * (acc1[0] + 16.f*acc2[0]) +
|
1693
|
+
sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) +
|
1694
|
+
sc8[4] * (acc1[2] + 16.f*acc2[2]) +
|
1695
|
+
sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
|
1578
1696
|
dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
|
1579
1697
|
|
1580
1698
|
q1 += step;
|
@@ -1757,29 +1875,34 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
|
|
1757
1875
|
|
1758
1876
|
template <typename type4x4>
|
1759
1877
|
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
|
1878
|
+
|
1760
1879
|
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
1761
|
-
const
|
1762
|
-
const
|
1880
|
+
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
1881
|
+
const float d2 = d1 / 256.f;
|
1882
|
+
const float md = -8.h * xb->d;
|
1763
1883
|
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
1764
|
-
const ushort mask1 =
|
1884
|
+
const ushort mask1 = mask0 << 8;
|
1765
1885
|
|
1766
1886
|
for (int i=0;i<8;i++) {
|
1767
|
-
reg[i/2][2*(i%2)]
|
1768
|
-
reg[i/2][2*(i%2)+1] = (
|
1887
|
+
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
|
1888
|
+
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
|
1769
1889
|
}
|
1890
|
+
|
1770
1891
|
}
|
1771
1892
|
|
1772
1893
|
template <typename type4x4>
|
1773
1894
|
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
|
1895
|
+
|
1774
1896
|
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
1775
|
-
const
|
1776
|
-
const
|
1897
|
+
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
1898
|
+
const float d2 = d1 / 256.f;
|
1899
|
+
const float m = xb->m;
|
1777
1900
|
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
1778
|
-
const ushort mask1 =
|
1901
|
+
const ushort mask1 = mask0 << 8;
|
1779
1902
|
|
1780
1903
|
for (int i=0;i<8;i++) {
|
1781
|
-
reg[i/2][2*(i%2)]
|
1782
|
-
reg[i/2][2*(i%2)+1] = ((
|
1904
|
+
reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
|
1905
|
+
reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
|
1783
1906
|
}
|
1784
1907
|
}
|
1785
1908
|
|
@@ -1815,7 +1938,7 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
|
|
1815
1938
|
|
1816
1939
|
template <typename type4x4>
|
1817
1940
|
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
|
1818
|
-
const
|
1941
|
+
const half d_all = xb->d;
|
1819
1942
|
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
1820
1943
|
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
1821
1944
|
device const int8_t * scales = (device const int8_t *)xb->scales;
|
@@ -1828,17 +1951,20 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
|
1828
1951
|
((il/4)>0 ? 12 : 3);
|
1829
1952
|
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
|
1830
1953
|
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
|
1831
|
-
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
|
1832
|
-
|
1833
|
-
|
1954
|
+
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
|
1955
|
+
: (scale_2&kmask2) | ((scale_1&kmask1) << 4);
|
1956
|
+
half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
|
1957
|
+
const half ml = 4.h * dl;
|
1834
1958
|
|
1835
|
-
il = (il/2)
|
1836
|
-
|
1837
|
-
uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
1959
|
+
il = (il/2) & 3;
|
1960
|
+
const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
1961
|
+
const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
1962
|
+
dl *= coef;
|
1838
1963
|
|
1839
1964
|
for (int i = 0; i < 16; ++i) {
|
1840
|
-
reg[i/4][i%4] =
|
1965
|
+
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
1841
1966
|
}
|
1967
|
+
|
1842
1968
|
#else
|
1843
1969
|
float kcoef = il&1 ? 1.f/16.f : 1.f;
|
1844
1970
|
uint16_t kmask = il&1 ? 0xF0 : 0x0F;
|
@@ -1852,31 +1978,37 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
|
1852
1978
|
#endif
|
1853
1979
|
}
|
1854
1980
|
|
1981
|
+
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
1982
|
+
return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
|
1983
|
+
: uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
|
1984
|
+
}
|
1985
|
+
|
1855
1986
|
template <typename type4x4>
|
1856
1987
|
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
|
1857
|
-
device const
|
1988
|
+
device const uchar * q = xb->qs;
|
1858
1989
|
|
1859
1990
|
#if QK_K == 256
|
1860
|
-
const float d = (float)(xb->d);
|
1861
|
-
const float min = (float)(xb->dmin);
|
1862
1991
|
short is = (il/4) * 2;
|
1863
1992
|
q = q + (il/4) * 32 + 16 * (il&1);
|
1864
|
-
il = il
|
1865
|
-
const
|
1866
|
-
const
|
1867
|
-
const
|
1993
|
+
il = il & 3;
|
1994
|
+
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
1995
|
+
const half d = il < 2 ? xb->d : xb->d / 16.h;
|
1996
|
+
const half min = xb->dmin;
|
1997
|
+
const half dl = d * sc[0];
|
1998
|
+
const half ml = min * sc[1];
|
1868
1999
|
#else
|
1869
2000
|
q = q + 16 * (il&1);
|
1870
2001
|
device const uint8_t * s = xb->scales;
|
1871
2002
|
device const half2 * dh = (device const half2 *)xb->d;
|
1872
2003
|
const float2 d = (float2)dh[0];
|
1873
2004
|
const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
|
1874
|
-
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1
|
2005
|
+
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
|
1875
2006
|
#endif
|
1876
2007
|
const ushort mask = il<2 ? 0x0F : 0xF0;
|
1877
2008
|
for (int i = 0; i < 16; ++i) {
|
1878
2009
|
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
1879
2010
|
}
|
2011
|
+
|
1880
2012
|
}
|
1881
2013
|
|
1882
2014
|
template <typename type4x4>
|
@@ -1885,19 +2017,19 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
|
1885
2017
|
device const uint8_t * qh = xb->qh;
|
1886
2018
|
|
1887
2019
|
#if QK_K == 256
|
1888
|
-
const float d = (float)(xb->d);
|
1889
|
-
const float min = (float)(xb->dmin);
|
1890
2020
|
short is = (il/4) * 2;
|
1891
2021
|
q = q + 32 * (il/4) + 16 * (il&1);
|
1892
2022
|
qh = qh + 16 * (il&1);
|
1893
2023
|
uint8_t ul = 1 << (il/2);
|
1894
|
-
il = il
|
1895
|
-
const
|
1896
|
-
const
|
1897
|
-
const
|
2024
|
+
il = il & 3;
|
2025
|
+
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
2026
|
+
const half d = il < 2 ? xb->d : xb->d / 16.h;
|
2027
|
+
const half min = xb->dmin;
|
2028
|
+
const half dl = d * sc[0];
|
2029
|
+
const half ml = min * sc[1];
|
1898
2030
|
|
1899
|
-
const ushort mask
|
1900
|
-
const
|
2031
|
+
const ushort mask = il<2 ? 0x0F : 0xF0;
|
2032
|
+
const half qh_val = il<2 ? 16.h : 256.h;
|
1901
2033
|
for (int i = 0; i < 16; ++i) {
|
1902
2034
|
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
1903
2035
|
}
|
@@ -1916,7 +2048,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
|
1916
2048
|
|
1917
2049
|
template <typename type4x4>
|
1918
2050
|
void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
|
1919
|
-
const
|
2051
|
+
const half d_all = xb->d;
|
1920
2052
|
device const uint8_t * ql = (device const uint8_t *)xb->ql;
|
1921
2053
|
device const uint8_t * qh = (device const uint8_t *)xb->qh;
|
1922
2054
|
device const int8_t * scales = (device const int8_t *)xb->scales;
|
@@ -1924,19 +2056,21 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
|
|
1924
2056
|
#if QK_K == 256
|
1925
2057
|
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
1926
2058
|
qh = qh + 32*(il/8) + 16*(il&1);
|
1927
|
-
|
1928
|
-
il = (il/2)
|
2059
|
+
half sc = scales[(il%2) + 2 * ((il/2))];
|
2060
|
+
il = (il/2) & 3;
|
1929
2061
|
#else
|
1930
2062
|
ql = ql + 16 * (il&1);
|
1931
|
-
|
2063
|
+
half sc = scales[il];
|
1932
2064
|
#endif
|
2065
|
+
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
2066
|
+
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
|
2067
|
+
const half coef = il>1 ? 1.f/16.h : 1.h;
|
2068
|
+
const half ml = d_all * sc * 32.h;
|
2069
|
+
const half dl = d_all * sc * coef;
|
1933
2070
|
for (int i = 0; i < 16; ++i) {
|
1934
|
-
|
1935
|
-
|
1936
|
-
|
1937
|
-
float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \
|
1938
|
-
((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef;
|
1939
|
-
reg[i/4][i%4] = d_all * sc * q * coef;
|
2071
|
+
const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
|
2072
|
+
: ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
|
2073
|
+
reg[i/4][i%4] = dl * q - ml;
|
1940
2074
|
}
|
1941
2075
|
}
|
1942
2076
|
|
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
|
2
1
|
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
3
2
|
|
4
3
|
#include "ggml.h"
|
@@ -47,6 +46,10 @@
|
|
47
46
|
// disable "possible loss of data" to avoid hundreds of casts
|
48
47
|
// we should just be careful :)
|
49
48
|
#pragma warning(disable: 4244 4267)
|
49
|
+
|
50
|
+
// disable POSIX deprecation warnings
|
51
|
+
// these functions are never going away, anyway
|
52
|
+
#pragma warning(disable: 4996)
|
50
53
|
#endif
|
51
54
|
|
52
55
|
#if defined(_WIN32)
|
@@ -280,7 +283,7 @@ typedef double ggml_float;
|
|
280
283
|
// 16-bit float
|
281
284
|
// on Arm, we use __fp16
|
282
285
|
// on x86, we use uint16_t
|
283
|
-
#
|
286
|
+
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
284
287
|
|
285
288
|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
286
289
|
//
|
@@ -307,12 +310,14 @@ typedef double ggml_float;
|
|
307
310
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
308
311
|
#include <intrin.h>
|
309
312
|
#else
|
313
|
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
310
314
|
#if !defined(__riscv)
|
311
315
|
#include <immintrin.h>
|
312
316
|
#endif
|
313
317
|
#endif
|
314
318
|
#endif
|
315
319
|
#endif
|
320
|
+
#endif
|
316
321
|
|
317
322
|
#ifdef __riscv_v_intrinsic
|
318
323
|
#include <riscv_vector.h>
|
@@ -18872,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
18872
18877
|
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
|
18873
18878
|
return count;
|
18874
18879
|
}
|
18875
|
-
return count;
|
18876
18880
|
}
|
18877
18881
|
}
|
18878
18882
|
|