llama_cpp 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-quants.c

@@ -14,26 +14,6 @@
 //
 #include <arm_neon.h>
 
-#if !defined(__aarch64__)
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-#endif
-
 #else
 
 #ifdef __wasm_simd128__
@@ -47,13 +27,15 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
@@ -61,6 +43,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 
 #undef MIN
 #undef MAX
+
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
@@ -283,9 +266,31 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
 #if defined(__ARM_NEON)
-
 #if !defined(__aarch64__)
 
+// 64-bit compatibility
+
+// vaddvq_s16
+// vpaddq_s16
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
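These shims back-fill AArch64-only horizontal and pairwise intrinsics when building for 32-bit ARM. As a reference for what the first two compute, here is a portable scalar model (an illustrative sketch only, not part of the package; the ref_* helpers are hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

// Scalar model of vaddvq_s16: widen each of the 8 int16 lanes to int32
// and sum them (a horizontal add across the vector).
static int32_t ref_vaddvq_s16(const int16_t v[8]) {
    int32_t acc = 0;
    for (int i = 0; i < 8; ++i) acc += (int32_t)v[i];
    return acc;
}

// Scalar model of vpaddq_s16: pairwise add, yielding
// [a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7].
static void ref_vpaddq_s16(const int16_t a[8], const int16_t b[8], int16_t out[8]) {
    for (int i = 0; i < 4; ++i) {
        out[i]     = (int16_t)(a[2*i] + a[2*i + 1]);
        out[4 + i] = (int16_t)(b[2*i] + b[2*i + 1]);
    }
}

int main(void) {
    const int16_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int16_t p[8];
    ref_vpaddq_s16(v, v, p);
    printf("%d %d\n", ref_vaddvq_s16(v), p[0]); // prints: 36 3
    return 0;
}
```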
@@ -311,6 +316,96 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
     return res;
 }
 
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t int8x16x2_t
+#define ggml_int8x16x4_t int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2 vld1q_u8_x2
+#define ggml_vld1q_u8_x4 vld1q_u8_x4
+#define ggml_vld1q_s8_x2 vld1q_s8_x2
+#define ggml_vld1q_s8_x4 vld1q_s8_x4
+
 #endif
 #endif
 
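The block above is a compatibility shim: on toolchains whose <arm_neon.h> lacks the multi-register vld1q_*_x2/x4 loads, ggml substitutes a struct plus two or four single-register loads; on everything else, the macros alias the native intrinsics, so call sites compile identically either way. A minimal usage sketch under that assumption (sum_bytes_32 is a hypothetical helper, and the final horizontal add assumes AArch64 or a shim like the ones above):

```c
#include <arm_neon.h>
#include <stdint.h>

// Hypothetical consumer of the wrappers above: sum 32 uint8 values.
// Behaves the same whether ggml_vld1q_u8_x2 is the struct shim or the
// native vld1q_u8_x2 intrinsic.
static uint32_t sum_bytes_32(const uint8_t * p) {
    ggml_uint8x16x2_t v = ggml_vld1q_u8_x2(p);  // 2 x 16 bytes in one call
    // widen to 16 bits and accumulate both halves of both registers
    uint16x8_t s = vaddl_u8(vget_low_u8(v.val[0]), vget_high_u8(v.val[0]));
    s = vaddq_u16(s, vaddl_u8(vget_low_u8(v.val[1]), vget_high_u8(v.val[1])));
    // horizontal add of the eight partial sums (AArch64 intrinsic; 32-bit
    // ARM would need a lane-wise shim like those defined above)
    return (uint32_t)vaddvq_u16(s);
}
```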
@@ -1273,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
     float max = x[0];
     float sum_w = weights[0];
     float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
     for (int i = 1; i < n; ++i) {
+#endif
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
         float w = weights[i];
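The volatile induction variable is the entire workaround: the compiler must re-read i on every iteration, so it cannot unroll the loop, which sidesteps the ld64 1015.7 miscompile. A standalone illustration of the pattern (hypothetical function; HAVE_BUGGY_APPLE_LINKER would be supplied by the build system):

```c
#include <stdio.h>

// Hypothetical demo of the anti-unroll pattern used above: a volatile
// induction variable forces a reload per iteration, so the compiler
// emits the loop body once instead of unrolling it.
static float sum_tail(const float * x, int n) {
    float s = 0.0f;
#ifdef HAVE_BUGGY_APPLE_LINKER
    for (volatile int i = 1; i < n; ++i) {  // never unrolled
#else
    for (int i = 1; i < n; ++i) {           // may be unrolled/vectorized
#endif
        s += x[i];
    }
    return s;
}

int main(void) {
    const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    printf("%.1f\n", sum_tail(x, 4)); // prints: 9.0
    return 0;
}
```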
@@ -3557,7 +3657,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t vzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x2_t q2bytes;
+    ggml_int8x16x2_t q2bytes;
     uint8_t aux[16];
 
     float sum = 0;
@@ -3576,8 +3676,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         vst1q_u8(aux, scales);
 
         const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
-        const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
-        const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
         const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                        vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
         const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -3605,7 +3705,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
-        q8bytes = vld1q_s8_x2(q8); q8 += 32;\
+        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
         q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
         q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
         MULTIPLY_ACCUM_WITH_SCALE((index));
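For context, each byte of the q2 stream packs four 2-bit quants, which the macro above extracts by masking with m3 (0x03) after right-shifting each byte by 0, 2, 4, or 6. A portable sketch of that unpacking (illustrative only, not from the diff):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // 0xE4 = 0b11100100: 2-bit fields from LSB to MSB are 0, 1, 2, 3
    const uint8_t packed = 0xE4;
    for (int shift = 0; shift < 8; shift += 2) {
        printf("%d ", (packed >> shift) & 0x03);  // prints: 0 1 2 3
    }
    printf("\n");
    return 0;
}
```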
@@ -3613,9 +3713,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
         for (int j = 0; j < QK_K/128; ++j) {
 
-            const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32;
+            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
 
-            int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
             q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
             MULTIPLY_ACCUM_WITH_SCALE(0);
@@ -3949,7 +4049,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t vzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q2bytes;
+    ggml_int8x16x4_t q2bytes;
 
     uint32_t aux32[2];
     const uint8_t * scales = (const uint8_t *)aux32;
@@ -3974,7 +4074,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
         const uint8x16_t q2bits = vld1q_u8(q2);
 
-        const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
         q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
         q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
@@ -4238,7 +4338,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     const uint8x16_t m3 = vshlq_n_u8(m0, 3);
     const int8_t m32 = 32;
 
-    int8x16x4_t q3bytes;
+    ggml_int8x16x4_t q3bytes;
 
     float sum = 0;
 
@@ -4250,9 +4350,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict qh = x[i].hmask;
         const int8_t * restrict q8 = y[i].qs;
 
-        uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
-        uint8x16x4_t q3h;
+        ggml_uint8x16x4_t q3h;
 
         int32_t isum = 0;
 
@@ -4268,9 +4368,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
         for (int j = 0; j < QK_K/128; ++j) {
 
-            const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32;
-            const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64;
-            const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64;
+            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
             q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
@@ -4772,7 +4872,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     const uint8x16_t m3b = vdupq_n_u8(0x3);
     const uint8x16_t mh = vdupq_n_u8(4);
 
-    int8x16x4_t q3bytes;
+    ggml_int8x16x4_t q3bytes;
 
     uint16_t aux16[2];
     int8_t * scales = (int8_t *)aux16;
@@ -4781,11 +4881,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     for (int i = 0; i < nb; ++i) {
 
-        uint8x16x4_t q3h;
+        ggml_uint8x16x4_t q3h;
 
         const uint8x8_t hbits = vld1_u8(x[i].hmask);
         const uint8x16_t q3bits = vld1q_u8(x[i].qs);
-        const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);
 
         const uint16_t a = *(const uint16_t *)x[i].scales;
         aux16[0] = a & 0x0f0f;
@@ -5134,8 +5234,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x2_t q4bytes;
-    int8x16x2_t q8bytes;
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x2_t q8bytes;
 
     float sumf = 0;
 
@@ -5170,17 +5270,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         for (int j = 0; j < QK_K/64; ++j) {
 
-            const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32;
+            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
 
 #ifdef __ARM_FEATURE_DOTPROD
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
             q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
            const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
            sumi1 += vaddvq_s32(p1) * scales[2*j+0];
 
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
             q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
 
@@ -5188,7 +5288,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
 #else
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
             q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
             const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5197,7 +5297,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
                                            vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
             sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
 
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
             q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
             const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5512,8 +5612,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     float sumf = 0;
 
-    int8x16x2_t q4bytes;
-    int8x16x4_t q8bytes;
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x4_t q8bytes;
 
     float sum_mins = 0.f;
 
@@ -5534,10 +5634,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         const float d = y[i].d * (float)x[i].d[0];
 
-        const uint8x16x2_t q4bits = vld1q_u8_x2(q4);
+        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
 
 #ifdef __ARM_FEATURE_DOTPROD
-        q8bytes = vld1q_s8_x4(q8);
+        q8bytes = ggml_vld1q_s8_x4(q8);
         q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
         q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
@@ -5551,7 +5651,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
 
 #else
-        q8bytes = vld1q_s8_x4(q8);
+        q8bytes = ggml_vld1q_s8_x4(q8);
         q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
         q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
         const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5785,7 +5885,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q5bytes;
+    ggml_int8x16x4_t q5bytes;
 
     float sumf = 0;
 
@@ -5815,16 +5915,16 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict qh = x[i].qh;
         const int8_t * restrict q8 = y[i].qs;
 
-        uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
-        uint8x16x4_t q5h;
+        ggml_uint8x16x4_t q5h;
 
         int32_t sumi = 0;
 
         for (int j = 0; j < QK_K/64; ++j) {
 
-            const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32;
-            const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
             q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6218,8 +6318,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q5bytes;
-    uint8x16x4_t q5h;
+    ggml_int8x16x4_t q5bytes;
+    ggml_uint8x16x4_t q5h;
 
     float sumf = 0;
 
@@ -6234,8 +6334,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
         const uint8x8_t qhbits = vld1_u8(qh);
 
-        const uint8x16x2_t q5bits = vld1q_u8_x2(q5);
-        const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
         const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
         q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
@@ -6511,8 +6611,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
-    int8x16x4_t q6bytes;
-    uint8x16x4_t q6h;
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -6524,9 +6624,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         const int8_t * restrict scale = x[i].scales;
 
-        const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
         const int8x16_t scales = vld1q_s8(scale);
-        const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+        const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
 
         const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                    vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
@@ -6538,9 +6638,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         for (int j = 0; j < QK_K/128; ++j) {
 
-            uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32;
-            uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64;
-            int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
             q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6583,7 +6683,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
             scale += 2;
 #endif
 
-            q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             shifted = vshrq_n_u8(qhbits.val[0], 4);
             q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
@@ -6987,8 +7087,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
-    int8x16x4_t q6bytes;
-    uint8x16x4_t q6h;
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -7002,9 +7102,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         int32_t isum = 0;
 
-        uint8x16_t qhbits = vld1q_u8(qh);
-        uint8x16x2_t q6bits = vld1q_u8_x2(q6);
-        int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        uint8x16_t qhbits = vld1q_u8(qh);
+        ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
+        ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
         q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
         uint8x16_t shifted = vshrq_n_u8(qhbits, 2);