llama_cpp 0.9.2 → 0.9.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-quants.c CHANGED
@@ -14,32 +14,12 @@
 //
 #include <arm_neon.h>
 
-#if !defined(__aarch64__)
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-#endif
-
 #else
 
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
-#if defined(__POWER9_VECTOR__)
+#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
 #include <altivec.h>
 #undef bool
 #define bool _Bool
@@ -47,13 +27,15 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
@@ -61,6 +43,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 
 #undef MIN
 #undef MAX
+
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
@@ -283,9 +266,31 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
 #if defined(__ARM_NEON)
-
 #if !defined(__aarch64__)
 
+// 64-bit compatibility
+
+// vaddvq_s16
+// vpaddq_s16
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
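Note: the hunk above moves the pre-AArch64 NEON fallbacks later in the file. vaddvq_s16, vpaddq_s16, and vaddvq_s32 are AArch64-only intrinsics, so on 32-bit ARM they are emulated with scalar lane extraction. A self-contained sketch of that emulation (not gem code; emu_vaddvq_s16 is an illustrative name, and the comparison assumes an AArch64 toolchain where the real intrinsic exists):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// same scalar emulation as the fallback in the hunk above
static inline int32_t emu_vaddvq_s16(int16x8_t v) {
    return
        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
}

int main(void) {
    const int16_t data[8] = {1, -2, 3, -4, 5, -6, 7, -8};
    const int16x8_t v = vld1q_s16(data);
    // both lines should print -4
    printf("intrinsic: %d\n", (int)vaddvq_s16(v));
    printf("emulated:  %d\n", emu_vaddvq_s16(v));
    return 0;
}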
@@ -311,6 +316,96 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
     return res;
 }
 
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+
 #endif
 #endif
 
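The ggml_vld1q_* wrappers added above exist because the multi-register loads (vld1q_u8_x2, vld1q_s8_x4, etc.) are missing from some 32-bit ARM toolchains; on AArch64 the #else branch aliases the wrappers directly to the native intrinsics, so they cost nothing. A minimal self-contained sketch of the same fallback pattern, assuming only <arm_neon.h> (my_int8x16x2_t and my_vld1q_s8_x2 are illustrative names, not gem API; the reduction uses vaddvq_s16, so it assumes AArch64 or a fallback like the one earlier in this diff):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

typedef struct my_int8x16x2_t {
    int8x16_t val[2];
} my_int8x16x2_t;

// two plain vld1q_s8 loads stand in for vld1q_s8_x2 where it is missing
static inline my_int8x16x2_t my_vld1q_s8_x2(const int8_t * ptr) {
    my_int8x16x2_t res;
    res.val[0] = vld1q_s8(ptr + 0);
    res.val[1] = vld1q_s8(ptr + 16);
    return res;
}

int main(void) {
    int8_t data[32];
    for (int i = 0; i < 32; ++i) data[i] = (int8_t)i;

    my_int8x16x2_t v = my_vld1q_s8_x2(data);

    // widen each half to 16-bit and reduce: 0 + 1 + ... + 31 = 496
    int32_t sum = 0;
    sum += vaddvq_s16(vmovl_s8(vget_low_s8 (v.val[0])));
    sum += vaddvq_s16(vmovl_s8(vget_high_s8(v.val[0])));
    sum += vaddvq_s16(vmovl_s8(vget_low_s8 (v.val[1])));
    sum += vaddvq_s16(vmovl_s8(vget_high_s8(v.val[1])));
    printf("sum = %d\n", sum);  // expect 496
    return 0;
}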
@@ -1273,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
     float max = x[0];
     float sum_w = weights[0];
     float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
     for (int i = 1; i < n; ++i) {
+#endif
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
         float w = weights[i];
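The HAVE_BUGGY_APPLE_LINKER guard above relies on a volatile induction variable, a standard way to force a load/store on every iteration so the compiler cannot unroll or autovectorize the loop; per the in-diff comment this sidesteps a bug in Apple ld64 1015.7. A standalone sketch of the same shape (sum_f32 is a hypothetical helper, not ggml code):

#include <stdio.h>

// illustrative only: mirrors the #ifdef structure of the hunk above
static float sum_f32(const float * x, int n) {
    float acc = x[0];
#ifdef HAVE_BUGGY_APPLE_LINKER
    // 'volatile' keeps the loop scalar, avoiding the code pattern that
    // triggered the linker bug
    for (volatile int i = 1; i < n; ++i) {
#else
    for (int i = 1; i < n; ++i) {
#endif
        acc += x[i];
    }
    return acc;
}

int main(void) {
    const float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    printf("%f\n", sum_f32(v, 4));  // 10.000000
    return 0;
}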
@@ -3557,7 +3657,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t vzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x2_t q2bytes;
+    ggml_int8x16x2_t q2bytes;
     uint8_t aux[16];
 
     float sum = 0;
@@ -3576,8 +3676,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         vst1q_u8(aux, scales);
 
         const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
-        const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
-        const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
         const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                        vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
         const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -3605,7 +3705,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
-        q8bytes = vld1q_s8_x2(q8); q8 += 32;\
+        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
         q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
         q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
         MULTIPLY_ACCUM_WITH_SCALE((index));
@@ -3613,9 +3713,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
         for (int j = 0; j < QK_K/128; ++j) {
 
-            const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32;
+            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
 
-            int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
             q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
             MULTIPLY_ACCUM_WITH_SCALE(0);
@@ -3949,7 +4049,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t vzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q2bytes;
+    ggml_int8x16x4_t q2bytes;
 
     uint32_t aux32[2];
     const uint8_t * scales = (const uint8_t *)aux32;
@@ -3974,7 +4074,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
         const uint8x16_t q2bits = vld1q_u8(q2);
 
-        const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
         q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
         q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
@@ -4238,7 +4338,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     const uint8x16_t m3 = vshlq_n_u8(m0, 3);
     const int8_t m32 = 32;
 
-    int8x16x4_t q3bytes;
+    ggml_int8x16x4_t q3bytes;
 
     float sum = 0;
 
@@ -4250,9 +4350,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict qh = x[i].hmask;
         const int8_t * restrict q8 = y[i].qs;
 
-        uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
-        uint8x16x4_t q3h;
+        ggml_uint8x16x4_t q3h;
 
         int32_t isum = 0;
 
@@ -4268,9 +4368,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
        for (int j = 0; j < QK_K/128; ++j) {
 
-            const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32;
-            const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64;
-            const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64;
+            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
             q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
@@ -4772,7 +4872,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     const uint8x16_t m3b = vdupq_n_u8(0x3);
     const uint8x16_t mh = vdupq_n_u8(4);
 
-    int8x16x4_t q3bytes;
+    ggml_int8x16x4_t q3bytes;
 
     uint16_t aux16[2];
     int8_t * scales = (int8_t *)aux16;
@@ -4781,11 +4881,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     for (int i = 0; i < nb; ++i) {
 
-        uint8x16x4_t q3h;
+        ggml_uint8x16x4_t q3h;
 
         const uint8x8_t hbits = vld1_u8(x[i].hmask);
         const uint8x16_t q3bits = vld1q_u8(x[i].qs);
-        const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);
 
         const uint16_t a = *(const uint16_t *)x[i].scales;
         aux16[0] = a & 0x0f0f;
@@ -5134,8 +5234,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x2_t q4bytes;
-    int8x16x2_t q8bytes;
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x2_t q8bytes;
 
     float sumf = 0;
 
@@ -5170,17 +5270,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         for (int j = 0; j < QK_K/64; ++j) {
 
-            const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32;
+            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
 
 #ifdef __ARM_FEATURE_DOTPROD
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
             q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
             const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
             sumi1 += vaddvq_s32(p1) * scales[2*j+0];
 
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
             q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
 
@@ -5188,7 +5288,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
             sumi2 += vaddvq_s32(p2) * scales[2*j+1];
 #else
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
             q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
             const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5197,7 +5297,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
                                            vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
             sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
 
-            q8bytes = vld1q_s8_x2(q8); q8 += 32;
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
             q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
             const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5512,8 +5612,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     float sumf = 0;
 
-    int8x16x2_t q4bytes;
-    int8x16x4_t q8bytes;
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x4_t q8bytes;
 
     float sum_mins = 0.f;
 
@@ -5534,10 +5634,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         const float d = y[i].d * (float)x[i].d[0];
 
-        const uint8x16x2_t q4bits = vld1q_u8_x2(q4);
+        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
 
 #ifdef __ARM_FEATURE_DOTPROD
-        q8bytes = vld1q_s8_x4(q8);
+        q8bytes = ggml_vld1q_s8_x4(q8);
         q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
         q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
@@ -5551,7 +5651,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
 
 #else
-        q8bytes = vld1q_s8_x4(q8);
+        q8bytes = ggml_vld1q_s8_x4(q8);
         q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
         q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
         const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5785,7 +5885,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q5bytes;
+    ggml_int8x16x4_t q5bytes;
 
     float sumf = 0;
 
@@ -5815,16 +5915,16 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict qh = x[i].qh;
         const int8_t * restrict q8 = y[i].qs;
 
-        uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
-        uint8x16x4_t q5h;
+        ggml_uint8x16x4_t q5h;
 
         int32_t sumi = 0;
 
         for (int j = 0; j < QK_K/64; ++j) {
 
-            const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32;
-            const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
             q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6218,8 +6318,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
-    int8x16x4_t q5bytes;
-    uint8x16x4_t q5h;
+    ggml_int8x16x4_t q5bytes;
+    ggml_uint8x16x4_t q5h;
 
     float sumf = 0;
 
@@ -6234,8 +6334,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
         const uint8x8_t qhbits = vld1_u8(qh);
 
-        const uint8x16x2_t q5bits = vld1q_u8_x2(q5);
-        const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
+        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
         const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
         q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
@@ -6511,8 +6611,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
-    int8x16x4_t q6bytes;
-    uint8x16x4_t q6h;
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -6524,9 +6624,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         const int8_t * restrict scale = x[i].scales;
 
-        const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
         const int8x16_t scales = vld1q_s8(scale);
-        const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+        const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
 
         const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                    vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
@@ -6538,9 +6638,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         for (int j = 0; j < QK_K/128; ++j) {
 
-            uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32;
-            uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64;
-            int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
             q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6583,7 +6683,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
             scale += 2;
 #endif
 
-            q8bytes = vld1q_s8_x4(q8); q8 += 64;
+            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
             shifted = vshrq_n_u8(qhbits.val[0], 4);
             q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
@@ -6987,8 +7087,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
-    int8x16x4_t q6bytes;
-    uint8x16x4_t q6h;
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -7002,9 +7102,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         int32_t isum = 0;
 
-        uint8x16_t qhbits = vld1q_u8(qh);
-        uint8x16x2_t q6bits = vld1q_u8_x2(q6);
-        int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+        uint8x16_t qhbits = vld1q_u8(qh);
+        ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
+        ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
         q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
         uint8x16_t shifted = vshrq_n_u8(qhbits, 2);