llama_cpp 0.9.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,10 +1,8 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#define _USE_MATH_DEFINES // For M_PI on MSVC
 
-#include "ggml.h"
-
-#ifdef GGML_USE_K_QUANTS
-#include "k_quants.h"
-#endif
+#include "ggml-impl.h"
+#include "ggml-quants.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -30,18 +28,6 @@
 #include <unistd.h>
 #endif
 
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -109,23 +95,11 @@ typedef void * thread_ret_t;
 #include <unistd.h>
 
 #endif
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
 
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -251,228 +225,27 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-opencl.h"
 #endif
 
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 // floating point type used to accumulate sums
 typedef double ggml_float;
 
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
-
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
 //
 // global data
 //
 
 // precomputed gelu table for f16 (128 KB)
-static ggml_fp16_t table_gelu_f16[1 << 16];
+static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
 
 // precomputed quick gelu table for f16 (128 KB)
-static ggml_fp16_t table_gelu_quick_f16[1 << 16];
+static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 
 // precomputed silu table for f16 (128 KB)
-static ggml_fp16_t table_silu_f16[1 << 16];
+static ggml_fp16_t ggml_table_silu_f16[1 << 16];
 
 // precomputed exp table for f16 (128 KB)
-static ggml_fp16_t table_exp_f16[1 << 16];
-
-// precomputed f32 table for f16 (256 KB)
-static float table_f32_f16[1 << 16];
-
-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
-#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
-#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
-#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
-#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
-#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
-#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
-#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
-#define B8(c,s  ) B7(c,s,  c), B7(c,s,  s)
-
-// precomputed tables for expanding 8bits to 8 bytes:
-static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
-static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
-#endif
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+static ggml_fp16_t ggml_table_exp_f16[1 << 16];
 
-
+// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
+float ggml_table_f32_f16[1 << 16];
 
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
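The hunk above moves the FP16 conversion helpers out of ggml.c (they now come in via ggml-impl.h) while keeping the 1 << 16-entry ggml_table_f32_f16 lookup table defined here for use through the ggml.h API. A minimal sketch of the pattern those lines rely on, mirroring the removed ggml_lookup_fp16_to_fp32 helper: assuming ggml_fp16_t is stored as a raw 16-bit pattern, the table turns each fp16 -> fp32 conversion into a single array load. The example_lookup_fp16_to_fp32 name is illustrative, not part of the library.

#include <stdint.h>
#include <string.h>

typedef uint16_t ggml_fp16_t;              // assumption: fp16 kept as its raw 16-bit pattern
extern float ggml_table_f32_f16[1 << 16];  // the table defined in the hunk above

// Hypothetical helper following the removed ggml_lookup_fp16_to_fp32:
static inline float example_lookup_fp16_to_fp32(ggml_fp16_t h) {
    uint16_t s;
    memcpy(&s, &h, sizeof(uint16_t));      // reinterpret the fp16 bits as a table index
    return ggml_table_f32_f16[s];          // one load instead of per-call bit manipulation
}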
@@ -587,3071 +360,816 @@ int64_t ggml_cycles_per_ms(void) {
|
|
587
360
|
|
588
361
|
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
589
362
|
|
590
|
-
|
591
|
-
|
592
|
-
//
|
363
|
+
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
364
|
+
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
593
365
|
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
366
|
+
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
367
|
+
[GGML_TYPE_I8] = {
|
368
|
+
.type_name = "i8",
|
369
|
+
.blck_size = 1,
|
370
|
+
.type_size = sizeof(int8_t),
|
371
|
+
.is_quantized = false,
|
372
|
+
},
|
373
|
+
[GGML_TYPE_I16] = {
|
374
|
+
.type_name = "i16",
|
375
|
+
.blck_size = 1,
|
376
|
+
.type_size = sizeof(int16_t),
|
377
|
+
.is_quantized = false,
|
378
|
+
},
|
379
|
+
[GGML_TYPE_I32] = {
|
380
|
+
.type_name = "i32",
|
381
|
+
.blck_size = 1,
|
382
|
+
.type_size = sizeof(int32_t),
|
383
|
+
.is_quantized = false,
|
384
|
+
},
|
385
|
+
[GGML_TYPE_F32] = {
|
386
|
+
.type_name = "f32",
|
387
|
+
.blck_size = 1,
|
388
|
+
.type_size = sizeof(float),
|
389
|
+
.is_quantized = false,
|
390
|
+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
391
|
+
.vec_dot_type = GGML_TYPE_F32,
|
392
|
+
},
|
393
|
+
[GGML_TYPE_F16] = {
|
394
|
+
.type_name = "f16",
|
395
|
+
.blck_size = 1,
|
396
|
+
.type_size = sizeof(ggml_fp16_t),
|
397
|
+
.is_quantized = false,
|
398
|
+
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
399
|
+
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
400
|
+
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
401
|
+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
402
|
+
.vec_dot_type = GGML_TYPE_F16,
|
403
|
+
},
|
404
|
+
[GGML_TYPE_Q4_0] = {
|
405
|
+
.type_name = "q4_0",
|
406
|
+
.blck_size = QK4_0,
|
407
|
+
.type_size = sizeof(block_q4_0),
|
408
|
+
.is_quantized = true,
|
409
|
+
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
|
410
|
+
.from_float = quantize_row_q4_0,
|
411
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
412
|
+
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
413
|
+
.vec_dot_type = GGML_TYPE_Q8_0,
|
414
|
+
},
|
415
|
+
[GGML_TYPE_Q4_1] = {
|
416
|
+
.type_name = "q4_1",
|
417
|
+
.blck_size = QK4_1,
|
418
|
+
.type_size = sizeof(block_q4_1),
|
419
|
+
.is_quantized = true,
|
420
|
+
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
|
421
|
+
.from_float = quantize_row_q4_1,
|
422
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
423
|
+
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
424
|
+
.vec_dot_type = GGML_TYPE_Q8_1,
|
425
|
+
},
|
426
|
+
[4] = { // GGML_TYPE_Q4_2
|
427
|
+
.type_name = "DEPRECATED",
|
428
|
+
.blck_size = 0,
|
429
|
+
.type_size = 0,
|
430
|
+
.is_quantized = false,
|
431
|
+
.to_float = NULL,
|
432
|
+
.from_float = NULL,
|
433
|
+
.from_float_reference = NULL,
|
434
|
+
.vec_dot = NULL,
|
435
|
+
.vec_dot_type = GGML_TYPE_COUNT,
|
436
|
+
},
|
437
|
+
[5] = { // GGML_TYPE_Q4_3
|
438
|
+
.type_name = "DEPRECATED",
|
439
|
+
.blck_size = 0,
|
440
|
+
.type_size = 0,
|
441
|
+
.is_quantized = false,
|
442
|
+
.to_float = NULL,
|
443
|
+
.from_float = NULL,
|
444
|
+
.from_float_reference = NULL,
|
445
|
+
.vec_dot = NULL,
|
446
|
+
.vec_dot_type = GGML_TYPE_COUNT,
|
447
|
+
},
|
448
|
+
[GGML_TYPE_Q5_0] = {
|
449
|
+
.type_name = "q5_0",
|
450
|
+
.blck_size = QK5_0,
|
451
|
+
.type_size = sizeof(block_q5_0),
|
452
|
+
.is_quantized = true,
|
453
|
+
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
|
454
|
+
.from_float = quantize_row_q5_0,
|
455
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
456
|
+
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
457
|
+
.vec_dot_type = GGML_TYPE_Q8_0,
|
458
|
+
},
|
459
|
+
[GGML_TYPE_Q5_1] = {
|
460
|
+
.type_name = "q5_1",
|
461
|
+
.blck_size = QK5_1,
|
462
|
+
.type_size = sizeof(block_q5_1),
|
463
|
+
.is_quantized = true,
|
464
|
+
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
|
465
|
+
.from_float = quantize_row_q5_1,
|
466
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
467
|
+
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
468
|
+
.vec_dot_type = GGML_TYPE_Q8_1,
|
469
|
+
},
|
470
|
+
[GGML_TYPE_Q8_0] = {
|
471
|
+
.type_name = "q8_0",
|
472
|
+
.blck_size = QK8_0,
|
473
|
+
.type_size = sizeof(block_q8_0),
|
474
|
+
.is_quantized = true,
|
475
|
+
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
|
476
|
+
.from_float = quantize_row_q8_0,
|
477
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
478
|
+
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
479
|
+
.vec_dot_type = GGML_TYPE_Q8_0,
|
480
|
+
},
|
481
|
+
[GGML_TYPE_Q8_1] = {
|
482
|
+
.type_name = "q8_1",
|
483
|
+
.blck_size = QK8_1,
|
484
|
+
.type_size = sizeof(block_q8_1),
|
485
|
+
.is_quantized = true,
|
486
|
+
.from_float = quantize_row_q8_1,
|
487
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
488
|
+
.vec_dot_type = GGML_TYPE_Q8_1,
|
489
|
+
},
|
490
|
+
[GGML_TYPE_Q2_K] = {
|
491
|
+
.type_name = "q2_K",
|
492
|
+
.blck_size = QK_K,
|
493
|
+
.type_size = sizeof(block_q2_K),
|
494
|
+
.is_quantized = true,
|
495
|
+
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
|
496
|
+
.from_float = quantize_row_q2_K,
|
497
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
498
|
+
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
499
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
500
|
+
},
|
501
|
+
[GGML_TYPE_Q3_K] = {
|
502
|
+
.type_name = "q3_K",
|
503
|
+
.blck_size = QK_K,
|
504
|
+
.type_size = sizeof(block_q3_K),
|
505
|
+
.is_quantized = true,
|
506
|
+
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
|
507
|
+
.from_float = quantize_row_q3_K,
|
508
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
509
|
+
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
510
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
511
|
+
},
|
512
|
+
[GGML_TYPE_Q4_K] = {
|
513
|
+
.type_name = "q4_K",
|
514
|
+
.blck_size = QK_K,
|
515
|
+
.type_size = sizeof(block_q4_K),
|
516
|
+
.is_quantized = true,
|
517
|
+
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
|
518
|
+
.from_float = quantize_row_q4_K,
|
519
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
520
|
+
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
521
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
522
|
+
},
|
523
|
+
[GGML_TYPE_Q5_K] = {
|
524
|
+
.type_name = "q5_K",
|
525
|
+
.blck_size = QK_K,
|
526
|
+
.type_size = sizeof(block_q5_K),
|
527
|
+
.is_quantized = true,
|
528
|
+
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
|
529
|
+
.from_float = quantize_row_q5_K,
|
530
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
531
|
+
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
532
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
533
|
+
},
|
534
|
+
[GGML_TYPE_Q6_K] = {
|
535
|
+
.type_name = "q6_K",
|
536
|
+
.blck_size = QK_K,
|
537
|
+
.type_size = sizeof(block_q6_K),
|
538
|
+
.is_quantized = true,
|
539
|
+
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
|
540
|
+
.from_float = quantize_row_q6_K,
|
541
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
542
|
+
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
543
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
544
|
+
},
|
545
|
+
[GGML_TYPE_Q8_K] = {
|
546
|
+
.type_name = "q8_K",
|
547
|
+
.blck_size = QK_K,
|
548
|
+
.type_size = sizeof(block_q8_K),
|
549
|
+
.is_quantized = true,
|
550
|
+
.from_float = quantize_row_q8_K,
|
551
|
+
}
|
552
|
+
};
|
678
553
|
|
679
|
-
//
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
|
684
|
-
return _mm256_cvtepi32_ps(summed_pairs);
|
685
|
-
#else
|
686
|
-
// Get absolute values of x vectors
|
687
|
-
const __m256i ax = _mm256_sign_epi8(x, x);
|
688
|
-
// Sign the values of the y vectors
|
689
|
-
const __m256i sy = _mm256_sign_epi8(y, x);
|
690
|
-
return mul_sum_us8_pairs_float(ax, sy);
|
691
|
-
#endif
|
554
|
+
// For internal test use
|
555
|
+
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
556
|
+
GGML_ASSERT(type < GGML_TYPE_COUNT);
|
557
|
+
return type_traits[type];
|
692
558
|
}
|
693
559
|
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
#if __AVX512F__
|
698
|
-
const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
|
699
|
-
bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
|
700
|
-
return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
|
701
|
-
#else
|
702
|
-
const __m256i lowByte = _mm256_set1_epi16( 0xFF );
|
703
|
-
__m256i high = _mm256_andnot_si256( lowByte, bytes );
|
704
|
-
__m256i low = _mm256_and_si256( lowByte, bytes );
|
705
|
-
high = _mm256_srli_epi16( high, 4 );
|
706
|
-
bytes = _mm256_or_si256( low, high );
|
707
|
-
|
708
|
-
// Compress uint16_t lanes into bytes
|
709
|
-
__m128i r0 = _mm256_castsi256_si128( bytes );
|
710
|
-
__m128i r1 = _mm256_extracti128_si256( bytes, 1 );
|
711
|
-
return _mm_packus_epi16( r0, r1 );
|
712
|
-
#endif
|
713
|
-
}
|
714
|
-
#elif defined(__AVX__)
|
715
|
-
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
716
|
-
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
717
|
-
uint32_t x32;
|
718
|
-
memcpy(&x32, x, sizeof(uint32_t));
|
719
|
-
const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
|
720
|
-
const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
|
721
|
-
__m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
|
722
|
-
__m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
|
723
|
-
const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
|
724
|
-
bytesl = _mm_or_si128(bytesl, bit_mask);
|
725
|
-
bytesh = _mm_or_si128(bytesh, bit_mask);
|
726
|
-
bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
|
727
|
-
bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
|
728
|
-
return MM256_SET_M128I(bytesh, bytesl);
|
729
|
-
}
|
730
|
-
|
731
|
-
// Unpack 32 4-bit fields into 32 bytes
|
732
|
-
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
|
733
|
-
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
|
734
|
-
{
|
735
|
-
// Load 16 bytes from memory
|
736
|
-
__m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
|
737
|
-
__m128i tmph = _mm_srli_epi16(tmpl, 4);
|
738
|
-
const __m128i lowMask = _mm_set1_epi8(0xF);
|
739
|
-
tmpl = _mm_and_si128(lowMask, tmpl);
|
740
|
-
tmph = _mm_and_si128(lowMask, tmph);
|
741
|
-
return MM256_SET_M128I(tmph, tmpl);
|
742
|
-
}
|
743
|
-
|
744
|
-
// add int16_t pairwise and return as float vector
|
745
|
-
static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
|
746
|
-
const __m128i ones = _mm_set1_epi16(1);
|
747
|
-
const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
|
748
|
-
const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
|
749
|
-
const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
|
750
|
-
return _mm256_cvtepi32_ps(summed_pairs);
|
751
|
-
}
|
752
|
-
|
753
|
-
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
754
|
-
const __m128i axl = _mm256_castsi256_si128(ax);
|
755
|
-
const __m128i axh = _mm256_extractf128_si256(ax, 1);
|
756
|
-
const __m128i syl = _mm256_castsi256_si128(sy);
|
757
|
-
const __m128i syh = _mm256_extractf128_si256(sy, 1);
|
758
|
-
// Perform multiplication and create 16-bit values
|
759
|
-
const __m128i dotl = _mm_maddubs_epi16(axl, syl);
|
760
|
-
const __m128i doth = _mm_maddubs_epi16(axh, syh);
|
761
|
-
return sum_i16_pairs_float(doth, dotl);
|
762
|
-
}
|
763
|
-
|
764
|
-
// multiply int8_t, add results pairwise twice and return as float vector
|
765
|
-
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
766
|
-
const __m128i xl = _mm256_castsi256_si128(x);
|
767
|
-
const __m128i xh = _mm256_extractf128_si256(x, 1);
|
768
|
-
const __m128i yl = _mm256_castsi256_si128(y);
|
769
|
-
const __m128i yh = _mm256_extractf128_si256(y, 1);
|
770
|
-
// Get absolute values of x vectors
|
771
|
-
const __m128i axl = _mm_sign_epi8(xl, xl);
|
772
|
-
const __m128i axh = _mm_sign_epi8(xh, xh);
|
773
|
-
// Sign the values of the y vectors
|
774
|
-
const __m128i syl = _mm_sign_epi8(yl, xl);
|
775
|
-
const __m128i syh = _mm_sign_epi8(yh, xh);
|
776
|
-
// Perform multiplication and create 16-bit values
|
777
|
-
const __m128i dotl = _mm_maddubs_epi16(axl, syl);
|
778
|
-
const __m128i doth = _mm_maddubs_epi16(axh, syh);
|
779
|
-
return sum_i16_pairs_float(doth, dotl);
|
780
|
-
}
|
781
|
-
|
782
|
-
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
783
|
-
{
|
784
|
-
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
785
|
-
const __m128i lowByte = _mm_set1_epi16( 0xFF );
|
786
|
-
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
|
787
|
-
__m128i low = _mm_and_si128( lowByte, bytes1 );
|
788
|
-
high = _mm_srli_epi16( high, 4 );
|
789
|
-
bytes1 = _mm_or_si128( low, high );
|
790
|
-
high = _mm_andnot_si128( lowByte, bytes2 );
|
791
|
-
low = _mm_and_si128( lowByte, bytes2 );
|
792
|
-
high = _mm_srli_epi16( high, 4 );
|
793
|
-
bytes2 = _mm_or_si128( low, high );
|
794
|
-
|
795
|
-
return _mm_packus_epi16( bytes1, bytes2);
|
796
|
-
}
|
797
|
-
#endif
|
798
|
-
#elif defined(__SSSE3__)
|
799
|
-
// horizontally add 4x4 floats
|
800
|
-
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
|
801
|
-
__m128 res_0 =_mm_hadd_ps(a, b);
|
802
|
-
__m128 res_1 =_mm_hadd_ps(c, d);
|
803
|
-
__m128 res =_mm_hadd_ps(res_0, res_1);
|
804
|
-
res =_mm_hadd_ps(res, res);
|
805
|
-
res =_mm_hadd_ps(res, res);
|
806
|
-
|
807
|
-
return _mm_cvtss_f32(res);
|
808
|
-
}
|
809
|
-
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
810
|
-
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
811
|
-
|
812
|
-
#if defined(__ARM_NEON)
|
813
|
-
|
814
|
-
#if !defined(__aarch64__)
|
815
|
-
|
816
|
-
inline static int32_t vaddvq_s32(int32x4_t v) {
|
817
|
-
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
818
|
-
}
|
819
|
-
|
820
|
-
inline static float vaddvq_f32(float32x4_t v) {
|
821
|
-
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
822
|
-
}
|
823
|
-
|
824
|
-
inline static float vmaxvq_f32(float32x4_t v) {
|
825
|
-
return
|
826
|
-
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
827
|
-
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
828
|
-
}
|
829
|
-
|
830
|
-
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
831
|
-
int32x4_t res;
|
832
|
-
|
833
|
-
res[0] = roundf(vgetq_lane_f32(v, 0));
|
834
|
-
res[1] = roundf(vgetq_lane_f32(v, 1));
|
835
|
-
res[2] = roundf(vgetq_lane_f32(v, 2));
|
836
|
-
res[3] = roundf(vgetq_lane_f32(v, 3));
|
837
|
-
|
838
|
-
return res;
|
839
|
-
}
|
840
|
-
|
841
|
-
#endif
|
842
|
-
#endif
|
843
|
-
|
844
|
-
#define QK4_0 32
|
845
|
-
typedef struct {
|
846
|
-
ggml_fp16_t d; // delta
|
847
|
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
848
|
-
} block_q4_0;
|
849
|
-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
850
|
-
|
851
|
-
#define QK4_1 32
|
852
|
-
typedef struct {
|
853
|
-
ggml_fp16_t d; // delta
|
854
|
-
ggml_fp16_t m; // min
|
855
|
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
856
|
-
} block_q4_1;
|
857
|
-
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
858
|
-
|
859
|
-
#define QK5_0 32
|
860
|
-
typedef struct {
|
861
|
-
ggml_fp16_t d; // delta
|
862
|
-
uint8_t qh[4]; // 5-th bit of quants
|
863
|
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
864
|
-
} block_q5_0;
|
865
|
-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
866
|
-
|
867
|
-
#define QK5_1 32
|
868
|
-
typedef struct {
|
869
|
-
ggml_fp16_t d; // delta
|
870
|
-
ggml_fp16_t m; // min
|
871
|
-
uint8_t qh[4]; // 5-th bit of quants
|
872
|
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
873
|
-
} block_q5_1;
|
874
|
-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
875
|
-
|
876
|
-
#define QK8_0 32
|
877
|
-
typedef struct {
|
878
|
-
ggml_fp16_t d; // delta
|
879
|
-
int8_t qs[QK8_0]; // quants
|
880
|
-
} block_q8_0;
|
881
|
-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
882
|
-
|
883
|
-
#define QK8_1 32
|
884
|
-
typedef struct {
|
885
|
-
float d; // delta
|
886
|
-
float s; // d * sum(qs[i])
|
887
|
-
int8_t qs[QK8_1]; // quants
|
888
|
-
} block_q8_1;
|
889
|
-
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
890
|
-
|
891
|
-
// reference implementation for deterministic creation of model files
|
892
|
-
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
|
893
|
-
static const int qk = QK4_0;
|
894
|
-
|
895
|
-
assert(k % qk == 0);
|
896
|
-
|
897
|
-
const int nb = k / qk;
|
898
|
-
|
899
|
-
for (int i = 0; i < nb; i++) {
|
900
|
-
float amax = 0.0f; // absolute max
|
901
|
-
float max = 0.0f;
|
902
|
-
|
903
|
-
for (int j = 0; j < qk; j++) {
|
904
|
-
const float v = x[i*qk + j];
|
905
|
-
if (amax < fabsf(v)) {
|
906
|
-
amax = fabsf(v);
|
907
|
-
max = v;
|
908
|
-
}
|
909
|
-
}
|
910
|
-
|
911
|
-
const float d = max / -8;
|
912
|
-
const float id = d ? 1.0f/d : 0.0f;
|
913
|
-
|
914
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
915
|
-
|
916
|
-
for (int j = 0; j < qk/2; ++j) {
|
917
|
-
const float x0 = x[i*qk + 0 + j]*id;
|
918
|
-
const float x1 = x[i*qk + qk/2 + j]*id;
|
919
|
-
|
920
|
-
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
|
921
|
-
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
|
922
|
-
|
923
|
-
y[i].qs[j] = xi0;
|
924
|
-
y[i].qs[j] |= xi1 << 4;
|
925
|
-
}
|
926
|
-
}
|
927
|
-
}
|
928
|
-
|
929
|
-
static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
930
|
-
quantize_row_q4_0_reference(x, y, k);
|
931
|
-
}
|
932
|
-
|
933
|
-
static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
|
934
|
-
const int qk = QK4_1;
|
935
|
-
|
936
|
-
assert(k % qk == 0);
|
937
|
-
|
938
|
-
const int nb = k / qk;
|
939
|
-
|
940
|
-
for (int i = 0; i < nb; i++) {
|
941
|
-
float min = FLT_MAX;
|
942
|
-
float max = -FLT_MAX;
|
943
|
-
|
944
|
-
for (int j = 0; j < qk; j++) {
|
945
|
-
const float v = x[i*qk + j];
|
946
|
-
|
947
|
-
if (v < min) min = v;
|
948
|
-
if (v > max) max = v;
|
949
|
-
}
|
950
|
-
|
951
|
-
const float d = (max - min) / ((1 << 4) - 1);
|
952
|
-
const float id = d ? 1.0f/d : 0.0f;
|
953
|
-
|
954
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
955
|
-
y[i].m = GGML_FP32_TO_FP16(min);
|
956
|
-
|
957
|
-
for (int j = 0; j < qk/2; ++j) {
|
958
|
-
const float x0 = (x[i*qk + 0 + j] - min)*id;
|
959
|
-
const float x1 = (x[i*qk + qk/2 + j] - min)*id;
|
960
|
-
|
961
|
-
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
|
962
|
-
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
|
963
|
-
|
964
|
-
y[i].qs[j] = xi0;
|
965
|
-
y[i].qs[j] |= xi1 << 4;
|
966
|
-
}
|
967
|
-
}
|
968
|
-
}
|
969
|
-
|
970
|
-
static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
|
971
|
-
quantize_row_q4_1_reference(x, y, k);
|
972
|
-
}
|
973
|
-
|
974
|
-
static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
|
975
|
-
static const int qk = QK5_0;
|
976
|
-
|
977
|
-
assert(k % qk == 0);
|
978
|
-
|
979
|
-
const int nb = k / qk;
|
980
|
-
|
981
|
-
for (int i = 0; i < nb; i++) {
|
982
|
-
float amax = 0.0f; // absolute max
|
983
|
-
float max = 0.0f;
|
984
|
-
|
985
|
-
for (int j = 0; j < qk; j++) {
|
986
|
-
const float v = x[i*qk + j];
|
987
|
-
if (amax < fabsf(v)) {
|
988
|
-
amax = fabsf(v);
|
989
|
-
max = v;
|
990
|
-
}
|
991
|
-
}
|
992
|
-
|
993
|
-
const float d = max / -16;
|
994
|
-
const float id = d ? 1.0f/d : 0.0f;
|
995
|
-
|
996
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
997
|
-
|
998
|
-
uint32_t qh = 0;
|
999
|
-
|
1000
|
-
for (int j = 0; j < qk/2; ++j) {
|
1001
|
-
const float x0 = x[i*qk + 0 + j]*id;
|
1002
|
-
const float x1 = x[i*qk + qk/2 + j]*id;
|
1003
|
-
|
1004
|
-
const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
|
1005
|
-
const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
|
1006
|
-
|
1007
|
-
y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
|
1008
|
-
|
1009
|
-
// get the 5-th bit and store it in qh at the right position
|
1010
|
-
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
1011
|
-
qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
|
1012
|
-
}
|
1013
|
-
|
1014
|
-
memcpy(&y[i].qh, &qh, sizeof(qh));
|
1015
|
-
}
|
1016
|
-
}
|
1017
|
-
|
1018
|
-
static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
|
1019
|
-
quantize_row_q5_0_reference(x, y, k);
|
1020
|
-
}
|
1021
|
-
|
1022
|
-
static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
|
1023
|
-
const int qk = QK5_1;
|
1024
|
-
|
1025
|
-
assert(k % qk == 0);
|
1026
|
-
|
1027
|
-
const int nb = k / qk;
|
1028
|
-
|
1029
|
-
for (int i = 0; i < nb; i++) {
|
1030
|
-
float min = FLT_MAX;
|
1031
|
-
float max = -FLT_MAX;
|
1032
|
-
|
1033
|
-
for (int j = 0; j < qk; j++) {
|
1034
|
-
const float v = x[i*qk + j];
|
1035
|
-
|
1036
|
-
if (v < min) min = v;
|
1037
|
-
if (v > max) max = v;
|
1038
|
-
}
|
1039
|
-
|
1040
|
-
const float d = (max - min) / ((1 << 5) - 1);
|
1041
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1042
|
-
|
1043
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
1044
|
-
y[i].m = GGML_FP32_TO_FP16(min);
|
1045
|
-
|
1046
|
-
uint32_t qh = 0;
|
1047
|
-
|
1048
|
-
for (int j = 0; j < qk/2; ++j) {
|
1049
|
-
const float x0 = (x[i*qk + 0 + j] - min)*id;
|
1050
|
-
const float x1 = (x[i*qk + qk/2 + j] - min)*id;
|
1051
|
-
|
1052
|
-
const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
|
1053
|
-
const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
|
1054
|
-
|
1055
|
-
y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
|
1056
|
-
|
1057
|
-
// get the 5-th bit and store it in qh at the right position
|
1058
|
-
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
1059
|
-
qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
|
1060
|
-
}
|
1061
|
-
|
1062
|
-
memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
|
1063
|
-
}
|
1064
|
-
}
|
1065
|
-
|
1066
|
-
static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
|
1067
|
-
quantize_row_q5_1_reference(x, y, k);
|
1068
|
-
}
|
1069
|
-
|
1070
|
-
// reference implementation for deterministic creation of model files
|
1071
|
-
static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
|
1072
|
-
assert(k % QK8_0 == 0);
|
1073
|
-
const int nb = k / QK8_0;
|
1074
|
-
|
1075
|
-
for (int i = 0; i < nb; i++) {
|
1076
|
-
float amax = 0.0f; // absolute max
|
1077
|
-
|
1078
|
-
for (int j = 0; j < QK8_0; j++) {
|
1079
|
-
const float v = x[i*QK8_0 + j];
|
1080
|
-
amax = MAX(amax, fabsf(v));
|
1081
|
-
}
|
1082
|
-
|
1083
|
-
const float d = amax / ((1 << 7) - 1);
|
1084
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1085
|
-
|
1086
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
1087
|
-
|
1088
|
-
for (int j = 0; j < QK8_0; ++j) {
|
1089
|
-
const float x0 = x[i*QK8_0 + j]*id;
|
1090
|
-
|
1091
|
-
y[i].qs[j] = roundf(x0);
|
1092
|
-
}
|
1093
|
-
}
|
1094
|
-
}
|
1095
|
-
|
1096
|
-
static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
|
1097
|
-
assert(QK8_0 == 32);
|
1098
|
-
assert(k % QK8_0 == 0);
|
1099
|
-
const int nb = k / QK8_0;
|
1100
|
-
|
1101
|
-
block_q8_0 * restrict y = vy;
|
1102
|
-
|
1103
|
-
#if defined(__ARM_NEON)
|
1104
|
-
for (int i = 0; i < nb; i++) {
|
1105
|
-
float32x4_t srcv [8];
|
1106
|
-
float32x4_t asrcv[8];
|
1107
|
-
float32x4_t amaxv[8];
|
1108
|
-
|
1109
|
-
for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j);
|
1110
|
-
for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
|
1111
|
-
|
1112
|
-
for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
|
1113
|
-
for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
|
1114
|
-
for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
|
1115
|
-
|
1116
|
-
const float amax = vmaxvq_f32(amaxv[0]);
|
1117
|
-
|
1118
|
-
const float d = amax / ((1 << 7) - 1);
|
1119
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1120
|
-
|
1121
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
1122
|
-
|
1123
|
-
for (int j = 0; j < 8; j++) {
|
1124
|
-
const float32x4_t v = vmulq_n_f32(srcv[j], id);
|
1125
|
-
const int32x4_t vi = vcvtnq_s32_f32(v);
|
1126
|
-
|
1127
|
-
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
|
1128
|
-
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
|
1129
|
-
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
|
1130
|
-
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
|
1131
|
-
}
|
1132
|
-
}
|
1133
|
-
#elif defined(__wasm_simd128__)
|
1134
|
-
for (int i = 0; i < nb; i++) {
|
1135
|
-
v128_t srcv [8];
|
1136
|
-
v128_t asrcv[8];
|
1137
|
-
v128_t amaxv[8];
|
1138
|
-
|
1139
|
-
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
|
1140
|
-
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
|
1141
|
-
|
1142
|
-
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
|
1143
|
-
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
|
1144
|
-
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
|
1145
|
-
|
1146
|
-
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
|
1147
|
-
wasm_f32x4_extract_lane(amaxv[0], 1)),
|
1148
|
-
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
|
1149
|
-
wasm_f32x4_extract_lane(amaxv[0], 3)));
|
1150
|
-
|
1151
|
-
const float d = amax / ((1 << 7) - 1);
|
1152
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1153
|
-
|
1154
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
1155
|
-
|
1156
|
-
for (int j = 0; j < 8; j++) {
|
1157
|
-
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
|
1158
|
-
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
|
1159
|
-
|
1160
|
-
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
|
1161
|
-
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
|
1162
|
-
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
|
1163
|
-
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
|
1164
|
-
}
|
1165
|
-
}
|
1166
|
-
#elif defined(__AVX2__) || defined(__AVX__)
|
1167
|
-
for (int i = 0; i < nb; i++) {
|
1168
|
-
// Load elements into 4 AVX vectors
|
1169
|
-
__m256 v0 = _mm256_loadu_ps( x );
|
1170
|
-
__m256 v1 = _mm256_loadu_ps( x + 8 );
|
1171
|
-
__m256 v2 = _mm256_loadu_ps( x + 16 );
|
1172
|
-
__m256 v3 = _mm256_loadu_ps( x + 24 );
|
1173
|
-
x += 32;
|
1174
|
-
|
1175
|
-
// Compute max(abs(e)) for the block
|
1176
|
-
const __m256 signBit = _mm256_set1_ps( -0.0f );
|
1177
|
-
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
|
1178
|
-
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
|
1179
|
-
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
|
1180
|
-
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
|
1181
|
-
|
1182
|
-
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
|
1183
|
-
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
|
1184
|
-
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
|
1185
|
-
const float maxScalar = _mm_cvtss_f32( max4 );
|
1186
|
-
|
1187
|
-
// Quantize these floats
|
1188
|
-
const float d = maxScalar / 127.f;
|
1189
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
1190
|
-
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
1191
|
-
const __m256 mul = _mm256_set1_ps( id );
|
1192
|
-
|
1193
|
-
// Apply the multiplier
|
1194
|
-
v0 = _mm256_mul_ps( v0, mul );
|
1195
|
-
v1 = _mm256_mul_ps( v1, mul );
|
1196
|
-
v2 = _mm256_mul_ps( v2, mul );
|
1197
|
-
v3 = _mm256_mul_ps( v3, mul );
|
1198
|
-
|
1199
|
-
// Round to nearest integer
|
1200
|
-
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
|
1201
|
-
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
|
1202
|
-
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
|
1203
|
-
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
|
1204
|
-
|
1205
|
-
// Convert floats to integers
|
1206
|
-
__m256i i0 = _mm256_cvtps_epi32( v0 );
|
1207
|
-
__m256i i1 = _mm256_cvtps_epi32( v1 );
|
1208
|
-
__m256i i2 = _mm256_cvtps_epi32( v2 );
|
1209
|
-
__m256i i3 = _mm256_cvtps_epi32( v3 );
|
1210
|
-
|
1211
|
-
#if defined(__AVX2__)
|
1212
|
-
// Convert int32 to int16
|
1213
|
-
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
1214
|
-
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
|
1215
|
-
// Convert int16 to int8
|
1216
|
-
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
|
1217
|
-
|
1218
|
-
// We got our precious signed bytes, but the order is now wrong
|
1219
|
-
// These AVX2 pack instructions process 16-byte pieces independently
|
1220
|
-
// The following instruction is fixing the order
|
1221
|
-
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
|
1222
|
-
i0 = _mm256_permutevar8x32_epi32( i0, perm );
|
1223
|
-
|
1224
|
-
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
|
1225
|
-
#else
|
1226
|
-
// Since we don't have in AVX some necessary functions,
|
1227
|
-
// we split the registers in half and call AVX2 analogs from SSE
|
1228
|
-
__m128i ni0 = _mm256_castsi256_si128( i0 );
|
1229
|
-
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
|
1230
|
-
__m128i ni2 = _mm256_castsi256_si128( i1 );
|
1231
|
-
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
|
1232
|
-
__m128i ni4 = _mm256_castsi256_si128( i2 );
|
1233
|
-
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
|
1234
|
-
__m128i ni6 = _mm256_castsi256_si128( i3 );
|
1235
|
-
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
|
1236
|
-
|
1237
|
-
// Convert int32 to int16
|
1238
|
-
ni0 = _mm_packs_epi32( ni0, ni1 );
|
1239
|
-
ni2 = _mm_packs_epi32( ni2, ni3 );
|
1240
|
-
ni4 = _mm_packs_epi32( ni4, ni5 );
|
1241
|
-
ni6 = _mm_packs_epi32( ni6, ni7 );
|
1242
|
-
// Convert int16 to int8
|
1243
|
-
ni0 = _mm_packs_epi16( ni0, ni2 );
|
1244
|
-
ni4 = _mm_packs_epi16( ni4, ni6 );
|
1245
|
-
|
1246
|
-
_mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
|
1247
|
-
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
1248
|
-
#endif
|
1249
|
-
}
|
1250
|
-
#elif defined(__riscv_v_intrinsic)
|
1251
|
-
|
1252
|
-
size_t vl = __riscv_vsetvl_e32m4(QK8_0);
|
1253
|
-
|
1254
|
-
for (int i = 0; i < nb; i++) {
|
1255
|
-
// load elements
|
1256
|
-
vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
|
1257
|
-
|
1258
|
-
vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
|
1259
|
-
vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
|
1260
|
-
vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
|
1261
|
-
float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
|
1262
|
-
|
1263
|
-
const float d = amax / ((1 << 7) - 1);
|
1264
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1265
|
-
|
1266
|
-
y[i].d = GGML_FP32_TO_FP16(d);
|
1267
|
-
|
1268
|
-
vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
|
1269
|
-
|
1270
|
-
// convert to integer
|
1271
|
-
vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
|
1272
|
-
vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
|
1273
|
-
|
1274
|
-
// store result
|
1275
|
-
__riscv_vse8_v_i8m1(y[i].qs , vs, vl);
|
1276
|
-
}
|
1277
|
-
#else
|
1278
|
-
// scalar
|
1279
|
-
quantize_row_q8_0_reference(x, y, k);
|
1280
|
-
#endif
|
1281
|
-
}
|
1282
|
-
|
1283
|
-
// reference implementation for deterministic creation of model files
|
1284
|
-
static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
|
1285
|
-
assert(QK8_1 == 32);
|
1286
|
-
assert(k % QK8_1 == 0);
|
1287
|
-
const int nb = k / QK8_1;
|
1288
|
-
|
1289
|
-
for (int i = 0; i < nb; i++) {
|
1290
|
-
float amax = 0.0f; // absolute max
|
1291
|
-
|
1292
|
-
for (int j = 0; j < QK8_1; j++) {
|
1293
|
-
const float v = x[i*QK8_1 + j];
|
1294
|
-
amax = MAX(amax, fabsf(v));
|
1295
|
-
}
|
1296
|
-
|
1297
|
-
const float d = amax / ((1 << 7) - 1);
|
1298
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1299
|
-
|
1300
|
-
y[i].d = d;
|
1301
|
-
|
1302
|
-
int sum = 0;
|
1303
|
-
|
1304
|
-
for (int j = 0; j < QK8_1/2; ++j) {
|
1305
|
-
const float v0 = x[i*QK8_1 + j]*id;
|
1306
|
-
const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;
|
1307
|
-
|
1308
|
-
y[i].qs[ j] = roundf(v0);
|
1309
|
-
y[i].qs[QK8_1/2 + j] = roundf(v1);
|
1310
|
-
|
1311
|
-
sum += y[i].qs[ j];
|
1312
|
-
sum += y[i].qs[QK8_1/2 + j];
|
1313
|
-
}
|
1314
|
-
|
1315
|
-
y[i].s = sum*d;
|
1316
|
-
}
|
1317
|
-
}
|
1318
|
-
|
1319
|
-
static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
1320
|
-
assert(k % QK8_1 == 0);
|
1321
|
-
const int nb = k / QK8_1;
|
1322
|
-
|
1323
|
-
block_q8_1 * restrict y = vy;
|
1324
|
-
|
1325
|
-
#if defined(__ARM_NEON)
|
1326
|
-
for (int i = 0; i < nb; i++) {
|
1327
|
-
float32x4_t srcv [8];
|
1328
|
-
float32x4_t asrcv[8];
|
1329
|
-
float32x4_t amaxv[8];
|
1330
|
-
|
1331
|
-
for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j);
|
1332
|
-
for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
|
1333
|
-
|
1334
|
-
for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
|
1335
|
-
for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
|
1336
|
-
for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
|
1337
|
-
|
1338
|
-
const float amax = vmaxvq_f32(amaxv[0]);
|
1339
|
-
|
1340
|
-
const float d = amax / ((1 << 7) - 1);
|
1341
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1342
|
-
|
1343
|
-
y[i].d = d;
|
1344
|
-
|
1345
|
-
int32x4_t accv = vdupq_n_s32(0);
|
1346
|
-
|
1347
|
-
for (int j = 0; j < 8; j++) {
|
1348
|
-
const float32x4_t v = vmulq_n_f32(srcv[j], id);
|
1349
|
-
const int32x4_t vi = vcvtnq_s32_f32(v);
|
1350
|
-
|
1351
|
-
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
|
1352
|
-
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
|
1353
|
-
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
|
1354
|
-
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
|
1355
|
-
|
1356
|
-
accv = vaddq_s32(accv, vi);
|
1357
|
-
}
|
1358
|
-
|
1359
|
-
y[i].s = d * vaddvq_s32(accv);
|
1360
|
-
}
|
1361
|
-
#elif defined(__wasm_simd128__)
|
1362
|
-
for (int i = 0; i < nb; i++) {
|
1363
|
-
v128_t srcv [8];
|
1364
|
-
v128_t asrcv[8];
|
1365
|
-
v128_t amaxv[8];
|
1366
|
-
|
1367
|
-
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
|
1368
|
-
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
|
1369
|
-
|
1370
|
-
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
|
1371
|
-
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
|
1372
|
-
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
|
1373
|
-
|
1374
|
-
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
|
1375
|
-
wasm_f32x4_extract_lane(amaxv[0], 1)),
|
1376
|
-
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
|
1377
|
-
wasm_f32x4_extract_lane(amaxv[0], 3)));
|
1378
|
-
|
1379
|
-
const float d = amax / ((1 << 7) - 1);
|
1380
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1381
|
-
|
1382
|
-
y[i].d = d;
|
1383
|
-
|
1384
|
-
v128_t accv = wasm_i32x4_splat(0);
|
1385
|
-
|
1386
|
-
for (int j = 0; j < 8; j++) {
|
1387
|
-
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
|
1388
|
-
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
|
1389
|
-
|
1390
|
-
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
|
1391
|
-
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
|
1392
|
-
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
|
1393
|
-
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
|
1394
|
-
|
1395
|
-
accv = wasm_i32x4_add(accv, vi);
|
1396
|
-
}
|
1397
|
-
|
1398
|
-
y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
|
1399
|
-
wasm_i32x4_extract_lane(accv, 1) +
|
1400
|
-
wasm_i32x4_extract_lane(accv, 2) +
|
1401
|
-
wasm_i32x4_extract_lane(accv, 3));
|
1402
|
-
}
|
1403
|
-
#elif defined(__AVX2__) || defined(__AVX__)
|
1404
|
-
for (int i = 0; i < nb; i++) {
|
1405
|
-
// Load elements into 4 AVX vectors
|
1406
|
-
__m256 v0 = _mm256_loadu_ps( x );
|
1407
|
-
__m256 v1 = _mm256_loadu_ps( x + 8 );
|
1408
|
-
__m256 v2 = _mm256_loadu_ps( x + 16 );
|
1409
|
-
__m256 v3 = _mm256_loadu_ps( x + 24 );
|
1410
|
-
x += 32;
|
1411
|
-
|
1412
|
-
// Compute max(abs(e)) for the block
|
1413
|
-
const __m256 signBit = _mm256_set1_ps( -0.0f );
|
1414
|
-
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
|
1415
|
-
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
|
1416
|
-
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
|
1417
|
-
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
|
1418
|
-
|
1419
|
-
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
|
1420
|
-
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
|
1421
|
-
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
|
1422
|
-
const float maxScalar = _mm_cvtss_f32( max4 );
|
1423
|
-
|
1424
|
-
// Quantize these floats
|
1425
|
-
const float d = maxScalar / 127.f;
|
1426
|
-
y[i].d = d;
|
1427
|
-
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
1428
|
-
const __m256 mul = _mm256_set1_ps( id );
|
1429
|
-
|
1430
|
-
// Apply the multiplier
|
1431
|
-
v0 = _mm256_mul_ps( v0, mul );
|
1432
|
-
v1 = _mm256_mul_ps( v1, mul );
|
1433
|
-
v2 = _mm256_mul_ps( v2, mul );
|
1434
|
-
v3 = _mm256_mul_ps( v3, mul );
|
1435
|
-
|
1436
|
-
// Round to nearest integer
|
1437
|
-
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
|
1438
|
-
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
|
1439
|
-
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
|
1440
|
-
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
|
1441
|
-
|
1442
|
-
// Convert floats to integers
|
1443
|
-
__m256i i0 = _mm256_cvtps_epi32( v0 );
|
1444
|
-
__m256i i1 = _mm256_cvtps_epi32( v1 );
|
1445
|
-
__m256i i2 = _mm256_cvtps_epi32( v2 );
|
1446
|
-
__m256i i3 = _mm256_cvtps_epi32( v3 );
|
1447
|
-
|
1448
|
-
#if defined(__AVX2__)
|
1449
|
-
// Compute the sum of the quants and set y[i].s
|
1450
|
-
y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
|
1451
|
-
|
1452
|
-
// Convert int32 to int16
|
1453
|
-
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
1454
|
-
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
|
1455
|
-
// Convert int16 to int8
|
1456
|
-
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
|
1457
|
-
|
1458
|
-
// We got our precious signed bytes, but the order is now wrong
|
1459
|
-
// These AVX2 pack instructions process 16-byte pieces independently
|
1460
|
-
// The following instruction is fixing the order
|
1461
|
-
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
|
1462
|
-
i0 = _mm256_permutevar8x32_epi32( i0, perm );
|
1463
|
-
|
1464
|
-
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
|
1465
|
-
#else
|
1466
|
-
// Since we don't have in AVX some necessary functions,
|
1467
|
-
// we split the registers in half and call AVX2 analogs from SSE
|
1468
|
-
__m128i ni0 = _mm256_castsi256_si128( i0 );
|
1469
|
-
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
|
1470
|
-
__m128i ni2 = _mm256_castsi256_si128( i1 );
|
1471
|
-
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
|
1472
|
-
__m128i ni4 = _mm256_castsi256_si128( i2 );
|
1473
|
-
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
|
1474
|
-
__m128i ni6 = _mm256_castsi256_si128( i3 );
|
1475
|
-
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
|
1476
|
-
|
1477
|
-
// Compute the sum of the quants and set y[i].s
|
1478
|
-
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
1479
|
-
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
1480
|
-
y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
|
1481
|
-
|
1482
|
-
// Convert int32 to int16
|
1483
|
-
ni0 = _mm_packs_epi32( ni0, ni1 );
|
1484
|
-
ni2 = _mm_packs_epi32( ni2, ni3 );
|
1485
|
-
ni4 = _mm_packs_epi32( ni4, ni5 );
|
1486
|
-
ni6 = _mm_packs_epi32( ni6, ni7 );
|
1487
|
-
// Convert int16 to int8
|
1488
|
-
ni0 = _mm_packs_epi16( ni0, ni2 );
|
1489
|
-
ni4 = _mm_packs_epi16( ni4, ni6 );
|
1490
|
-
|
1491
|
-
_mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
|
1492
|
-
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
1493
|
-
#endif
|
1494
|
-
}
|
1495
|
-
#elif defined(__riscv_v_intrinsic)
|
1496
|
-
|
1497
|
-
size_t vl = __riscv_vsetvl_e32m4(QK8_1);
|
1498
|
-
|
1499
|
-
for (int i = 0; i < nb; i++) {
|
1500
|
-
// load elements
|
1501
|
-
vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
|
1502
|
-
|
1503
|
-
vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
|
1504
|
-
vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl);
|
1505
|
-
vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
|
1506
|
-
float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
|
1507
|
-
|
1508
|
-
const float d = amax / ((1 << 7) - 1);
|
1509
|
-
const float id = d ? 1.0f/d : 0.0f;
|
1510
|
-
|
1511
|
-
y[i].d = d;
|
1512
|
-
|
1513
|
-
vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
|
1514
|
-
|
1515
|
-
// convert to integer
|
1516
|
-
vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
|
1517
|
-
vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
|
1518
|
-
|
1519
|
-
// store result
|
1520
|
-
__riscv_vse8_v_i8m1(y[i].qs , vs, vl);
|
1521
|
-
|
1522
|
-
// compute sum for y[i].s
|
1523
|
-
vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
|
1524
|
-
vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
|
1525
|
-
|
1526
|
-
// set y[i].s
|
1527
|
-
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
|
1528
|
-
y[i].s = sum*d;
|
1529
|
-
}
|
1530
|
-
#else
|
1531
|
-
// scalar
|
1532
|
-
quantize_row_q8_1_reference(x, y, k);
|
1533
|
-
#endif
|
1534
|
-
}
|
1535
|
-
|
1536
|
-
-static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK4_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F) - 8;
-            const int x1 = (x[i].qs[j] >>   4) - 8;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
-    static const int qk = QK4_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F);
-            const int x1 = (x[i].qs[j] >>   4);
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK5_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
-    static const int qk = QK5_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
-            const int x1 = (x[i].qs[j] >>   4) | xh_1;
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) {
-    static const int qk = QK8_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    const block_q8_0 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int j = 0; j < qk; ++j) {
-            y[i*qk + j] = x[i].qs[j]*d;
-        }
-    }
-}
-
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
-static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
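dequantize_row_q5_0/q5_1 above rebuild each 5-bit value from a packed nibble plus one bit of the 32-bit qh word (bit j for element j, bit j+16 for element j+qk/2). A standalone sketch of that bit surgery for a single byte, using the same shifts as the removed code (the demo_* names and the example values are just for illustration):

#include <stdint.h>
#include <stdio.h>

// Rebuild the two 5-bit values packed in one q5_0-style byte, mirroring the
// bit layout used by the removed dequantize_row_q5_0 (illustrative only).
static void demo_unpack_q5_0_pair(uint8_t qs_j, uint32_t qh, int j,
                                  int32_t * x0, int32_t * x1) {
    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10; // bit j      -> bit 4 of x0
    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10; // bit j + 16 -> bit 4 of x1

    *x0 = ((qs_j & 0x0F) | xh_0) - 16; // low nibble + high bit, centered around 0
    *x1 = ((qs_j >>   4) | xh_1) - 16; // high nibble + high bit
}

int main(void) {
    int32_t x0, x1;
    // byte 0x2A holds nibbles 0xA and 0x2; qh has bits 0 and 16 set, i.e. both
    // fifth bits for j = 0 are on: (10|16)-16 = 10 and (2|16)-16 = 2
    demo_unpack_q5_0_pair(0x2A, 0x00010001u, /*j=*/0, &x0, &x1);
    printf("%d %d\n", x0, x1); // prints: 10 2
    return 0;
}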
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [removed hunk: the per-type trait entries — GGML_TYPE_I8/I16/I32 (1-element blocks, not quantized), F32 and F16 (with their to_float/from_float row converters and ggml_vec_dot_f32/f16), Q4_0, Q4_1, Q5_0, Q5_1, Q8_0 and Q8_1 (block sizes QK4_0..QK8_1, quantize/dequantize row functions, vec_dot kernels and Q8_0/Q8_1 vec_dot_type), and, under GGML_USE_K_QUANTS, Q2_K..Q6_K plus Q8_K with QK_K-sized blocks and the *_q8_K dot kernels]
-};
-
-// For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
-    GGML_ASSERT(type < GGML_TYPE_COUNT);
-    return type_traits[type];
-}
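The table removed above is ggml's per-type dispatch record. A minimal sketch of how such a traits record is typically consumed, with a hypothetical two-entry table (the demo_* names are local to this sketch; the field names mirror the removed initializers, and the 34-byte q8_0 block size assumes a 2-byte scale plus 32 int8 quants):

#include <stddef.h>
#include <stdio.h>

// Hypothetical trimmed-down trait record, mirroring the removed struct fields.
typedef struct {
    const char * type_name;
    int          blck_size;   // elements per block
    size_t       type_size;   // bytes per block
    int          is_quantized;
} demo_type_traits;

enum demo_type { DEMO_TYPE_F32, DEMO_TYPE_Q8_0, DEMO_TYPE_COUNT };

static const demo_type_traits demo_traits[DEMO_TYPE_COUNT] = {
    [DEMO_TYPE_F32]  = { .type_name = "f32",  .blck_size = 1,  .type_size = 4,  .is_quantized = 0 },
    [DEMO_TYPE_Q8_0] = { .type_name = "q8_0", .blck_size = 32, .type_size = 34, .is_quantized = 1 },
};

// Row size in bytes = number of blocks * bytes per block.
static size_t demo_row_size(enum demo_type t, int n_elements) {
    const demo_type_traits * tt = &demo_traits[t];
    return (size_t)(n_elements / tt->blck_size) * tt->type_size;
}

int main(void) {
    printf("%s row of 4096 elements: %zu bytes\n",
           demo_traits[DEMO_TYPE_Q8_0].type_name,
           demo_row_size(DEMO_TYPE_Q8_0, 4096)); // 128 blocks * 34 bytes = 4352
    return 0;
}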
-//
-// simd mappings
-//
-// we define a common set of C macros which map to specific intrinsics based on the current architecture
-// we then implement the fundamental computation operations below using only these macros
-// adding support for new architectures requires to define the corresponding SIMD macros
-//
-// GGML_F32_STEP / GGML_F16_STEP
-// number of elements to process in a single step
-//
-// GGML_F32_EPR / GGML_F16_EPR
-// number of elements to fit in a single register
-//
-[removed hunk: the architecture-specific SIMD macro blocks — ARM NEON (GGML_F32x4_* and, with FP16 vector arithmetic, GGML_F16x8_*, otherwise GGML_F32Cx4_*), AVX (GGML_F32x8_* and GGML_F32Cx8_*, with F16C or scalar FP16 conversion fallbacks), POWER9 (vec_* based GGML_F32x4_*), WASM SIMD128 and SSE3 — each defining ZERO/SET1/LOAD/STORE/FMA/ADD/MUL plus a pairwise *_REDUCE macro, and the GGML_F32_VEC_* / GGML_F16_VEC_* aliases; the same mappings are re-added further down in the file (new lines 560-698 below)]
-
-// GGML_F32_ARR / GGML_F16_ARR
-// number of registers to use per step
-#ifdef GGML_SIMD
-#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
-#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
-#endif
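The *_REDUCE macros summarized above fold GGML_F32_ARR partial-sum registers down to one register by repeatedly halving the array, then finish with a horizontal add. The same tree reduction in plain C, assuming the shape of the NEON F32 path (STEP 16 / EPR 4 gives 4 registers of 4 lanes; the demo_* name and the sample values are illustrative):

#include <stdio.h>

#define ARR 4   // number of partial-sum "registers" (GGML_F32_ARR for NEON F32)
#define EPR 4   // lanes per "register"              (GGML_F32_EPR)

// Plain-C version of the GGML_F32x4_REDUCE pattern: pairwise-add the partial
// registers until one is left, then horizontally add its lanes.
static float demo_reduce(float x[ARR][EPR]) {
    for (int offset = ARR >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            for (int l = 0; l < EPR; ++l) {
                x[i][l] += x[offset + i][l];
            }
        }
    }
    float res = 0.0f;
    for (int l = 0; l < EPR; ++l) {
        res += x[0][l];   // the REDUCE_ONE / horizontal-add step
    }
    return res;
}

int main(void) {
    float x[ARR][EPR] = {
        {1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16},
    };
    printf("%g\n", demo_reduce(x)); // 136 = 1 + 2 + ... + 16
    return 0;
}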
-//
-// fundamental operations
-//
-
-inline static void ggml_vec_set_i8 (const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
-inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
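These helpers are trivial loops, but the argument order is easy to misread. A self-contained copy of two of them, showing the calling convention (length first, then the destination, then the sources); the demo_* names are local to this sketch:

#include <stdio.h>

static void demo_vec_add_f32(const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
static void demo_vec_mul_f32(const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }

int main(void) {
    float a[4] = {1, 2, 3, 4};
    float b[4] = {10, 20, 30, 40};
    float c[4];

    demo_vec_add_f32(4, c, a, b);   // c = a + b
    printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]); // 11 22 33 44

    demo_vec_mul_f32(4, c, a, b);   // c = a * b, element-wise
    printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]); // 10 40 90 160
    return 0;
}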
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
-#ifdef GGML_SIMD
-    float sumf = 0.0f;
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
-#else
-    // scalar
-    ggml_float sumf = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(x[i]*y[i]);
-    }
-#endif
-
-    *s = sumf;
-}
-
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
-    ggml_float sumf = 0.0;
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F16_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#endif
-
-    *s = sumf;
-}
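Both dot products above split the work into a vectorized main loop over the largest multiple of the step size that fits, plus a scalar tail over the leftovers. A tiny sketch of that split, assuming a step of 16 as in the NEON F32 mapping:

#include <stdio.h>

int main(void) {
    const int step = 16;            // GGML_F32_STEP in the NEON F32 mapping
    const int n    = 100;           // vector length

    // largest multiple of step that is <= n (step must be a power of two)
    const int np = n & ~(step - 1); // 100 & ~15 = 96

    printf("main loop handles %d elements, scalar tail handles %d\n", np, n - np);
    return 0;
}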
-static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-
-    const block_q4_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-[removed hunk: the architecture-specific bodies of ggml_vec_dot_q4_0_q8_0 — an ARM NEON path processing two blocks per iteration (vdotq_s32 when __ARM_FEATURE_DOTPROD is available, widening vmull_s8/vpaddlq_s16 otherwise), an AVX2 path (bytes_from_nibbles_32 + mul_sum_i8_pairs_float + _mm256_fmadd_ps), AVX and SSSE3 paths built on 128-bit mul_sum_i8_pairs with prefetching, and a RISC-V vector path; each unpacks the 4-bit nibbles, subtracts the offset of 8, takes the integer dot product against the q8_0 quants, and accumulates the result scaled by GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)]
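All of those paths compute the same per-block result as the scalar fallback whose fragments appear further down in this diff. A compact scalar sketch of one q4_0 × q8_0 block, assuming the scales have already been decoded to float (the demo_* name is local to this sketch):

#include <stdint.h>

#define QK 32  // QK4_0 == QK8_0 == 32

// Dot product of one q4_0-style block (16 bytes of packed nibbles, scale d_x)
// with one q8_0-style block (32 int8 quants, scale d_y): integer dot product
// first, then a single multiply by the combined scale.
static float demo_dot_q4_0_q8_0_block(const uint8_t * qs_x, float d_x,
                                      const int8_t  * qs_y, float d_y) {
    int sumi = 0;
    for (int j = 0; j < QK/2; ++j) {
        const int v0 = (qs_x[j] & 0x0F) - 8;  // low nibble,  elements 0..15
        const int v1 = (qs_x[j] >>   4) - 8;  // high nibble, elements 16..31
        sumi += v0*qs_y[j] + v1*qs_y[j + QK/2];
    }
    return sumi * d_x * d_y;
}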
+//
+// simd mappings
+//
+[added hunk (new lines 560-698): the SIMD macro mappings re-introduced at this point of ggml.c — the explanatory comment block, the ARM NEON GGML_F32x4_*, GGML_F16x8_* and GGML_F32Cx4_* definitions together with their GGML_F32_VEC_* / GGML_F16_VEC_* aliases, and the opening of the AVX branch (#elif defined(__AVX__), GGML_SIMD, GGML_F32_STEP 32, GGML_F32_EPR 8)]
-[removed hunk, interleaved with the additions above: the scalar fallback of ggml_vec_dot_q4_0_q8_0; the whole of ggml_vec_dot_q4_1_q8_1 (ARM NEON, AVX2/AVX, RISC-V and scalar paths, each accumulating (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi plus the GGML_FP16_TO_FP32(x[i].m)*y[i].s offset term); and the opening of ggml_vec_dot_q5_0_q8_0 (qk = QK8_0, assert(qk == QK5_0), uint64_t temporaries and a two-blocks-per-iteration loop over block_q5_0)]
|
2936
|
-
const block_q8_0 * restrict y0 = &y[i];
|
2937
|
-
const block_q8_0 * restrict y1 = &y[i + 1];
|
2938
|
-
|
2939
|
-
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
2940
|
-
|
2941
|
-
// extract the 5th bit via lookup table ((!b) << 4)
|
2942
|
-
memcpy(&qh0, x0->qh, sizeof(qh0));
|
2943
|
-
memcpy(&qh1, x1->qh, sizeof(qh1));
|
2944
|
-
|
2945
|
-
tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF];
|
2946
|
-
tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF];
|
2947
|
-
tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
|
2948
|
-
tmp0[3] = table_b2b_1[(qh0 >> 24) ];
|
2949
|
-
|
2950
|
-
tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF];
|
2951
|
-
tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF];
|
2952
|
-
tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
|
2953
|
-
tmp1[3] = table_b2b_1[(qh1 >> 24) ];
|
2954
|
-
|
2955
|
-
const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
|
2956
|
-
const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
|
2957
|
-
const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
|
2958
|
-
const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
|
2959
|
-
|
2960
|
-
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
2961
|
-
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
2962
|
-
|
2963
|
-
// 4-bit -> 8-bit
|
2964
|
-
int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
2965
|
-
int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
2966
|
-
int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
2967
|
-
int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
2968
|
-
|
2969
|
-
// add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
|
2970
|
-
const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
|
2971
|
-
const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
|
2972
|
-
const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
|
2973
|
-
const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
|
2974
|
-
|
2975
|
-
// load y
|
2976
|
-
const int8x16_t v1_0l = vld1q_s8(y0->qs);
|
2977
|
-
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
|
2978
|
-
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
2979
|
-
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
2980
|
-
|
2981
|
-
#if defined(__ARM_FEATURE_DOTPROD)
|
2982
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
2983
|
-
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
2984
|
-
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
2985
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
2986
|
-
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
2987
|
-
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
699
|
+
#define GGML_F32x8 __m256
|
700
|
+
#define GGML_F32x8_ZERO _mm256_setzero_ps()
|
701
|
+
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
|
702
|
+
#define GGML_F32x8_LOAD _mm256_loadu_ps
|
703
|
+
#define GGML_F32x8_STORE _mm256_storeu_ps
|
704
|
+
#if defined(__FMA__)
|
705
|
+
#define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
|
2988
706
|
#else
|
2989
|
-
|
2990
|
-
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
2991
|
-
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
2992
|
-
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
2993
|
-
|
2994
|
-
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
2995
|
-
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
2996
|
-
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
2997
|
-
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
2998
|
-
|
2999
|
-
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
3000
|
-
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
3001
|
-
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
3002
|
-
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
3003
|
-
|
3004
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
3005
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
707
|
+
#define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
|
3006
708
|
#endif
|
3007
|
-
|
3008
|
-
|
3009
|
-
|
3010
|
-
|
3011
|
-
|
3012
|
-
|
3013
|
-
|
3014
|
-
|
3015
|
-
|
3016
|
-
|
3017
|
-
|
3018
|
-
|
3019
|
-
|
3020
|
-
|
3021
|
-
|
3022
|
-
|
3023
|
-
|
3024
|
-
|
3025
|
-
|
3026
|
-
|
3027
|
-
|
3028
|
-
|
3029
|
-
tmp[3] = table_b2b_1[(qh >> 24) ];
|
3030
|
-
|
3031
|
-
const v128_t qhl = wasm_v128_load(tmp + 0);
|
3032
|
-
const v128_t qhh = wasm_v128_load(tmp + 2);
|
3033
|
-
|
3034
|
-
const v128_t v0 = wasm_v128_load(x0->qs);
|
3035
|
-
|
3036
|
-
// 4-bit -> 8-bit
|
3037
|
-
const v128_t v0l = wasm_v128_and (v0, m4b);
|
3038
|
-
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
3039
|
-
|
3040
|
-
// add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
|
3041
|
-
const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
|
3042
|
-
const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
|
3043
|
-
|
3044
|
-
// load y
|
3045
|
-
const v128_t v1l = wasm_v128_load(y0->qs);
|
3046
|
-
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
3047
|
-
|
3048
|
-
// int8x16 -> int16x8
|
3049
|
-
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
3050
|
-
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
3051
|
-
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
3052
|
-
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
3053
|
-
|
3054
|
-
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
3055
|
-
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
3056
|
-
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
3057
|
-
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
3058
|
-
|
3059
|
-
// dot product
|
3060
|
-
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
3061
|
-
wasm_i32x4_add(
|
3062
|
-
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
3063
|
-
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
3064
|
-
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
3065
|
-
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
3066
|
-
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
|
3067
|
-
}
|
3068
|
-
|
3069
|
-
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
3070
|
-
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
|
3071
|
-
#elif defined(__AVX2__)
|
3072
|
-
// Initialize accumulator with zeros
|
3073
|
-
__m256 acc = _mm256_setzero_ps();
|
3074
|
-
|
3075
|
-
// Main loop
|
3076
|
-
for (int i = 0; i < nb; i++) {
|
3077
|
-
/* Compute combined scale for the block */
|
3078
|
-
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
3079
|
-
|
3080
|
-
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
3081
|
-
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
3082
|
-
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
3083
|
-
bx = _mm256_or_si256(bx, bxhi);
|
3084
|
-
|
3085
|
-
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
3086
|
-
|
3087
|
-
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
+#define GGML_F32x8_ADD _mm256_add_ps
+#define GGML_F32x8_MUL _mm256_mul_ps
+#define GGML_F32x8_REDUCE(res, x) \
+do { \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
+    } \
+    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
+                                 _mm256_extractf128_ps(x[0], 1)); \
+    const __m128 t1 = _mm_hadd_ps(t0, t0); \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
+} while (0)
+// TODO: is this optimal ?
 
-
-
-
+#define GGML_F32_VEC GGML_F32x8
+#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
 
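Each of the GGML_F32xN_REDUCE / GGML_F16xN_REDUCE macros in this block collapses GGML_F32_ARR (or GGML_F16_ARR) partial-sum registers into a single scalar: the upper half of the register array is repeatedly folded into the lower half, then the last surviving register is summed horizontally. A scalar model of that reduction, written as a standalone sketch (the 8x4 accumulator shape and the fill values below are illustrative, not taken from the diff), is:

    #include <stdio.h>

    // Scalar model of the pairwise reduction used by the GGML_*_REDUCE macros:
    // fold the upper half of the accumulator array into the lower half until one
    // "register" remains, then sum its lanes horizontally.
    static float reduce_accumulators(float acc[][4], int n_acc) {
        for (int offset = n_acc >> 1; offset > 0; offset >>= 1) {
            for (int i = 0; i < offset; ++i) {
                for (int lane = 0; lane < 4; ++lane) {
                    acc[i][lane] += acc[offset + i][lane];   // x[i] = add(x[i], x[offset+i])
                }
            }
        }
        // the _mm_hadd_ps / vaddvq step: horizontal add of the surviving lanes
        return acc[0][0] + acc[0][1] + acc[0][2] + acc[0][3];
    }

    int main(void) {
        float acc[8][4];                       // 8 accumulators of 4 lanes each
        for (int i = 0; i < 8; ++i)
            for (int lane = 0; lane < 4; ++lane)
                acc[i][lane] = 1.0f;           // dummy partial sums
        printf("%f\n", reduce_accumulators(acc, 8)); // prints 32.000000
        return 0;
    }

The real macros unroll the halving because the register count is a small compile-time constant; the loop form above is only meant to show the shape of the computation.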
3093
|
-
|
3094
|
-
#elif defined(__AVX__)
|
3095
|
-
// Initialize accumulator with zeros
|
3096
|
-
__m256 acc = _mm256_setzero_ps();
|
3097
|
-
__m128i mask = _mm_set1_epi8((char)0xF0);
|
742
|
+
// F16 AVX
|
3098
743
|
|
3099
|
-
|
3100
|
-
|
3101
|
-
/* Compute combined scale for the block */
|
3102
|
-
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
744
|
+
#define GGML_F16_STEP 32
|
745
|
+
#define GGML_F16_EPR 8
|
3103
746
|
|
3104
|
-
|
3105
|
-
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
3106
|
-
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
3107
|
-
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
3108
|
-
bxhil = _mm_andnot_si128(bxhil, mask);
|
3109
|
-
bxhih = _mm_andnot_si128(bxhih, mask);
|
3110
|
-
__m128i bxl = _mm256_castsi256_si128(bx);
|
3111
|
-
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
3112
|
-
bxl = _mm_or_si128(bxl, bxhil);
|
3113
|
-
bxh = _mm_or_si128(bxh, bxhih);
|
3114
|
-
bx = MM256_SET_M128I(bxh, bxl);
|
747
|
+
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
3115
748
|
|
3116
|
-
|
749
|
+
#define GGML_F32Cx8 __m256
|
750
|
+
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
|
751
|
+
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
|
3117
752
|
|
3118
|
-
|
753
|
+
#if defined(__F16C__)
|
754
|
+
// the _mm256_cvt intrinsics require F16C
|
755
|
+
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
|
756
|
+
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
757
|
+
#else
|
758
|
+
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
|
759
|
+
float tmp[8];
|
3119
760
|
|
3120
|
-
|
3121
|
-
|
761
|
+
for (int i = 0; i < 8; i++) {
|
762
|
+
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
3122
763
|
}
|
3123
764
|
|
3124
|
-
|
3125
|
-
|
3126
|
-
|
765
|
+
return _mm256_loadu_ps(tmp);
|
766
|
+
}
|
767
|
+
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
768
|
+
float arr[8];
|
3127
769
|
|
3128
|
-
|
770
|
+
_mm256_storeu_ps(arr, y);
|
3129
771
|
|
3130
|
-
|
772
|
+
for (int i = 0; i < 8; i++)
|
773
|
+
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
774
|
+
}
|
775
|
+
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
|
776
|
+
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
|
777
|
+
#endif
|
3131
778
|
|
3132
|
-
|
3133
|
-
|
3134
|
-
|
779
|
+
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
780
|
+
#define GGML_F32Cx8_ADD _mm256_add_ps
|
781
|
+
#define GGML_F32Cx8_MUL _mm256_mul_ps
|
782
|
+
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
|
3135
783
|
|
3136
|
-
|
3137
|
-
|
784
|
+
#define GGML_F16_VEC GGML_F32Cx8
|
785
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
|
786
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
|
787
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
|
788
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
|
789
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
|
790
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
|
791
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
|
792
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
|
3138
793
|
|
3139
|
-
|
3140
|
-
memcpy(&qh, x[i].qh, sizeof(uint32_t));
|
794
|
+
#elif defined(__POWER9_VECTOR__)
|
3141
795
|
|
3142
|
-
|
3143
|
-
vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
|
3144
|
-
vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
|
3145
|
-
vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
|
796
|
+
#define GGML_SIMD
|
3146
797
|
|
3147
|
-
|
3148
|
-
vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
|
3149
|
-
vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
|
798
|
+
// F32 POWER9
|
3150
799
|
|
3151
|
-
|
3152
|
-
|
3153
|
-
vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
|
800
|
+
#define GGML_F32_STEP 32
|
801
|
+
#define GGML_F32_EPR 4
|
3154
802
|
|
3155
|
-
|
3156
|
-
|
803
|
+
#define GGML_F32x4 vector float
|
804
|
+
#define GGML_F32x4_ZERO 0.0f
|
805
|
+
#define GGML_F32x4_SET1 vec_splats
|
806
|
+
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
|
807
|
+
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
|
808
|
+
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
|
809
|
+
#define GGML_F32x4_ADD vec_add
|
810
|
+
#define GGML_F32x4_MUL vec_mul
|
811
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
812
|
+
{ \
|
813
|
+
int offset = GGML_F32_ARR >> 1; \
|
814
|
+
for (int i = 0; i < offset; ++i) { \
|
815
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
816
|
+
} \
|
817
|
+
offset >>= 1; \
|
818
|
+
for (int i = 0; i < offset; ++i) { \
|
819
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
820
|
+
} \
|
821
|
+
offset >>= 1; \
|
822
|
+
for (int i = 0; i < offset; ++i) { \
|
823
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
824
|
+
} \
|
825
|
+
res = vec_extract(x[0], 0) + \
|
826
|
+
vec_extract(x[0], 1) + \
|
827
|
+
vec_extract(x[0], 2) + \
|
828
|
+
vec_extract(x[0], 3); \
|
829
|
+
}
|
3157
830
|
|
3158
|
-
|
3159
|
-
|
831
|
+
#define GGML_F32_VEC GGML_F32x4
|
832
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
833
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
834
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
835
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
836
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
837
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
838
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
839
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
3160
840
|
|
3161
|
-
|
3162
|
-
|
841
|
+
// F16 POWER9
|
842
|
+
#define GGML_F16_STEP GGML_F32_STEP
|
843
|
+
#define GGML_F16_EPR GGML_F32_EPR
|
844
|
+
#define GGML_F16_VEC GGML_F32x4
|
845
|
+
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
|
846
|
+
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
|
847
|
+
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
|
848
|
+
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
849
|
+
// Use vec_xl, not vec_ld, in case the load address is not aligned.
|
850
|
+
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
|
851
|
+
vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
|
852
|
+
vec_extract_fp32_from_shortl(vec_xl(0, p))
|
853
|
+
#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
|
854
|
+
#define GGML_F16_VEC_STORE(p, r, i) \
|
855
|
+
if (i & 0x1) \
|
856
|
+
vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
|
857
|
+
r[i - GGML_ENDIAN_BYTE(0)]), \
|
858
|
+
0, p - GGML_F16_EPR)
|
3163
859
|
|
3164
|
-
|
3165
|
-
vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
|
860
|
+
#elif defined(__wasm_simd128__)
|
3166
861
|
|
3167
|
-
|
3168
|
-
vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
|
862
|
+
#define GGML_SIMD
|
3169
863
|
|
3170
|
-
|
3171
|
-
vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
|
864
|
+
// F32 WASM
|
3172
865
|
|
3173
|
-
|
3174
|
-
|
866
|
+
#define GGML_F32_STEP 16
|
867
|
+
#define GGML_F32_EPR 4
|
3175
868
|
|
3176
|
-
|
3177
|
-
|
869
|
+
#define GGML_F32x4 v128_t
|
870
|
+
#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
|
871
|
+
#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
|
872
|
+
#define GGML_F32x4_LOAD wasm_v128_load
|
873
|
+
#define GGML_F32x4_STORE wasm_v128_store
|
874
|
+
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
|
875
|
+
#define GGML_F32x4_ADD wasm_f32x4_add
|
876
|
+
#define GGML_F32x4_MUL wasm_f32x4_mul
|
877
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
878
|
+
{ \
|
879
|
+
int offset = GGML_F32_ARR >> 1; \
|
880
|
+
for (int i = 0; i < offset; ++i) { \
|
881
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
882
|
+
} \
|
883
|
+
offset >>= 1; \
|
884
|
+
for (int i = 0; i < offset; ++i) { \
|
885
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
886
|
+
} \
|
887
|
+
offset >>= 1; \
|
888
|
+
for (int i = 0; i < offset; ++i) { \
|
889
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
890
|
+
} \
|
891
|
+
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
892
|
+
wasm_f32x4_extract_lane(x[0], 1) + \
|
893
|
+
wasm_f32x4_extract_lane(x[0], 2) + \
|
894
|
+
wasm_f32x4_extract_lane(x[0], 3); \
|
895
|
+
}
|
3178
896
|
|
3179
|
-
|
897
|
+
#define GGML_F32_VEC GGML_F32x4
|
898
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
899
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
900
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
901
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
902
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
903
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
904
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
905
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
3180
906
|
|
3181
|
-
|
3182
|
-
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
|
907
|
+
// F16 WASM
|
3183
908
|
|
3184
|
-
|
909
|
+
#define GGML_F16_STEP 16
|
910
|
+
#define GGML_F16_EPR 4
|
3185
911
|
|
3186
|
-
|
3187
|
-
|
912
|
+
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
|
913
|
+
float tmp[4];
|
3188
914
|
|
3189
|
-
|
3190
|
-
|
3191
|
-
|
3192
|
-
|
915
|
+
tmp[0] = GGML_FP16_TO_FP32(p[0]);
|
916
|
+
tmp[1] = GGML_FP16_TO_FP32(p[1]);
|
917
|
+
tmp[2] = GGML_FP16_TO_FP32(p[2]);
|
918
|
+
tmp[3] = GGML_FP16_TO_FP32(p[3]);
|
3193
919
|
|
3194
|
-
|
3195
|
-
|
3196
|
-
memcpy(&qh, x[i].qh, sizeof(qh));
|
920
|
+
return wasm_v128_load(tmp);
|
921
|
+
}
|
3197
922
|
|
3198
|
-
|
923
|
+
inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
924
|
+
float tmp[4];
|
3199
925
|
|
3200
|
-
|
3201
|
-
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
3202
|
-
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
926
|
+
wasm_v128_store(tmp, x);
|
3203
927
|
|
3204
|
-
|
3205
|
-
|
928
|
+
p[0] = GGML_FP32_TO_FP16(tmp[0]);
|
929
|
+
p[1] = GGML_FP32_TO_FP16(tmp[1]);
|
930
|
+
p[2] = GGML_FP32_TO_FP16(tmp[2]);
|
931
|
+
p[3] = GGML_FP32_TO_FP16(tmp[3]);
|
932
|
+
}
|
3206
933
|
|
3207
|
-
|
3208
|
-
|
934
|
+
#define GGML_F16x4 v128_t
|
935
|
+
#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
|
936
|
+
#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
|
937
|
+
#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
|
938
|
+
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
|
939
|
+
#define GGML_F16x4_FMA GGML_F32x4_FMA
|
940
|
+
#define GGML_F16x4_ADD wasm_f32x4_add
|
941
|
+
#define GGML_F16x4_MUL wasm_f32x4_mul
|
942
|
+
#define GGML_F16x4_REDUCE(res, x) \
|
943
|
+
{ \
|
944
|
+
int offset = GGML_F16_ARR >> 1; \
|
945
|
+
for (int i = 0; i < offset; ++i) { \
|
946
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
947
|
+
} \
|
948
|
+
offset >>= 1; \
|
949
|
+
for (int i = 0; i < offset; ++i) { \
|
950
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
951
|
+
} \
|
952
|
+
offset >>= 1; \
|
953
|
+
for (int i = 0; i < offset; ++i) { \
|
954
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
955
|
+
} \
|
956
|
+
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
957
|
+
wasm_f32x4_extract_lane(x[0], 1) + \
|
958
|
+
wasm_f32x4_extract_lane(x[0], 2) + \
|
959
|
+
wasm_f32x4_extract_lane(x[0], 3); \
|
960
|
+
}
|
3209
961
|
|
3210
|
-
|
3211
|
-
|
962
|
+
#define GGML_F16_VEC GGML_F16x4
|
963
|
+
#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
|
964
|
+
#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
|
965
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
|
966
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
|
967
|
+
#define GGML_F16_VEC_FMA GGML_F16x4_FMA
|
968
|
+
#define GGML_F16_VEC_ADD GGML_F16x4_ADD
|
969
|
+
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
|
970
|
+
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
|
3212
971
|
|
3213
|
-
|
3214
|
-
#endif
|
3215
|
-
}
|
972
|
+
#elif defined(__SSE3__)
|
3216
973
|
|
3217
|
-
|
3218
|
-
const int qk = QK8_1;
|
3219
|
-
const int nb = n / qk;
|
974
|
+
#define GGML_SIMD
|
3220
975
|
|
3221
|
-
|
3222
|
-
assert(qk == QK5_1);
|
976
|
+
// F32 SSE
|
3223
977
|
|
3224
|
-
|
3225
|
-
|
978
|
+
#define GGML_F32_STEP 32
|
979
|
+
#define GGML_F32_EPR 4
|
3226
980
|
|
3227
|
-
#
|
3228
|
-
|
3229
|
-
|
3230
|
-
|
3231
|
-
|
3232
|
-
|
3233
|
-
|
3234
|
-
|
3235
|
-
uint32_t qh1;
|
3236
|
-
|
3237
|
-
uint64_t tmp0[4];
|
3238
|
-
uint64_t tmp1[4];
|
3239
|
-
|
3240
|
-
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
3241
|
-
for (int i = 0; i < nb; i += 2) {
|
3242
|
-
const block_q5_1 * restrict x0 = &x[i];
|
3243
|
-
const block_q5_1 * restrict x1 = &x[i + 1];
|
3244
|
-
const block_q8_1 * restrict y0 = &y[i];
|
3245
|
-
const block_q8_1 * restrict y1 = &y[i + 1];
|
3246
|
-
|
3247
|
-
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
3248
|
-
|
3249
|
-
summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
3250
|
-
summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
|
3251
|
-
|
3252
|
-
// extract the 5th bit via lookup table ((b) << 4)
|
3253
|
-
memcpy(&qh0, x0->qh, sizeof(qh0));
|
3254
|
-
memcpy(&qh1, x1->qh, sizeof(qh1));
|
3255
|
-
|
3256
|
-
tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF];
|
3257
|
-
tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF];
|
3258
|
-
tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
|
3259
|
-
tmp0[3] = table_b2b_0[(qh0 >> 24) ];
|
3260
|
-
|
3261
|
-
tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF];
|
3262
|
-
tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF];
|
3263
|
-
tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
|
3264
|
-
tmp1[3] = table_b2b_0[(qh1 >> 24) ];
|
3265
|
-
|
3266
|
-
const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
|
3267
|
-
const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
|
3268
|
-
const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
|
3269
|
-
const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
|
3270
|
-
|
3271
|
-
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
3272
|
-
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
3273
|
-
|
3274
|
-
// 4-bit -> 8-bit
|
3275
|
-
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
3276
|
-
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
3277
|
-
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
3278
|
-
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
3279
|
-
|
3280
|
-
// add high bit
|
3281
|
-
const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
|
3282
|
-
const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
|
3283
|
-
const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
|
3284
|
-
const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
|
3285
|
-
|
3286
|
-
// load y
|
3287
|
-
const int8x16_t v1_0l = vld1q_s8(y0->qs);
|
3288
|
-
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
|
3289
|
-
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
3290
|
-
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
3291
|
-
|
3292
|
-
#if defined(__ARM_FEATURE_DOTPROD)
|
3293
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
3294
|
-
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
3295
|
-
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
3296
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
3297
|
-
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
3298
|
-
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
981
|
+
#define GGML_F32x4 __m128
|
982
|
+
#define GGML_F32x4_ZERO _mm_setzero_ps()
|
983
|
+
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
|
984
|
+
#define GGML_F32x4_LOAD _mm_loadu_ps
|
985
|
+
#define GGML_F32x4_STORE _mm_storeu_ps
|
986
|
+
#if defined(__FMA__)
|
987
|
+
// TODO: Does this work?
|
988
|
+
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
|
3299
989
|
#else
|
3300
|
-
|
3301
|
-
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
3302
|
-
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
3303
|
-
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
3304
|
-
|
3305
|
-
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
3306
|
-
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
3307
|
-
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
3308
|
-
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
3309
|
-
|
3310
|
-
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
3311
|
-
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
3312
|
-
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
3313
|
-
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
3314
|
-
|
3315
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
3316
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
990
|
+
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
|
3317
991
|
#endif
|
3318
|
-
|
3319
|
-
|
3320
|
-
|
3321
|
-
|
3322
|
-
|
3323
|
-
|
3324
|
-
|
3325
|
-
|
3326
|
-
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
|
3337
|
-
|
3338
|
-
// extract the 5th bit
|
3339
|
-
memcpy(&qh, x0->qh, sizeof(qh));
|
3340
|
-
|
3341
|
-
tmp[0] = table_b2b_0[(qh >> 0) & 0xFF];
|
3342
|
-
tmp[1] = table_b2b_0[(qh >> 8) & 0xFF];
|
3343
|
-
tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
|
3344
|
-
tmp[3] = table_b2b_0[(qh >> 24) ];
|
992
|
+
#define GGML_F32x4_ADD _mm_add_ps
|
993
|
+
#define GGML_F32x4_MUL _mm_mul_ps
|
994
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
995
|
+
{ \
|
996
|
+
int offset = GGML_F32_ARR >> 1; \
|
997
|
+
for (int i = 0; i < offset; ++i) { \
|
998
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
999
|
+
} \
|
1000
|
+
offset >>= 1; \
|
1001
|
+
for (int i = 0; i < offset; ++i) { \
|
1002
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
1003
|
+
} \
|
1004
|
+
offset >>= 1; \
|
1005
|
+
for (int i = 0; i < offset; ++i) { \
|
1006
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
1007
|
+
} \
|
1008
|
+
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
|
1009
|
+
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
|
1010
|
+
}
|
1011
|
+
// TODO: is this optimal ?
|
3345
1012
|
|
3346
|
-
|
3347
|
-
|
1013
|
+
#define GGML_F32_VEC GGML_F32x4
|
1014
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
1015
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
1016
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
1017
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
1018
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
1019
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
1020
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
1021
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
3348
1022
|
|
3349
|
-
|
1023
|
+
// F16 SSE
|
3350
1024
|
|
3351
|
-
|
3352
|
-
|
3353
|
-
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
1025
|
+
#define GGML_F16_STEP 32
|
1026
|
+
#define GGML_F16_EPR 4
|
3354
1027
|
|
3355
|
-
|
3356
|
-
|
3357
|
-
const v128_t v0hf = wasm_v128_or(v0h, qhh);
|
1028
|
+
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
|
1029
|
+
float tmp[4];
|
3358
1030
|
|
3359
|
-
|
3360
|
-
|
3361
|
-
|
1031
|
+
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
1032
|
+
tmp[1] = GGML_FP16_TO_FP32(x[1]);
|
1033
|
+
tmp[2] = GGML_FP16_TO_FP32(x[2]);
|
1034
|
+
tmp[3] = GGML_FP16_TO_FP32(x[3]);
|
3362
1035
|
|
3363
|
-
|
3364
|
-
|
3365
|
-
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
3366
|
-
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
3367
|
-
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
1036
|
+
return _mm_loadu_ps(tmp);
|
1037
|
+
}
|
3368
1038
|
|
3369
|
-
|
3370
|
-
|
3371
|
-
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
3372
|
-
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
1039
|
+
static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
1040
|
+
float arr[4];
|
3373
1041
|
|
3374
|
-
|
3375
|
-
sumv = wasm_f32x4_add(sumv,
|
3376
|
-
wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
|
3377
|
-
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
3378
|
-
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
3379
|
-
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
3380
|
-
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
3381
|
-
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
|
3382
|
-
}
|
1042
|
+
_mm_storeu_ps(arr, y);
|
3383
1043
|
|
3384
|
-
|
3385
|
-
|
3386
|
-
|
3387
|
-
|
3388
|
-
|
1044
|
+
x[0] = GGML_FP32_TO_FP16(arr[0]);
|
1045
|
+
x[1] = GGML_FP32_TO_FP16(arr[1]);
|
1046
|
+
x[2] = GGML_FP32_TO_FP16(arr[2]);
|
1047
|
+
x[3] = GGML_FP32_TO_FP16(arr[3]);
|
1048
|
+
}
|
3389
1049
|
|
3390
|
-
|
1050
|
+
#define GGML_F32Cx4 __m128
|
1051
|
+
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
|
1052
|
+
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
|
1053
|
+
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
|
1054
|
+
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
|
1055
|
+
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
1056
|
+
#define GGML_F32Cx4_ADD _mm_add_ps
|
1057
|
+
#define GGML_F32Cx4_MUL _mm_mul_ps
|
1058
|
+
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
3391
1059
|
|
3392
|
-
|
3393
|
-
|
3394
|
-
|
1060
|
+
#define GGML_F16_VEC GGML_F32Cx4
|
1061
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
1062
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
1063
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
1064
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
|
1065
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
1066
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
1067
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
1068
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
3395
1069
|
|
3396
|
-
|
1070
|
+
#endif
|
3397
1071
|
|
3398
|
-
|
3399
|
-
|
3400
|
-
|
3401
|
-
|
1072
|
+
// GGML_F32_ARR / GGML_F16_ARR
|
1073
|
+
// number of registers to use per step
|
1074
|
+
#ifdef GGML_SIMD
|
1075
|
+
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
|
1076
|
+
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
1077
|
+
#endif
|
3402
1078
|
|
3403
|
-
|
3404
|
-
|
1079
|
+
//
|
1080
|
+
// fundamental operations
|
1081
|
+
//
|
3405
1082
|
|
3406
|
-
|
1083
|
+
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
3407
1084
|
|
3408
|
-
|
3409
|
-
}
|
1085
|
+
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
3410
1086
|
|
3411
|
-
|
3412
|
-
#elif defined(__AVX__)
|
3413
|
-
// Initialize accumulator with zeros
|
3414
|
-
__m256 acc = _mm256_setzero_ps();
|
3415
|
-
__m128i mask = _mm_set1_epi8(0x10);
|
1087
|
+
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
3416
1088
|
|
3417
|
-
|
1089
|
+
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
3418
1090
|
|
3419
|
-
|
3420
|
-
|
3421
|
-
|
1091
|
+
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
|
1092
|
+
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
|
1093
|
+
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
|
1094
|
+
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
|
1095
|
+
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
|
1096
|
+
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
1097
|
+
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
|
1098
|
+
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
|
1099
|
+
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
1100
|
+
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
3422
1101
|
|
3423
|
-
|
1102
|
+
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
|
1103
|
+
#ifdef GGML_SIMD
|
1104
|
+
float sumf = 0.0f;
|
1105
|
+
const int np = (n & ~(GGML_F32_STEP - 1));
|
3424
1106
|
|
3425
|
-
|
3426
|
-
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
3427
|
-
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
3428
|
-
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
3429
|
-
bxhil = _mm_and_si128(bxhil, mask);
|
3430
|
-
bxhih = _mm_and_si128(bxhih, mask);
|
3431
|
-
__m128i bxl = _mm256_castsi256_si128(bx);
|
3432
|
-
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
3433
|
-
bxl = _mm_or_si128(bxl, bxhil);
|
3434
|
-
bxh = _mm_or_si128(bxh, bxhih);
|
3435
|
-
bx = MM256_SET_M128I(bxh, bxl);
|
1107
|
+
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
3436
1108
|
|
3437
|
-
|
3438
|
-
|
1109
|
+
GGML_F32_VEC ax[GGML_F32_ARR];
|
1110
|
+
GGML_F32_VEC ay[GGML_F32_ARR];
|
3439
1111
|
|
3440
|
-
|
1112
|
+
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
1113
|
+
for (int j = 0; j < GGML_F32_ARR; j++) {
|
1114
|
+
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
1115
|
+
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
3441
1116
|
|
3442
|
-
|
1117
|
+
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
|
1118
|
+
}
|
3443
1119
|
}
|
3444
1120
|
|
3445
|
-
|
3446
|
-
|
3447
|
-
float sumf = 0.0;
|
3448
|
-
|
3449
|
-
uint32_t qh;
|
3450
|
-
|
3451
|
-
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
3452
|
-
|
3453
|
-
// temporary registers for shift operations
|
3454
|
-
vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
|
3455
|
-
vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
|
3456
|
-
|
3457
|
-
for (int i = 0; i < nb; i++) {
|
3458
|
-
memcpy(&qh, x[i].qh, sizeof(uint32_t));
|
3459
|
-
|
3460
|
-
// load qh
|
3461
|
-
vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
|
3462
|
-
|
3463
|
-
// ((qh >> (j + 0)) << 4) & 0x10;
|
3464
|
-
vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
|
3465
|
-
vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
|
3466
|
-
vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
|
3467
|
-
|
3468
|
-
// ((qh >> (j + 12)) ) & 0x10;
|
3469
|
-
vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
|
3470
|
-
vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
|
3471
|
-
|
3472
|
-
// narrowing
|
3473
|
-
vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
|
3474
|
-
vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
|
3475
|
-
|
3476
|
-
vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
|
3477
|
-
vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
|
3478
|
-
|
3479
|
-
// load
|
3480
|
-
vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
|
3481
|
-
|
3482
|
-
vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
|
3483
|
-
vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
|
3484
|
-
|
3485
|
-
vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
|
3486
|
-
vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
|
3487
|
-
|
3488
|
-
vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
|
3489
|
-
vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
|
3490
|
-
|
3491
|
-
vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
|
3492
|
-
vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
|
3493
|
-
|
3494
|
-
vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
|
3495
|
-
vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
|
3496
|
-
|
3497
|
-
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
3498
|
-
|
3499
|
-
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
|
3500
|
-
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
|
3501
|
-
|
3502
|
-
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
1121
|
+
// reduce sum0..sum3 to sum0
|
1122
|
+
GGML_F32_VEC_REDUCE(sumf, sum);
|
3503
1123
|
|
3504
|
-
|
1124
|
+
// leftovers
|
1125
|
+
for (int i = np; i < n; ++i) {
|
1126
|
+
sumf += x[i]*y[i];
|
3505
1127
|
}
|
3506
|
-
|
3507
|
-
*s = sumf;
|
3508
1128
|
#else
|
3509
1129
|
// scalar
|
3510
|
-
|
3511
|
-
|
3512
|
-
|
3513
|
-
uint32_t qh;
|
3514
|
-
memcpy(&qh, x[i].qh, sizeof(qh));
|
3515
|
-
|
3516
|
-
int sumi = 0;
|
3517
|
-
|
3518
|
-
for (int j = 0; j < qk/2; ++j) {
|
3519
|
-
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
3520
|
-
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
3521
|
-
|
3522
|
-
const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0;
|
3523
|
-
const int32_t x1 = (x[i].qs[j] >> 4) | xh_1;
|
3524
|
-
|
3525
|
-
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
3526
|
-
}
|
3527
|
-
|
3528
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
1130
|
+
ggml_float sumf = 0.0;
|
1131
|
+
for (int i = 0; i < n; ++i) {
|
1132
|
+
sumf += (ggml_float)(x[i]*y[i]);
|
3529
1133
|
}
|
1134
|
+
#endif
|
3530
1135
|
|
3531
1136
|
*s = sumf;
|
3532
|
-
#endif
|
3533
1137
|
}
|
3534
1138
|
|
3535
|
-
static void
|
3536
|
-
|
3537
|
-
const int nb = n / qk;
|
3538
|
-
|
3539
|
-
assert(n % qk == 0);
|
3540
|
-
|
3541
|
-
const block_q8_0 * restrict x = vx;
|
3542
|
-
const block_q8_0 * restrict y = vy;
|
3543
|
-
|
3544
|
-
#if defined(__ARM_NEON)
|
3545
|
-
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
3546
|
-
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
3547
|
-
|
3548
|
-
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
3549
|
-
for (int i = 0; i < nb; i += 2) {
|
3550
|
-
const block_q8_0 * restrict x0 = &x[i + 0];
|
3551
|
-
const block_q8_0 * restrict x1 = &x[i + 1];
|
3552
|
-
const block_q8_0 * restrict y0 = &y[i + 0];
|
3553
|
-
const block_q8_0 * restrict y1 = &y[i + 1];
|
3554
|
-
|
3555
|
-
const int8x16_t x0_0 = vld1q_s8(x0->qs);
|
3556
|
-
const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
|
3557
|
-
const int8x16_t x1_0 = vld1q_s8(x1->qs);
|
3558
|
-
const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
|
3559
|
-
|
3560
|
-
// load y
|
3561
|
-
const int8x16_t y0_0 = vld1q_s8(y0->qs);
|
3562
|
-
const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
|
3563
|
-
const int8x16_t y1_0 = vld1q_s8(y1->qs);
|
3564
|
-
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
3565
|
-
|
3566
|
-
#if defined(__ARM_FEATURE_DOTPROD)
|
3567
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
3568
|
-
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
3569
|
-
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
3570
|
-
|
3571
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
3572
|
-
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
3573
|
-
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
1139
|
+
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
|
1140
|
+
ggml_float sumf = 0.0;
|
3574
1141
|
|
3575
|
-
#
|
3576
|
-
|
3577
|
-
const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
|
3578
|
-
const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
|
3579
|
-
const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
|
3580
|
-
|
3581
|
-
const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
|
3582
|
-
const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
|
3583
|
-
const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
|
3584
|
-
const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
|
3585
|
-
|
3586
|
-
const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
|
3587
|
-
const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
|
3588
|
-
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
|
3589
|
-
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
|
3590
|
-
|
3591
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
3592
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
3593
|
-
#endif
|
3594
|
-
}
|
1142
|
+
#if defined(GGML_SIMD)
|
1143
|
+
const int np = (n & ~(GGML_F16_STEP - 1));
|
3595
1144
|
|
3596
|
-
|
3597
|
-
#elif defined(__AVX2__) || defined(__AVX__)
|
3598
|
-
// Initialize accumulator with zeros
|
3599
|
-
__m256 acc = _mm256_setzero_ps();
|
1145
|
+
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
|
3600
1146
|
|
3601
|
-
|
3602
|
-
|
3603
|
-
// Compute combined scale for the block
|
3604
|
-
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
3605
|
-
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
3606
|
-
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
1147
|
+
GGML_F16_VEC ax[GGML_F16_ARR];
|
1148
|
+
GGML_F16_VEC ay[GGML_F16_ARR];
|
3607
1149
|
|
3608
|
-
|
1150
|
+
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
1151
|
+
for (int j = 0; j < GGML_F16_ARR; j++) {
|
1152
|
+
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
1153
|
+
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
3609
1154
|
|
3610
|
-
|
3611
|
-
|
3612
|
-
acc = _mm256_fmadd_ps( d, q, acc );
|
3613
|
-
#else
|
3614
|
-
acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
|
3615
|
-
#endif
|
1155
|
+
sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
|
1156
|
+
}
|
3616
1157
|
}
|
3617
1158
|
|
3618
|
-
|
3619
|
-
|
3620
|
-
float sumf = 0.0;
|
3621
|
-
size_t vl = __riscv_vsetvl_e8m1(qk);
|
3622
|
-
|
3623
|
-
for (int i = 0; i < nb; i++) {
|
3624
|
-
// load elements
|
3625
|
-
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
|
3626
|
-
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
3627
|
-
|
3628
|
-
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
|
3629
|
-
|
3630
|
-
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
3631
|
-
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
|
3632
|
-
|
3633
|
-
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
|
1159
|
+
// reduce sum0..sum3 to sum0
|
1160
|
+
GGML_F16_VEC_REDUCE(sumf, sum);
|
3634
1161
|
|
3635
|
-
|
1162
|
+
// leftovers
|
1163
|
+
for (int i = np; i < n; ++i) {
|
1164
|
+
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
|
3636
1165
|
}
|
3637
|
-
|
3638
|
-
*s = sumf;
|
3639
1166
|
#else
|
3640
|
-
|
3641
|
-
|
3642
|
-
|
3643
|
-
for (int i = 0; i < nb; i++) {
|
3644
|
-
int sumi = 0;
|
3645
|
-
|
3646
|
-
for (int j = 0; j < qk; j++) {
|
3647
|
-
sumi += x[i].qs[j]*y[i].qs[j];
|
3648
|
-
}
|
3649
|
-
|
3650
|
-
sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
|
1167
|
+
for (int i = 0; i < n; ++i) {
|
1168
|
+
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
|
3651
1169
|
}
|
1170
|
+
#endif
|
3652
1171
|
|
3653
1172
|
*s = sumf;
|
3654
|
-
#endif
|
3655
1173
|
}
|
3656
1174
|
|
3657
1175
|
// compute GGML_VEC_DOT_UNROLL dot products at once
|
@@ -3846,7 +1364,7 @@ inline static float ggml_gelu_f32(float x) {
 inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     const uint16_t * i16 = (const uint16_t *) x;
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = ggml_table_gelu_f16[i16[i]];
     }
 }
 
@@ -3856,7 +1374,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     for (int i = 0; i < n; ++i) {
         ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(
+        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
     }
 }
 #else
@@ -3874,7 +1392,7 @@ inline static float ggml_gelu_quick_f32(float x) {
 //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
 //    const uint16_t * i16 = (const uint16_t *) x;
 //    for (int i = 0; i < n; ++i) {
-//        y[i] =
+//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
 //    }
 //}
 
@@ -3884,7 +1402,7 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
     for (int i = 0; i < n; ++i) {
         ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(
+        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
     }
 }
 #else
@@ -3903,7 +1421,7 @@ inline static float ggml_silu_f32(float x) {
 //inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
 //    const uint16_t * i16 = (const uint16_t *) x;
 //    for (int i = 0; i < n; ++i) {
-//        y[i] =
+//        y[i] = ggml_table_silu_f16[i16[i]];
 //    }
 //}
 
@@ -3913,7 +1431,7 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     for (int i = 0; i < n; ++i) {
         ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(
+        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
     }
 }
 #else
@@ -4629,11 +2147,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         for (int i = 0; i < (1 << 16); ++i) {
             uint16_t ui = i;
             memcpy(&ii, &ui, sizeof(ii));
-            const float f =
-
-
-
-
+            const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
+            ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+            ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+            ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
+            ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
         }
 
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
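This loop walks every possible 16-bit pattern once at init time and fills the ggml_table_* arrays, so the activation kernels elsewhere in this file (GELU, SiLU, exp in soft-max) become a single indexed load on the fp16 bit pattern instead of a libm call. A reduced sketch of the same precompute-then-lookup pattern — the table name, the float-valued storage and the simplified half-to-float conversion below are stand-ins, not the ggml implementation — could look like:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // stand-in for GGML_COMPUTE_FP16_TO_FP32: widen an IEEE half bit pattern
    // (simplified; subnormals are flushed to zero, which is close enough for a sketch)
    static float half_bits_to_float(uint16_t h) {
        uint32_t sign = (uint32_t)(h >> 15) << 31;
        uint32_t exp  = (h >> 10) & 0x1F;
        uint32_t mant = h & 0x3FF;
        uint32_t bits;
        if (exp == 0)       bits = sign;                                  // zero/subnormal -> 0
        else if (exp == 31) bits = sign | 0x7F800000u | (mant << 13);     // inf/NaN
        else                bits = sign | ((exp + 112) << 23) | (mant << 13);
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    static float table_exp_f32[1 << 16];   // hypothetical table, analogous to ggml_table_exp_f16

    int main(void) {
        // precompute: one pass over all 65536 half-precision bit patterns
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            table_exp_f32[i] = expf(half_bits_to_float((uint16_t)i));
        }
        // lookup: 0x3C00 is the half-precision encoding of 1.0f
        printf("exp(1.0) ~ %f\n", table_exp_f32[0x3C00]); // ~2.718282
        return 0;
    }

The real tables store fp16 results (half the memory) and the index is obtained by memcpy-ing the fp16 value into a uint16_t, exactly as the hunks above show.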
@@ -5636,7 +3154,7 @@ static struct ggml_tensor * ggml_add_cast_impl(
     // TODO: support less-strict constraint
     //       GGML_ASSERT(ggml_can_repeat(b, a));
     GGML_ASSERT(ggml_can_repeat_rows(b, a));
-    GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input
+    GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
 
     bool is_node = false;
 
@@ -7328,8 +4846,13 @@ static struct ggml_tensor * ggml_rope_impl(
         int n_dims,
         int mode,
         int n_ctx,
+        int n_orig_ctx,
         float freq_base,
         float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow,
         float xpos_base,
         bool xpos_down,
         bool inplace) {
@@ -7345,11 +4868,15 @@ static struct ggml_tensor * ggml_rope_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[
-    memcpy(params +
-    memcpy(params +
-    memcpy(params +
-    memcpy(params +
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    memcpy(params + 5, &freq_base, sizeof(float));
+    memcpy(params + 6, &freq_scale, sizeof(float));
+    memcpy(params + 7, &ext_factor, sizeof(float));
+    memcpy(params + 8, &attn_factor, sizeof(float));
+    memcpy(params + 9, &beta_fast, sizeof(float));
+    memcpy(params + 10, &beta_slow, sizeof(float));
+    memcpy(params + 11, &xpos_base, sizeof(float));
+    memcpy(params + 12, &xpos_down, sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
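The packing above leaves the ROPE node with 13 int32 op-param slots: five integer fields up front, then eight 4-byte payloads copied in with memcpy (seven floats plus the xpos_down bool). A consumer has to read them back out the same way; a minimal round-trip sketch (slot indices follow the hunk above, the concrete values are made up) is:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        // pack, mirroring ggml_rope_impl: integer fields go in directly,
        // float fields are byte-copied into int32 slots
        int32_t params[13] = { /*n_past*/ 0, /*n_dims*/ 128, /*mode*/ 0, /*n_ctx*/ 4096, /*n_orig_ctx*/ 4096 };
        float freq_base = 10000.0f, ext_factor = 1.0f;
        memcpy(params + 5, &freq_base,  sizeof(float));
        memcpy(params + 7, &ext_factor, sizeof(float));

        // unpack on the compute side: memcpy back out rather than casting
        float fb, ef;
        memcpy(&fb, params + 5, sizeof(float));
        memcpy(&ef, params + 7, sizeof(float));
        printf("freq_base=%.1f ext_factor=%.1f\n", fb, ef); // 10000.0 1.0
        return 0;
    }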
@@ -7367,7 +4894,9 @@ struct ggml_tensor * ggml_rope(
         int n_dims,
         int mode,
         int n_ctx) {
-    return ggml_rope_impl(
+    return ggml_rope_impl(
+        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
+    );
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -7377,7 +4906,9 @@ struct ggml_tensor * ggml_rope_inplace(
         int n_dims,
         int mode,
         int n_ctx) {
-    return ggml_rope_impl(
+    return ggml_rope_impl(
+        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
+    );
 }
 
 struct ggml_tensor * ggml_rope_custom(
@@ -7387,9 +4918,17 @@ struct ggml_tensor * ggml_rope_custom(
         int n_dims,
         int mode,
         int n_ctx,
+        int n_orig_ctx,
         float freq_base,
-        float freq_scale
-
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+    );
 }
 
 struct ggml_tensor * ggml_rope_custom_inplace(
@@ -7399,9 +4938,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         int n_dims,
         int mode,
         int n_ctx,
+        int n_orig_ctx,
         float freq_base,
-        float freq_scale
-
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+    );
 }
 
 struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -7411,7 +4958,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace(
         int n_dims,
         float base,
         bool down) {
-    return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
+    return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
 }
 
 // ggml_rope_back
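Taken together, the wrappers above keep the old entry points source-compatible (every new argument is passed as 0 or a neutral default) and route extended-context scaling through ggml_rope_custom. A hypothetical helper showing the new call shape — the tensor roles, the 4k→16k scaling numbers and the beta values are illustrative assumptions, not values this diff prescribes — might read:

    #include "ggml.h"

    // Build a YaRN-scaled RoPE node for tensor `cur` using position tensor `pos`.
    // All numeric choices here are example defaults for a sketch, not mandated by ggml.
    struct ggml_tensor * make_yarn_rope(struct ggml_context * ctx,
                                        struct ggml_tensor  * cur,   // e.g. the Q or K tensor
                                        struct ggml_tensor  * pos) { // I32 tensor of token positions
        const int   n_dims      = 128;     // rotary dimensions (head size), assumed
        const int   mode        = 0;       // standard RoPE
        const int   n_ctx       = 16384;   // scaled context we want to run at
        const int   n_orig_ctx  = 4096;    // context the model was trained with
        const float freq_base   = 10000.0f;
        const float freq_scale  = (float) n_orig_ctx / n_ctx;  // 0.25
        const float ext_factor  = 1.0f;    // enable the YaRN ramp mixing
        const float attn_factor = 1.0f;
        const float beta_fast   = 32.0f;
        const float beta_slow   = 1.0f;

        return ggml_rope_custom(ctx, cur, pos, n_dims, mode, n_ctx, n_orig_ctx,
                                freq_base, freq_scale, ext_factor, attn_factor,
                                beta_fast, beta_slow);
    }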
@@ -9410,9 +6957,15 @@ static void ggml_compute_forward_add_f16_f32(
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16);
 
-
+    if (dst->type == GGML_TYPE_F32) {
+        GGML_ASSERT( nb0 == sizeof(float));
+    }
+    else {
+        GGML_ASSERT(dst->type == GGML_TYPE_F16);
+        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    }
+
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
 
     // rows per thread
@@ -9423,18 +6976,35 @@ static void ggml_compute_forward_add_f16_f32(
     const int ir1 = MIN(ir0 + dr, nr);
 
     if (nb10 == sizeof(float)) {
-
-
-
-
-
-
-
-
-
-
-
-
+        if (dst->type == GGML_TYPE_F16) {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+                }
+            }
+        } else {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+                }
             }
         }
     }
@@ -12996,7 +10566,7 @@ static void ggml_compute_forward_soft_max_f32(
                 // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
                 ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(
+                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                 sum += (ggml_float)val;
                 dp[i] = val;
             }
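Only the table name changes in this hunk (and in the flash-attention and cross-entropy hunks further down): the fp16 exp lookup table is now called ggml_table_exp_f16. For readers new to the trick, here is a standalone sketch of the idea, not ggml's code: the half-float helpers are simplified stand-ins for GGML_FP32_TO_FP16 / GGML_FP16_TO_FP32 (no rounding, subnormals treated as zero), and the table caches expf() for every possible 16-bit pattern so the softmax inner loop can replace a libm call with a memcpy and one array load.

/* Standalone sketch of the fp16 exp-table trick; not ggml's implementation. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float half_bits_to_float(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t exp  = (h >> 10) & 0x1f;
    const uint32_t man  = (uint32_t)(h & 0x3ff);
    uint32_t bits;
    if (exp == 0) {
        bits = sign;                              // zero/subnormal -> signed zero (sketch only)
    } else if (exp == 31) {
        bits = sign | 0x7f800000u | (man << 13);  // inf/nan
    } else {
        bits = sign | ((exp + 112) << 23) | (man << 13);
    }
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

static uint16_t float_to_half_bits(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    const uint16_t sign = (uint16_t)((bits >> 16) & 0x8000);
    const int32_t  exp  = (int32_t)((bits >> 23) & 0xff) - 127 + 15;
    if (exp <= 0)  return sign;                      // underflow -> signed zero
    if (exp >= 31) return (uint16_t)(sign | 0x7c00); // overflow  -> inf
    return (uint16_t)(sign | (exp << 10) | ((bits & 0x7fffffu) >> 13)); // truncating round
}

static float table_exp_f16[1 << 16]; // one expf() result per possible half bit pattern

int main(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_exp_f16[i] = expf(half_bits_to_float((uint16_t) i));
    }

    // softmax inner step: exp(x - max) becomes one memcpy plus one table load
    const float x = 1.25f, max = 2.0f;
    const uint16_t s = float_to_half_bits(x - max);
    uint16_t scvt;
    memcpy(&scvt, &s, sizeof(scvt));
    printf("table lookup: %f  expf: %f\n", table_exp_f16[scvt], expf(x - max));
    return 0;
}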
@@ -13361,6 +10931,45 @@ static void ggml_compute_forward_clamp(
 
 // ggml_compute_forward_rope
 
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+    return 1 - MIN(1, MAX(0, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
+    return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+}
+
+void ggml_rope_yarn_corr_dims(
+    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)));
+    dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)));
+}
+
 static void ggml_compute_forward_rope_f32(
     const struct ggml_compute_params * params,
     const struct ggml_tensor * src0,
@@ -13370,21 +10979,26 @@ static void ggml_compute_forward_rope_f32(
         return;
     }
 
-    float freq_base;
-    float freq_scale;
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
     // these two only relevant for xPos RoPE:
     float xpos_base;
    bool xpos_down;
 
-    //const int n_past
-    const int n_dims
-    const int mode
-    const int n_ctx
-
-
-    memcpy(&
-    memcpy(&
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+
+    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
+    memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -13412,6 +11026,9 @@ static void ggml_compute_forward_rope_f32(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.f/n_dims;
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -13425,18 +11042,18 @@ static void ggml_compute_forward_rope_f32(
             if (ir++ < ir0) continue;
            if (ir > ir1) break;
 
-            float
+            float theta_base = (float)p;
 
            if (is_glm) {
-
+                theta_base = MIN(p, n_ctx - 2);
                float block_theta = MAX(p - (n_ctx - 2), 0);
                for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                    const float cos_theta = cosf(
-                    const float sin_theta = sinf(
+                    const float cos_theta = cosf(theta_base);
+                    const float sin_theta = sinf(theta_base);
                    const float cos_block_theta = cosf(block_theta);
                    const float sin_block_theta = sinf(block_theta);
 
-
+                    theta_base *= theta_scale;
                    block_theta *= theta_scale;
 
                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -13454,13 +11071,16 @@ static void ggml_compute_forward_rope_f32(
                }
            } else if (!is_neox) {
                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-
-
+                    float cos_theta, sin_theta;
+                    rope_yarn(
+                        theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
+                    );
+
                    // zeta scaling for xPos only:
                    float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                    if (xpos_down) zeta = 1.0f / zeta;
 
-
+                    theta_base *= theta_scale;
 
                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                    float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -13474,12 +11094,19 @@ static void ggml_compute_forward_rope_f32(
            } else {
                // TODO: this might be wrong for ne0 != n_dims - need double check
                // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                theta_base *= freq_scale;
                for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                    for (int64_t ic = 0; ic < n_dims; ic += 2) {
-
-
+                        // simplified from `(ib * n_dims + ic) * inv_ndims`
+                        float cur_rot = inv_ndims * ic - ib;
+
+                        float cos_theta, sin_theta;
+                        rope_yarn(
+                            theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                            &cos_theta, &sin_theta
+                        );
 
-
+                        theta_base *= theta_scale;
 
                        const int64_t i0 = ib*n_dims + ic/2;
 
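The rope kernels above read five int32 values from op_params slots 0-4 and then memcpy the float parameters out of slots 5-12. The snippet below is only an illustration of that layout, not ggml_rope_impl's actual packing code, and the parameter values are made up; it just shows the producer/consumer round trip through a plain int32_t buffer.

/* Illustrative op_params layout sketch; values are arbitrary examples. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    // producer side: 5 ints in slots 0-4, floats type-punned into slots 5-12
    int32_t op_params[13] = { 0 /*n_past*/, 128 /*n_dims*/, 0 /*mode*/, 4096 /*n_ctx*/, 2048 /*n_orig_ctx*/ };

    const float freq_base = 10000.0f, freq_scale = 0.5f, ext_factor = 1.0f,
                attn_factor = 1.0f, beta_fast = 32.0f, beta_slow = 1.0f;
    memcpy(op_params +  5, &freq_base,   sizeof(float));
    memcpy(op_params +  6, &freq_scale,  sizeof(float));
    memcpy(op_params +  7, &ext_factor,  sizeof(float));
    memcpy(op_params +  8, &attn_factor, sizeof(float));
    memcpy(op_params +  9, &beta_fast,   sizeof(float));
    memcpy(op_params + 10, &beta_slow,   sizeof(float));

    // consumer side: same pattern as ggml_compute_forward_rope_f32 above
    float out_freq_scale, out_beta_fast;
    memcpy(&out_freq_scale, op_params + 6, sizeof(float));
    memcpy(&out_beta_fast,  op_params + 9, sizeof(float));
    printf("n_orig_ctx=%d freq_scale=%g beta_fast=%g\n",
           (int) op_params[4], out_freq_scale, out_beta_fast);
    return 0;
}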
@@ -13508,15 +11135,19 @@ static void ggml_compute_forward_rope_f16(
         return;
     }
 
-    float freq_base;
-    float freq_scale;
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
-    //const int n_past
-    const int n_dims
-    const int mode
-    const int n_ctx
-
-    memcpy(&
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -13544,6 +11175,9 @@ static void ggml_compute_forward_rope_f16(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.f/n_dims;
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -13557,18 +11191,18 @@ static void ggml_compute_forward_rope_f16(
            if (ir++ < ir0) continue;
            if (ir > ir1) break;
 
-            float
+            float theta_base = (float)p;
 
            if (is_glm) {
-
+                theta_base = MIN(p, n_ctx - 2);
                float block_theta = MAX(p - (n_ctx - 2), 0);
                for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                    const float cos_theta = cosf(
-                    const float sin_theta = sinf(
+                    const float cos_theta = cosf(theta_base);
+                    const float sin_theta = sinf(theta_base);
                    const float cos_block_theta = cosf(block_theta);
                    const float sin_block_theta = sinf(block_theta);
 
-
+                    theta_base *= theta_scale;
                    block_theta *= theta_scale;
 
                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -13586,10 +11220,12 @@ static void ggml_compute_forward_rope_f16(
                }
            } else if (!is_neox) {
                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-
-
+                    float cos_theta, sin_theta;
+                    rope_yarn(
+                        theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
+                    );
 
-
+                    theta_base *= theta_scale;
 
                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                    ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -13603,12 +11239,19 @@ static void ggml_compute_forward_rope_f16(
            } else {
                // TODO: this might be wrong for ne0 != n_dims - need double check
                // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                theta_base *= freq_scale;
                for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                    for (int64_t ic = 0; ic < n_dims; ic += 2) {
-
-
+                        // simplified from `(ib * n_dims + ic) * inv_ndims`
+                        float cur_rot = inv_ndims * ic - ib;
+
+                        float cos_theta, sin_theta;
+                        rope_yarn(
+                            theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                            &cos_theta, &sin_theta
+                        );
 
-
+                        theta_base *= theta_scale;
 
                        const int64_t i0 = ib*n_dims + ic/2;
 
@@ -13716,17 +11359,18 @@ static void ggml_compute_forward_rope_back_f32(
            if (ir++ < ir0) continue;
            if (ir > ir1) break;
 
-            float
+            float theta_base = freq_scale * (float)p;
 
            if (!is_neox) {
                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                    const float cos_theta = cosf(
-                    const float sin_theta = sinf(
+                    const float cos_theta = cosf(theta_base);
+                    const float sin_theta = sinf(theta_base);
+
                    // zeta scaling for xPos only:
                    float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                    if (xpos_down) zeta = 1.0f / zeta;
 
-
+                    theta_base *= theta_scale;
 
                    const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                    float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -13740,10 +11384,10 @@ static void ggml_compute_forward_rope_back_f32(
            } else {
                for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                    for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                        const float cos_theta = cosf(
-                        const float sin_theta = sinf(
+                        const float cos_theta = cosf(theta_base);
+                        const float sin_theta = sinf(theta_base);
 
-
+                        theta_base *= theta_scale;
 
                        const int64_t i0 = ib*n_dims + ic/2;
 
@@ -13816,14 +11460,14 @@ static void ggml_compute_forward_rope_back_f16(
            if (ir++ < ir0) continue;
            if (ir > ir1) break;
 
-            float
+            float theta_base = (float)p;
 
            if (!is_neox) {
                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                    const float cos_theta = cosf(
-                    const float sin_theta = sinf(
+                    const float cos_theta = cosf(theta_base);
+                    const float sin_theta = sinf(theta_base);
 
-
+                    theta_base *= theta_scale;
 
                    const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                    ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -13837,10 +11481,10 @@ static void ggml_compute_forward_rope_back_f16(
            } else {
                for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                    for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                        const float cos_theta = cosf(
-                        const float sin_theta = sinf(
+                        const float cos_theta = cosf(theta_base);
+                        const float sin_theta = sinf(theta_base);
 
-
+                        theta_base *= theta_scale;
 
                        const int64_t i0 = ib*n_dims + ic/2;
 
@@ -15285,7 +12929,7 @@ static void ggml_compute_forward_flash_attn_f32(
 #else
                        ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                        memcpy(&scvt[j], &s, sizeof(uint16_t));
-                        const float val = GGML_FP16_TO_FP32(
+                        const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
 #endif
                        sump[j] += (ggml_float)val;
                        SS[j] = val;
@@ -15487,7 +13131,7 @@ static void ggml_compute_forward_flash_attn_f16(
                    } else {
                        ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                        memcpy(&scvt[j], &s, sizeof(uint16_t));
-                        const float val = GGML_FP16_TO_FP32(
+                        const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
                        sump[j] += (ggml_float)val;
                        SS[j] = val;
                    }
@@ -15938,7 +13582,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
 #else
                            ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(
+                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
 #endif
                            sump[j] += (ggml_float)val;
                            SW[j] = val;
@@ -16688,7 +14332,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
 #else
                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(
+                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
 #endif
                sum += (ggml_float)val;
                st[i] = val;
@@ -16802,7 +14446,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #else
                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(
+                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
 #endif
                sum += (ggml_float)val;
                ds0[i] = val;
@@ -17965,9 +15609,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                        src1,
                        n_dims,
                        mode,
+                        0,
                        n_ctx,
                        freq_base,
                        freq_scale,
+                        0.0f,
+                        1.0f,
+                        0.0f,
+                        0.0f,
                        xpos_base,
                        xpos_down,
                        false),
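In the hunk above, the backward pass fills the new ggml_rope_back arguments with neutral values (n_orig_ctx = 0, ext_factor = 0.0f, attn_factor = 1.0f, beta_fast = beta_slow = 0.0f). A quick sanity check against the rope_yarn helper shown earlier, written out as a one-line derivation:

$$\text{ext\_factor} = 0 \;\Rightarrow\; \theta = \theta_\text{interp} = \text{freq\_scale}\cdot\theta_\text{extrap}, \qquad \text{mscale} = \text{attn\_factor} = 1$$

so with these defaults the rotation degenerates to the pre-YaRN, linearly scaled RoPE that the existing backward formula assumes.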
@@ -21001,7 +18650,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
                result = ggml_quantize_q8_0(src + start, block, n, n, hist);
            } break;
-#ifdef GGML_USE_K_QUANTS
        case GGML_TYPE_Q2_K:
            {
                GGML_ASSERT(start % QK_K == 0);
@@ -21032,7 +18680,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
            } break;
-#endif
        case GGML_TYPE_F16:
            {
                int elemsize = sizeof(ggml_fp16_t);
@@ -21164,8 +18811,7 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
    return n == size;
 }
 
-
-static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
+static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n = 0;
    p->data = NULL;
 
@@ -21177,19 +18823,6 @@ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset
    return ok;
 }
 
-static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
-    p->n = 0;
-    p->data = NULL;
-
-    bool ok = true;
-
-    uint32_t n = 0;
-    ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
-    ok = ok && gguf_fread_el(file, p->data, p->n, offset);
-
-    return ok;
-}
-
 struct gguf_context * gguf_init_empty(void) {
    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -21248,20 +18881,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    ctx->data = NULL;
 
    ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
+    ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+    ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
 
    if (ctx->header.version == 1) {
-
-
-
-
-        ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
-        ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
-
-        ctx->header.n_tensors = n_tensors;
-        ctx->header.n_kv = n_kv;
-    } else {
-        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
+        fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
+        fclose(file);
+        gguf_free(ctx);
+        return NULL;
    }
 
    if (!ok) {
@@ -21272,12 +18899,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        }
    }
 
-    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-    bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
-    if (ctx->header.version == 1) {
-        gguf_fread_str = gguf_fread_str_v1;
-    }
-
    // read the kv pairs
    {
        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
@@ -21308,15 +18929,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            case GGUF_TYPE_ARRAY:
                {
                    ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-
-                    if (ctx->header.version == 1) {
-                        // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-                        uint32_t n = 0;
-                        ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
-                        kv->value.arr.n = n;
-                    } else {
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
-                    }
+                    ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
 
                    switch (kv->value.arr.type) {
                        case GGUF_TYPE_UINT8:
@@ -21375,14 +18988,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        ok = ok && gguf_fread_str(file, &info->name, &offset);
        ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
        for (uint32_t j = 0; j < info->n_dims; ++j) {
-
-            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-            uint32_t t = 0;
-            ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
-            info->ne[j] = t;
-        } else {
-            ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
-        }
+            ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
        }
        ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
        ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);