llama_cpp 0.0.7 → 0.1.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +736 -36
- data/ext/llama_cpp/src/ggml-cuda.h +8 -33
- data/ext/llama_cpp/src/ggml-opencl.c +202 -20
- data/ext/llama_cpp/src/ggml.c +732 -496
- data/ext/llama_cpp/src/ggml.h +47 -5
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +560 -147
- data/ext/llama_cpp/src/llama.h +71 -24
- data/lib/llama_cpp/client.rb +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +38 -3
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #define UNUSED(x) (void)(x)
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)

-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
@@ -188,9 +180,13 @@ typedef double ggml_float;
 #undef bool
 #define bool _Bool
 #else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
 #include <immintrin.h>
 #endif
 #endif
+#endif

 #ifdef __F16C__

@@ -330,7 +326,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB)
 static float table_f32_f16[1 << 16];

-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__wasm_simd128__)
 #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
 #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -370,6 +366,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return GGML_FP32_TO_FP16(x);
 }

+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
+    for (size_t i = 0; i < n; i++) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
+    size_t i = 0;
+#if defined(__F16C__)
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for(; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+
 //
 // timing
 //
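The two row converters added above are plain bulk f16/f32 conversions (with an F16C fast path on x86). A minimal usage sketch, assuming only the `ggml_fp32_to_fp16_row` / `ggml_fp16_to_fp32_row` declarations from ggml.h; buffer sizes and values are illustrative:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    float src[8] = {0.0f, 0.5f, 1.0f, -2.0f, 3.25f, -4.5f, 65504.0f, 1e-3f};
    ggml_fp16_t half[8];
    float back[8];

    ggml_fp32_to_fp16_row(src, half, 8);  // f32 -> f16, 8 elements
    ggml_fp16_to_fp32_row(half, back, 8); // f16 -> f32 round trip

    for (int i = 0; i < 8; i++) {
        printf("%g -> %g\n", src[i], back[i]); // values match within f16 precision
    }
    return 0;
}
```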
@@ -653,19 +675,102 @@ float vmaxvq_f32(float32x4_t v) {
 }

 int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
-
+    int8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
 }

 int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
-
+    int8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
 }

 uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
 }

 uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
+    int8x16_t res;
+
+    res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
+    res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
+    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
+
+    return res;
+}
+
+int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
+    int8x16_t res;
+
+    res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
+    res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
+    res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
+}
+
+uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
+    res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
+    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
+
+    return res;
+}
+
+uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
+    res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
+    res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
+}
+
+int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
 }

 #endif
@@ -694,14 +799,6 @@ typedef struct {
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");

-#define QK4_3 16
-typedef struct {
-    ggml_fp16_t d; // delta
-    ggml_fp16_t m; // min
-    uint8_t qs[QK4_3 / 2]; // nibbles / quants
-} block_q4_3;
-static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     ggml_fp16_t d; // delta
@@ -789,6 +886,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         float max = 0.0f;
         float min = 0.0f;

+        vector float asrcv [8];
         vector float srcv [8];
         vector float maxv[8];
         vector float minv[8];
@@ -1068,7 +1166,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
-            const v128_t vc =
+            const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));

             y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
             y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
@@ -1291,49 +1389,6 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
     quantize_row_q4_2_reference(x, y, k);
 }

-static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < QK4_3; l++) {
-            const float v = x[i*QK4_3 + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        for (int l = 0; l < QK4_3; l += 2) {
-            const float v0 = (x[i*QK4_3 + l + 0] - min)*id;
-            const float v1 = (x[i*QK4_3 + l + 1] - min)*id;
-
-            const uint8_t vi0 = (int) (v0 + 0.5f);
-            const uint8_t vi1 = (int) (v1 + 0.5f);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            y[i].qs[l/2] = vi0 | (vi1 << 4);
-        }
-    }
-}
-
-static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_3 == 0);
-
-    block_q4_3 * restrict y = vy;
-
-    quantize_row_q4_3_reference(x, y, k);
-}
-
 static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -1458,15 +1513,135 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
     }
 }

 static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
+    assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;

     block_q8_0 * restrict y = vy;

+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
+        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
+        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v = vmulq_n_f32(srcv[l], id);
+            const int32x4_t vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
+        // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    // scalar
     quantize_row_q8_0_reference(x, y, k);
+#endif
 }

 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
+    assert(QK8_1 == 32);
     assert(k % QK8_1 == 0);
     const int nb = k / QK8_1;

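For orientation, the SIMD paths added above implement the same Q8_0 scheme as the scalar reference: each 32-value block stores one float scale d = max|x| / 127 and 32 signed bytes. A scalar sketch of the idea; the struct name below is illustrative, not the ggml block type:

```c
#include <math.h>
#include <stdint.h>

// Illustrative block layout: ggml's block_q8_0 holds a float d and int8_t qs[32].
typedef struct { float d; int8_t qs[32]; } q8_0_block;

static void quantize_block_q8_0(const float *x, q8_0_block *out) {
    float amax = 0.0f;                      // max absolute value in the block
    for (int i = 0; i < 32; i++) {
        const float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    const float d  = amax / 127.0f;         // largest value maps to +/-127
    const float id = d ? 1.0f / d : 0.0f;
    out->d = d;
    for (int i = 0; i < 32; i++) {
        out->qs[i] = (int8_t) roundf(x[i] * id);
    }
}
```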
@@ -1917,36 +2092,6 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
     }
 }

-static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    const block_q4_3 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_3; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = vi0*d + m;
-            const float v1 = vi1*d + m;
-
-            y[i*QK4_3 + l + 0] = v0;
-            y[i*QK4_3 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK4_3 + l + 0]));
-            assert(!isnan(y[i*QK4_3 + l + 1]));
-        }
-    }
-}
-
 static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -1965,8 +2110,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
             const uint8_t vi = pp[l/2];

             // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (
-            const uint8_t vh1 = ((qh & (
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;

             const int8_t vi0 = (vi & 0x0F) | vh0;
             const int8_t vi1 = (vi >> 4) | vh1;
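The rewritten `vh0`/`vh1` expressions above pull the l-th bit out of the packed `qh` word and move it to bit 4, so OR-ing it with the low nibble yields the full 5-bit quant. A small stand-alone illustration of the same bit manipulation (names are illustrative):

```c
#include <stdint.h>

// Recombine a 4-bit low nibble with its high bit taken from the packed bitfield qh.
static inline uint8_t q5_recombine(uint8_t nibble, uint32_t qh, int l) {
    const uint8_t vh = ((qh & (1u << l)) >> l) << 4; // 0x00 or 0x10
    return (uint8_t)((nibble & 0x0F) | vh);          // 5-bit value in [0, 31]
}
```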
@@ -2002,8 +2147,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
             const uint8_t vi = pp[l/2];

             // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (
-            const uint8_t vh1 = ((qh & (
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;

             const uint8_t vi0 = (vi & 0x0F) | vh0;
             const uint8_t vi1 = (vi >> 4) | vh1;
@@ -2040,7 +2185,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2070,14 +2214,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q = ggml_vec_dot_q4_2_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
-    [GGML_TYPE_Q4_3] = {
-        .dequantize_row_q = dequantize_row_q4_3,
-        .quantize_row_q = quantize_row_q4_3,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference,
-        .quantize_row_q_dot = quantize_row_q8_1,
-        .vec_dot_q = ggml_vec_dot_q4_3_q8_1,
-        .vec_dot_type = GGML_TYPE_Q8_1,
-    },
     [GGML_TYPE_Q5_0] = {
         .dequantize_row_q = dequantize_row_q5_0,
         .quantize_row_q = quantize_row_q5_0,
@@ -2748,35 +2884,35 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);

+        // interleave
+        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
+        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
+
         // load y
         const int8x16_t v1_0l = vld1q_s8(y0->qs);
         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-        // interleave
-        const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
-        const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
-
 #if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0),
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0),
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);

         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
 #else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));

-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));

         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
@@ -3171,136 +3307,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
 #endif
 }

-static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_1;
-
-    assert(n % QK8_1 == 0);
-    assert(nb % 2 == 0);
-    assert(QK8_1 == 2*QK4_3);
-
-    const block_q4_3 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs0 = 0.0f;
-    float summs1 = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
-        const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
-
-        const block_q8_1 * restrict y0 = &y[i + 0];
-
-        summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0;
-        summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1;
-
-        const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, vdupq_n_u8(0x0F)));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-
-        const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
-        const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d);
-#endif
-    }
-
-    *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
-        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
-        const __m256 dx = _mm256_set_m128(d1, d0);
-
-        summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0
-               + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1;
-
-        const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
-        const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-        const __m256i bx = _mm256_set_m128i(bx1, bx0);
-
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[2*i + 0].qs;
-        const uint8_t * restrict x1 = x[2*i + 1].qs;
-        const int8_t * restrict y0 = y[i].qs;
-
-        const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
-        const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m);
-        const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
-        const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m);
-
-        int sxy_0 = 0;
-        int sxy_1 = 0;
-
-        for (int j = 0; j < QK8_1/4; j++) {
-            const uint8_t v0 = x0[j];
-            const uint8_t v1 = x1[j];
-
-            const int x0_0 = v0 & 0x0F;
-            const int x1_0 = v0 >> 4;
-
-            const int x0_1 = v1 & 0x0F;
-            const int x1_1 = v1 >> 4;
-
-            const int y0_0 = y0[2*j + 0];
-            const int y1_0 = y0[2*j + 1];
-
-            const int y0_1 = y0[2*(j + QK8_1/4) + 0];
-            const int y1_1 = y0[2*(j + QK8_1/4) + 1];
-
-            sxy_0 += x0_0*y0_0 + x1_0*y1_0;
-            sxy_1 += x0_1*y0_1 + x1_1*y1_1;
-        }
-
-        sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1;
-    }
-    *s = sumf;
-#endif
-}
-
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK8_0;

@@ -3373,6 +3379,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     }

     *s = vaddvq_f32(sumv);
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_0 * restrict x0 = &x[i];
+        const block_q8_0 * restrict y0 = &y[i];
+
+        const v128_t m4b = wasm_i8x16_splat(0x0F);
+        const v128_t s16b = wasm_i8x16_splat(0x10);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24) ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit and sub 16
+        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
+        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -3413,8 +3485,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_0/2; j++) {
             const uint8_t v0 = x0[j];

-            const int x0_0h = ((qh & (
-            const int x1_0h = ((qh & (
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;

             const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
             const int x1_0 = ((v0 >> 4) | x1_0h) - 16;
@@ -3504,6 +3576,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     }

     *s = vaddvq_f32(sumv) + summs;
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    float summs = 0.0f;
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_1 * restrict x0 = &x[i];
+        const block_q8_1 * restrict y0 = &y[i];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
+
+        const v128_t m4b = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24) ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        static bool x = true;
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit
+        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
+        const v128_t v0hf = wasm_v128_or(v0hz, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -3547,8 +3690,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_1/2; j++) {
             const uint8_t v0 = x0[j];

-            const int x0_0h = ((qh & (
-            const int x1_0h = ((qh & (
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;

             const int x0_0 = (v0 & 0x0F) | x0_0h;
             const int x1_0 = (v0 >> 4) | x1_0h;
@@ -3925,7 +4068,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = QK4_0,
     [GGML_TYPE_Q4_1] = QK4_1,
     [GGML_TYPE_Q4_2] = QK4_2,
-    [GGML_TYPE_Q4_3] = QK4_3,
     [GGML_TYPE_Q5_0] = QK5_0,
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
@@ -3942,7 +4084,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
     [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
     [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
-    [GGML_TYPE_Q4_3] = sizeof(block_q4_3),
     [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3960,7 +4101,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = "q4_0",
     [GGML_TYPE_Q4_1] = "q4_1",
    [GGML_TYPE_Q4_2] = "q4_2",
-    [GGML_TYPE_Q4_3] = "q4_3",
     [GGML_TYPE_Q5_0] = "q5_0",
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
@@ -3977,7 +4117,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = true,
     [GGML_TYPE_Q4_1] = true,
     [GGML_TYPE_Q4_2] = true,
-    [GGML_TYPE_Q4_3] = true,
     [GGML_TYPE_Q5_0] = true,
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
@@ -4024,6 +4163,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "DIAG_MASK_INF",
     "SOFT_MAX",
     "ROPE",
+    "ALIBI",
     "CONV_1D_1S",
     "CONV_1D_2S",

@@ -4034,7 +4174,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "MAP_BINARY",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -4072,6 +4212,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "diag_mask_inf(x)",
     "soft_max(x)",
     "rope(x)",
+    "alibi(x)",
     "conv_1d_1s(x)",
     "conv_1d_2s(x)",

@@ -4082,7 +4223,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x,y)",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");

 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4252,6 +4393,27 @@ bool ggml_is_quantized(enum ggml_type type) {
     return GGML_IS_QUANTIZED[type];
 }

+enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
+    enum ggml_type wtype = GGML_TYPE_COUNT;
+
+    switch (ftype) {
+        case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
+        case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
+        case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+    }
+
+    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
+
+    return wtype;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
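The new `ggml_ftype_to_ggml_type()` maps a whole-file type tag to the element type used for most tensors in that file. A hedged usage sketch; the scenario (picking the weight type for a "mostly Q5_1" file) is illustrative:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    const enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q5_1;
    const enum ggml_type  wtype = ggml_ftype_to_ggml_type(ftype);

    if (wtype == GGML_TYPE_Q5_1) {
        printf("weights will be stored as q5_1 blocks\n");
    }
    return 0;
}
```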
@@ -4362,12 +4524,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }

-
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS)
         ggml_init_cublas();
-
+#elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-
+#endif

         is_first_call = false;
     }
@@ -4448,7 +4609,7 @@ void ggml_free(struct ggml_context * ctx) {
 }

 size_t ggml_used_mem(const struct ggml_context * ctx) {
-    return ctx->objects_end->offs + ctx->objects_end->size;
+    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
 }

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
@@ -4561,6 +4722,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_cycles =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
+        /*.name =*/ { 0 },
         /*.pad =*/ { 0 },
     };

@@ -4915,6 +5077,15 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }

+const char * ggml_get_name(const struct ggml_tensor * tensor) {
+    return tensor->name;
+}
+
+void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
+    strncpy(tensor->name, name, sizeof(tensor->name));
+    tensor->name[sizeof(tensor->name) - 1] = '\0';
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
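`ggml_set_name()` copies the string into the fixed-size `name` field added to `ggml_tensor` (truncating and NUL-terminating), and `ggml_get_name()` returns a pointer to it; the hunks below use it to label the diag-mask and RoPE parameter tensors. A minimal sketch, assuming the usual `ggml_init_params` fields (mem_size, mem_buffer, no_alloc):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_set_name(w, "layers.0.attention.wq.weight"); // truncated to the name field size
    printf("tensor name: %s\n", ggml_get_name(w));

    ggml_free(ctx);
    return 0;
}
```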
@@ -6014,6 +6185,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
     //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
     struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
+    ggml_set_name(b, "n_past");

     result->op = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6071,6 +6243,7 @@ struct ggml_tensor * ggml_rope(
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ggml_set_name(b, "n_past, n_dims, mode");

     result->op = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6080,6 +6253,37 @@ struct ggml_tensor * ggml_rope(
     return result;
 }

+// ggml_alibi
+
+struct ggml_tensor * ggml_alibi(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past,
+        int n_head) {
+    GGML_ASSERT(n_past >= 0);
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    // TODO: when implement backward, fix this:
+    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ((int32_t *) b->data)[0] = n_past;
+    ((int32_t *) b->data)[1] = n_head;
+
+    result->op = GGML_OP_ALIBI;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
 // ggml_conv_1d_1s

 struct ggml_tensor * ggml_conv_1d_1s(
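`ggml_alibi()` is wired like `ggml_rope()`: it returns a view of `a` tagged with `GGML_OP_ALIBI` and packs its two integer parameters into an I32 tensor in `src1`. A hedged sketch of how a graph might apply it to an attention-score tensor; the shapes, sizes, and graph-building calls below reflect the ggml API of this era as I understand it and are illustrative only:

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(params);

    // pretend attention scores for 4 heads over 8 positions
    struct ggml_tensor * kq = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 8, 4);

    struct ggml_tensor * kq_biased = ggml_alibi(ctx, kq, /*n_past=*/0, /*n_head=*/4);
    struct ggml_tensor * kq_soft   = ggml_soft_max(ctx, kq_biased);

    struct ggml_cgraph gf = ggml_build_forward(kq_soft);
    ggml_graph_compute(ctx, &gf);

    ggml_free(ctx);
    return 0;
}
```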
@@ -7199,7 +7403,6 @@ static void ggml_compute_forward_add(
|
|
7199
7403
|
case GGML_TYPE_Q4_0:
|
7200
7404
|
case GGML_TYPE_Q4_1:
|
7201
7405
|
case GGML_TYPE_Q4_2:
|
7202
|
-
case GGML_TYPE_Q4_3:
|
7203
7406
|
case GGML_TYPE_Q5_0:
|
7204
7407
|
case GGML_TYPE_Q5_1:
|
7205
7408
|
case GGML_TYPE_Q8_0:
|
@@ -8108,7 +8311,7 @@ static void ggml_compute_forward_rms_norm(
|
|
8108
8311
|
|
8109
8312
|
// ggml_compute_forward_mul_mat
|
8110
8313
|
|
8111
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
|
8314
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
8112
8315
|
// helper function to determine if it is better to use BLAS or not
|
8113
8316
|
// for large matrices, BLAS is faster
|
8114
8317
|
static bool ggml_compute_forward_mul_mat_use_blas(
|
@@ -8125,7 +8328,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|
8125
8328
|
|
8126
8329
|
// TODO: find the optimal values for these
|
8127
8330
|
if (ggml_is_contiguous(src0) &&
|
8128
|
-
ggml_is_contiguous(src1) &&
|
8331
|
+
ggml_is_contiguous(src1) &&
|
8332
|
+
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
8129
8333
|
|
8130
8334
|
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
8131
8335
|
return true;
|
@@ -8133,7 +8337,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|
8133
8337
|
|
8134
8338
|
return false;
|
8135
8339
|
}
|
8136
|
-
|
8137
8340
|
#endif
|
8138
8341
|
|
8139
8342
|
static void ggml_compute_forward_mul_mat_f32(
|
@@ -8149,7 +8352,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
8149
8352
|
const int64_t ne02 = src0->ne[2];
|
8150
8353
|
const int64_t ne03 = src0->ne[3];
|
8151
8354
|
|
8152
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
|
8355
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
8153
8356
|
const int64_t ne10 = src1->ne[0];
|
8154
8357
|
#endif
|
8155
8358
|
const int64_t ne11 = src1->ne[1];
|
@@ -8206,7 +8409,16 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
8206
8409
|
// nb01 >= nb00 - src0 is not transposed
|
8207
8410
|
// compute by src0 rows
|
8208
8411
|
|
8209
|
-
#if defined(
|
8412
|
+
#if defined(GGML_USE_CUBLAS)
|
8413
|
+
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
8414
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
8415
|
+
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
8416
|
+
}
|
8417
|
+
return;
|
8418
|
+
}
|
8419
|
+
#endif
|
8420
|
+
|
8421
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
8210
8422
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
8211
8423
|
if (params->ith != 0) {
|
8212
8424
|
return;
|
@@ -8220,42 +8432,13 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
8220
8432
|
return;
|
8221
8433
|
}
|
8222
8434
|
|
8223
|
-
#if defined(GGML_USE_CUBLAS)
|
8224
|
-
const float alpha = 1.0f;
|
8225
|
-
const float beta = 0.0f;
|
8226
|
-
const int x_ne = ne01 * ne10;
|
8227
|
-
const int y_ne = ne11 * ne10;
|
8228
|
-
const int d_ne = ne11 * ne01;
|
8229
|
-
|
8230
|
-
size_t x_size, y_size, d_size;
|
8231
|
-
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
|
8232
|
-
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
|
8233
|
-
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
|
8234
|
-
#endif
|
8235
|
-
|
8236
8435
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
8237
8436
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
8238
8437
|
const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
|
8239
8438
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
8240
|
-
|
8241
8439
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
8242
8440
|
|
8243
|
-
#if defined(
|
8244
|
-
// copy data to device
|
8245
|
-
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
|
8246
|
-
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
|
8247
|
-
|
8248
|
-
// compute
|
8249
|
-
CUBLAS_CHECK(
|
8250
|
-
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
|
8251
|
-
ne01, ne11, ne10,
|
8252
|
-
&alpha, d_X, ne00,
|
8253
|
-
d_Y, ne10,
|
8254
|
-
&beta, d_D, ne01));
|
8255
|
-
|
8256
|
-
// copy data to host
|
8257
|
-
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
8258
|
-
#elif defined(GGML_USE_CLBLAST)
|
8441
|
+
#if defined(GGML_USE_CLBLAST)
|
8259
8442
|
// zT = y * xT
|
8260
8443
|
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
8261
8444
|
ne11, ne01, ne10,
|
@@ -8272,12 +8455,6 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
8272
8455
|
#endif
|
8273
8456
|
}
|
8274
8457
|
}
|
8275
|
-
#if defined(GGML_USE_CUBLAS)
|
8276
|
-
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
|
8277
|
-
ggml_cuda_pool_free(d_X, x_size);
|
8278
|
-
ggml_cuda_pool_free(d_Y, y_size);
|
8279
|
-
ggml_cuda_pool_free(d_D, d_size);
|
8280
|
-
#endif
|
8281
8458
|
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
8282
8459
|
|
8283
8460
|
return;
|
@@ -8407,7 +8584,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
8407
8584
|
// nb01 >= nb00 - src0 is not transposed
|
8408
8585
|
// compute by src0 rows
|
8409
8586
|
|
8410
|
-
#if defined(
|
8587
|
+
#if defined(GGML_USE_CUBLAS)
|
8588
|
+
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
8589
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
8590
|
+
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
8591
|
+
}
|
8592
|
+
return;
|
8593
|
+
}
|
8594
|
+
#endif
|
8595
|
+
|
8596
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
8411
8597
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
8412
8598
|
GGML_ASSERT(nb10 == sizeof(float));
|
8413
8599
|
|
@@ -8423,35 +8609,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
8423
8609
|
return;
|
8424
8610
|
}
|
8425
8611
|
|
8426
|
-
#if defined(GGML_USE_CUBLAS)
|
8427
|
-
ggml_fp16_t * const wdata = params->wdata;
|
8428
|
-
|
8429
|
-
const float alpha = 1.0f;
|
8430
|
-
const float beta = 0.0f;
|
8431
|
-
const int x_ne = ne01 * ne10;
|
8432
|
-
const int y_ne = ne11 * ne10;
|
8433
|
-
const int d_ne = ne11 * ne01;
|
8434
|
-
|
8435
|
-
size_t x_size, y_size, d_size;
|
8436
|
-
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
|
8437
|
-
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
|
8438
|
-
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
|
8439
|
-
#else
|
8440
|
-
float * const wdata = params->wdata;
|
8441
|
-
#endif
|
8442
8612
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
8443
8613
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
8444
|
-
|
8445
|
-
// with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
|
8446
|
-
{
|
8447
|
-
size_t id = 0;
|
8448
|
-
for (int64_t i01 = 0; i01 < ne11; ++i01) {
|
8449
|
-
for (int64_t i00 = 0; i00 < ne10; ++i00) {
|
8450
|
-
wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
|
8451
|
-
}
|
8452
|
-
}
|
8453
|
-
}
|
8454
|
-
#else
|
8614
|
+
float * const wdata = params->wdata;
|
8455
8615
|
{
|
8456
8616
|
size_t id = 0;
|
8457
8617
|
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
@@ -8459,32 +8619,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
8459
8619
|
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
8460
8620
|
}
|
8461
8621
|
}
|
8462
|
-
}
|
8463
|
-
#endif
|
8464
|
-
|
8465
|
-
#if defined(GGML_USE_CUBLAS)
|
8466
|
-
const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
|
8467
|
-
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
|
8468
8622
|
|
8469
|
-
|
8623
|
+
assert(id*sizeof(float) <= params->wsize);
|
8624
|
+
}
|
8470
8625
|
|
8471
|
-
|
8472
|
-
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
|
8473
|
-
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
|
8474
|
-
|
8475
|
-
// compute
|
8476
|
-
CUBLAS_CHECK(
|
8477
|
-
cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
|
8478
|
-
ne01, ne11, ne10,
|
8479
|
-
&alpha, d_X, CUDA_R_16F, ne00,
|
8480
|
-
d_Y, CUDA_R_16F, ne10,
|
8481
|
-
&beta, d_D, CUDA_R_32F, ne01,
|
8482
|
-
CUBLAS_COMPUTE_32F,
|
8483
|
-
CUBLAS_GEMM_DEFAULT));
|
8484
|
-
|
8485
|
-
// copy data to host
|
8486
|
-
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
8487
|
-
#elif defined(GGML_USE_CLBLAST)
|
8626
|
+
#if defined(GGML_USE_CLBLAST)
|
8488
8627
|
const float * x = wdata;
|
8489
8628
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
8490
8629
|
|
@@ -8513,12 +8652,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
8513
8652
|
}
|
8514
8653
|
}
|
8515
8654
|
|
8516
|
-
#if defined(GGML_USE_CUBLAS)
|
8517
|
-
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
|
8518
|
-
ggml_cuda_pool_free(d_X, x_size);
|
8519
|
-
ggml_cuda_pool_free(d_Y, y_size);
|
8520
|
-
ggml_cuda_pool_free(d_D, d_size);
|
8521
|
-
#endif
|
8522
8655
|
/*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
|
8523
8656
|
|
8524
8657
|
return;
|
@@ -8671,7 +8804,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
8671
8804
|
// nb01 >= nb00 - src0 is not transposed
|
8672
8805
|
// compute by src0 rows
|
8673
8806
|
|
8674
|
-
#if defined(
|
8807
|
+
#if defined(GGML_USE_CUBLAS)
|
8808
|
+
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
8809
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
8810
|
+
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
8811
|
+
}
|
8812
|
+
return;
|
8813
|
+
}
|
8814
|
+
#endif
|
8815
|
+
|
8816
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
8675
8817
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
8676
8818
|
if (params->ith != 0) {
|
8677
8819
|
return;
|
@@ -8685,48 +8827,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
             return;
         }
 
-#if defined(GGML_USE_CUBLAS)
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
-        const int y_ne = ne11 * ne10;
-        const int d_ne = ne11 * ne01;
-
-        size_t x_size, y_size, d_size, q_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
-
-        void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
-        if (type == GGML_TYPE_Q4_0) {
-            dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_1) {
-            dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_2) {
-            dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_3) {
-            dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_0) {
-            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_1) {
-            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q8_0) {
-            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
-#elif !defined(GGML_USE_CLBLAST)
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-#endif
 
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8734,15 +8836,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(
-                // copy and dequantize on device
-                CUDA_CHECK(
-                    cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
-                        GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream));
-
-                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
-                CUDA_CHECK(cudaGetLastError());
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                 const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
                 {
@@ -8751,26 +8845,14 @@ static void ggml_compute_forward_mul_mat_q_f32(
                        dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                        id += ne00;
                    }
+
+                    assert(id*sizeof(float) <= params->wsize);
                }
+
                const float * x = wdata;
 #endif
 
-
-#if defined(GGML_USE_CUBLAS)
-                // copy data to device
-                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
-
-                // compute
-                CUBLAS_CHECK(
-                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha, d_X, ne00,
-                                    d_Y, ne10,
-                            &beta,  d_D, ne01));
-
-                // copy data to host
-                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                // zT = y * xT
                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
                        ne11, ne01, ne10,
@@ -8788,13 +8870,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
         }
     }
 
-#if defined(GGML_USE_CUBLAS)
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
-    ggml_cuda_pool_free(d_X, x_size);
-    ggml_cuda_pool_free(d_Y, y_size);
-    ggml_cuda_pool_free(d_D, d_size);
-    ggml_cuda_pool_free(d_Q, q_size);
-#endif
     //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
     return;
@@ -8883,7 +8958,6 @@ static void ggml_compute_forward_mul_mat(
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -9115,7 +9189,6 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -9300,6 +9373,161 @@ static void ggml_compute_forward_soft_max(
     }
 }
 
+// ggml_compute_forward_alibi
+
+static void ggml_compute_forward_alibi_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(src1->type == GGML_TYPE_I32);
+    assert(ggml_nelements(src1) == 2);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n_past = ((int32_t *) src1->data)[0];
+    const int n_head = ((int32_t *) src1->data)[1];
+
+    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int ne1 = src0->ne[1]; // seq_len_without_past
+    //const int ne2 = src0->ne[2]; // n_head -> this is k
+    //const int ne3 = src0->ne[3]; // 1 -> bsz
+
+    const int n = ggml_nrows(src0);
+    const int ne2_ne3 = n/ne1; // ne2*ne3
+
+    const int nb0 = src0->nb[0];
+    const int nb1 = src0->nb[1];
+    const int nb2 = src0->nb[2];
+    //const int nb3 = src0->nb[3];
+
+    assert(nb0 == sizeof(float));
+    assert(ne1 + n_past == ne0); (void) n_past;
+
+    // add alibi to src0 (KQ_scaled)
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
+
+    for (int i = 0; i < ne0; i++) {
+        for (int j = 0; j < ne1; j++) {
+            for (int k = 0; k < ne2_ne3; k++) {
+                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
+
+                // TODO: k*nb2 or k*nb3
+
+                float m_k;
+
+                if (k < n_heads_log2_floor) {
+                    m_k = powf(m0, k + 1);
+                } else {
+                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+                }
+
+                pdst[0] = (j+1) * m_k + src[0];
+            }
+        }
+    }
+}
+
+
+static void ggml_compute_forward_alibi_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(src1->type == GGML_TYPE_I32);
+    assert(ggml_nelements(src1) == 2);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n_past = ((int32_t *) src1->data)[0];
+    const int n_head = ((int32_t *) src1->data)[1];
+
+    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int ne1 = src0->ne[1]; // seq_len_without_past
+    //const int ne2 = src0->ne[2]; // n_head -> this is k
+    //const int ne3 = src0->ne[3]; // 1 -> bsz
+
+    const int n = ggml_nrows(src0);
+    const int ne2_ne3 = n/ne1; // ne2*ne3
+
+    const int nb0 = src0->nb[0];
+    const int nb1 = src0->nb[1];
+    const int nb2 = src0->nb[2];
+    //const int nb3 = src0->nb[3];
+
+    assert(nb0 == sizeof(ggml_fp16_t));
+    assert(ne1 + n_past == ne0); (void) n_past;
+
+    // add alibi to src0 (KQ_scaled)
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
+
+    for (int i = 0; i < ne0; i++) {
+        for (int j = 0; j < ne1; j++) {
+            for (int k = 0; k < ne2_ne3; k++) {
+                ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+                float *            pdst =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
+
+                // TODO: k*nb2 or k*nb3
+
+                float m_k;
+
+                if (k < n_heads_log2_floor) {
+                    m_k = powf(m0, k + 1);
+                } else {
+                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+                }
+
+                // we return F32
+                pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_alibi(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_alibi_f16(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_alibi_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_2:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_rope
 
 static void ggml_compute_forward_rope_f32(
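The new alibi kernels reduce to a per-head slope m_k plus a per-column offset (j+1)*m_k. A small self-contained check of that arithmetic, assuming n_head = 8 (so n_heads_log2_floor is also 8); it mirrors the branch in ggml_compute_forward_alibi_f32 above and prints the bias that would be added at column j = 3:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int n_head = 8; /* assumption for the example */
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

        const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
        const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);

        for (int k = 0; k < n_head; k++) {
            /* same branch as the f32 kernel above */
            const float m_k = (k < n_heads_log2_floor)
                ? powf(m0, k + 1)
                : powf(m1, 2 * (k - n_heads_log2_floor) + 1);

            /* the bias added to the KQ_scaled entry at column j (0-based) is (j+1) * m_k */
            printf("head %d: slope %.6f, bias at j=3 -> %.6f\n", k, m_k, 4 * m_k);
        }
        return 0;
    }

Build with the math library (for example cc alibi.c -lm) to reproduce the slope sequence 2^-1, 2^-2, ... for the first n_heads_log2_floor heads.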
@@ -10938,6 +11166,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
             } break;
+        case GGML_OP_ALIBI:
+            {
+                ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
+            } break;
         case GGML_OP_CONV_1D_1S:
             {
                 ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -11140,6 +11372,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_ALIBI:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_SILU:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -11617,15 +11853,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         size_t cur = 0;
 
+#if defined(GGML_USE_CUBLAS)
+                        if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
+                            node->n_tasks = 1; // TODO: this actually is doing nothing
+                                               //       the threads are still spinning
+                            cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
+                        }
+                        else
+#endif
                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
+                                // here we need memory just for single 2D matrix from src0
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-                                //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
-                                //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
-                                //printf("cur = %zu\n", cur);
                             } else {
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
                             }
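For the F16 x F32 case above, the scratch estimate is either one 2-D slice of src0 promoted to F32 (BLAS path) or all of src1 converted to F16 (threaded path); the CUDA branch instead asks ggml_cuda_mul_mat_get_wsize. A rough sketch of the two CPU-side formulas, assuming hypothetical 4096x4096 and 4096x512 operands and using uint16_t as a stand-in for ggml_fp16_t:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* assumed shapes: src0 is F16 [4096, 4096], src1 is F32 [4096, 512] */
        const int64_t ne00 = 4096, ne01 = 4096;
        const int64_t ne10 = 4096, ne11 = 512;

        /* BLAS path: one 2-D matrix of src0 dequantized/promoted to f32 */
        const size_t cur_blas = sizeof(float) * (size_t)(ne00 * ne01);

        /* fallback path: all of src1 converted to f16 before the threaded dot products */
        const size_t cur_f16 = sizeof(uint16_t) * (size_t)(ne10 * ne11);

        printf("BLAS scratch: %zu bytes, f16 conversion scratch: %zu bytes\n",
               cur_blas, cur_f16);
        return 0;
    }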
@@ -11634,8 +11876,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+                            if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                                node->n_tasks = 1;
+                            }
+#endif
                         } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -11673,6 +11920,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 {
                     node->n_tasks = n_threads;
                 } break;
+            case GGML_OP_ALIBI:
+                {
+                    node->n_tasks = 1; //TODO
+                } break;
             case GGML_OP_CONV_1D_1S:
             case GGML_OP_CONV_1D_2S:
                 {
@@ -12060,10 +12311,16 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             snprintf(color, sizeof(color), "white");
         }
 
-        fprintf(fp, " \"%p\" [
-style = filled; fillcolor = %s; shape = record;
-label=\"
-                (void *) node, color
+        fprintf(fp, " \"%p\" [ "
+                    "style = filled; fillcolor = %s; shape = record; "
+                    "label=\"",
+                (void *) node, color);
+
+        if (strlen(node->name) > 0) {
+            fprintf(fp, "%s |", node->name);
+        }
+
+        fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
                 i, node->ne[0], node->ne[1],
                 GGML_OP_SYMBOL[node->op]);
 
@@ -12079,18 +12336,26 @@ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
 
         snprintf(color, sizeof(color), "pink");
 
+        fprintf(fp, " \"%p\" [ "
+                    "style = filled; fillcolor = %s; shape = record; "
+                    "label=\"<x>",
+                (void *) node, color);
+
+        if (strlen(node->name) > 0) {
+            fprintf(fp, "%s | ", node->name);
+        }
         if (ggml_nelements(node) == 1) {
-
-
-
-
-
-
-style = filled; fillcolor = %s; shape = record; \
-label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
-                (void *) node, color,
-                i, node->ne[0], node->ne[1]);
+            if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
+            }
+            else {
+                fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
+            }
         }
+        else {
+            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+        }
+        fprintf(fp, "\"; ]\n");
     }
 
     for (int i = 0; i < gb->n_nodes; i++) {
@@ -12889,29 +13154,6 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_2*sizeof(block_q4_2));
 }
 
-size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    for (int j = 0; j < n; j += k) {
-        block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
-
-        quantize_row_q4_3_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_3; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_3*sizeof(block_q4_3));
-}
-
 size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -12926,8 +13168,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
             memcpy(&qh, &y[i].qh, sizeof(qh));
 
             for (int l = 0; l < QK5_0; l += 2) {
-                const uint8_t vh0 = ((qh & (
-                const uint8_t vh1 = ((qh & (
+                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
                 // cast to 16 bins
                 const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
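The corrected lines above rebuild the fifth bit of each Q5_0 value from the packed qh word before folding the result into 16 histogram bins. A tiny standalone sketch of that bit manipulation, with an arbitrary qh value, one packed byte, and l = 4 chosen purely for illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint32_t qh = 0x000A5F3Cu; /* assumption: arbitrary packed high bits */
        const uint8_t  qs = 0x4B;        /* assumption: one byte holding two 4-bit values */

        const int l = 4; /* even element index within the block, as in the loop above */

        /* pull bit l (and l+1) out of qh and place it at bit position 4 */
        const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
        const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;

        /* recombine with the low nibbles, then fold 32 levels into 16 histogram bins */
        const uint8_t vi0 = ((qs & 0x0F) | vh0) / 2;
        const uint8_t vi1 = ((qs >> 4)   | vh1) / 2;

        printf("vi0 = %u, vi1 = %u (both < 16)\n", vi0, vi1);
        return 0;
    }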
@@ -12956,8 +13198,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
             memcpy(&qh, &y[i].qh, sizeof(qh));
 
             for (int l = 0; l < QK5_1; l += 2) {
-                const uint8_t vh0 = ((qh & (
-                const uint8_t vh1 = ((qh & (
+                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
                 // cast to 16 bins
                 const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
@@ -13014,12 +13256,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
                 result = ggml_quantize_q4_2(src + start, block, n, n, hist);
             } break;
-        case GGML_TYPE_Q4_3:
-            {
-                GGML_ASSERT(start % QK4_3 == 0);
-                block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
-                result = ggml_quantize_q4_3(src + start, block, n, n, hist);
-            } break;
         case GGML_TYPE_Q5_0:
             {
                 GGML_ASSERT(start % QK5_0 == 0);