llama_cpp 0.0.7 → 0.1.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +736 -36
- data/ext/llama_cpp/src/ggml-cuda.h +8 -33
- data/ext/llama_cpp/src/ggml-opencl.c +202 -20
- data/ext/llama_cpp/src/ggml.c +732 -496
- data/ext/llama_cpp/src/ggml.h +47 -5
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +560 -147
- data/ext/llama_cpp/src/llama.h +71 -24
- data/lib/llama_cpp/client.rb +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +38 -3
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c  CHANGED
@@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #define UNUSED(x) (void)(x)
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
@@ -188,9 +180,13 @@ typedef double ggml_float;
 #undef bool
 #define bool _Bool
 #else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
 #include <immintrin.h>
 #endif
 #endif
+#endif
 
 #ifdef __F16C__
 
@@ -330,7 +326,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB)
 static float table_f32_f16[1 << 16];
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__wasm_simd128__)
 #define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
 #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -370,6 +366,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return GGML_FP32_TO_FP16(x);
 }
 
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
+    for (size_t i = 0; i < n; i++) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
+    size_t i = 0;
+#if defined(__F16C__)
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for(; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+
 //
 // timing
 //
@@ -653,19 +675,102 @@ float vmaxvq_f32(float32x4_t v) {
 }
 
 int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
-
+    int8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
 }
 
 int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
-
+    int8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
 }
 
 uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
 }
 
 uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
+    int8x16_t res;
+
+    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
+    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
+    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
+    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
+
+    return res;
+}
+
+int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
+    int8x16_t res;
+
+    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
+    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
+    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
+}
+
+uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
+    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
+    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
+    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
+
+    return res;
+}
+
+uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
+    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
+    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
+}
+
+int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
 }
 
 #endif
@@ -694,14 +799,6 @@ typedef struct {
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 
-#define QK4_3 16
-typedef struct {
-    ggml_fp16_t d;         // delta
-    ggml_fp16_t m;         // min
-    uint8_t qs[QK4_3 / 2]; // nibbles / quants
-} block_q4_3;
-static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     ggml_fp16_t d;         // delta
@@ -789,6 +886,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
     float max = 0.0f;
     float min = 0.0f;
 
+    vector float asrcv [8];
     vector float srcv [8];
     vector float maxv[8];
     vector float minv[8];
@@ -1068,7 +1166,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
-            const v128_t vc =
+            const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
 
             y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
             y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
@@ -1291,49 +1389,6 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
     quantize_row_q4_2_reference(x, y, k);
 }
 
-static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < QK4_3; l++) {
-            const float v = x[i*QK4_3 + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        for (int l = 0; l < QK4_3; l += 2) {
-            const float v0 = (x[i*QK4_3 + l + 0] - min)*id;
-            const float v1 = (x[i*QK4_3 + l + 1] - min)*id;
-
-            const uint8_t vi0 = (int) (v0 + 0.5f);
-            const uint8_t vi1 = (int) (v1 + 0.5f);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            y[i].qs[l/2] = vi0 | (vi1 << 4);
-        }
-    }
-}
-
-static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_3 == 0);
-
-    block_q4_3 * restrict y = vy;
-
-    quantize_row_q4_3_reference(x, y, k);
-}
-
 static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -1458,15 +1513,135 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 }
 
 static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
+    assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
 
     block_q8_0 * restrict y = vy;
 
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
+        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
+        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+        // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    // scalar
     quantize_row_q8_0_reference(x, y, k);
+#endif
 }
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
+    assert(QK8_1 == 32);
     assert(k % QK8_1 == 0);
     const int nb = k / QK8_1;
 
@@ -1917,36 +2092,6 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
     }
 }
 
-static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    const block_q4_3 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_3; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = vi0*d + m;
-            const float v1 = vi1*d + m;
-
-            y[i*QK4_3 + l + 0] = v0;
-            y[i*QK4_3 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK4_3 + l + 0]));
-            assert(!isnan(y[i*QK4_3 + l + 1]));
-        }
-    }
-}
-
 static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -1965,8 +2110,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
             const uint8_t vi = pp[l/2];
 
             // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (
-            const uint8_t vh1 = ((qh & (
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
            const int8_t vi0 = (vi & 0x0F) | vh0;
            const int8_t vi1 = (vi >> 4) | vh1;
@@ -2002,8 +2147,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
            const uint8_t vi = pp[l/2];
 
            // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (
-            const uint8_t vh1 = ((qh & (
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
            const uint8_t vi0 = (vi & 0x0F) | vh0;
            const uint8_t vi1 = (vi >> 4) | vh1;
@@ -2040,7 +2185,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2070,14 +2214,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
        .vec_dot_q                = ggml_vec_dot_q4_2_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
     },
-    [GGML_TYPE_Q4_3] = {
-        .dequantize_row_q         = dequantize_row_q4_3,
-        .quantize_row_q           = quantize_row_q4_3,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference,
-        .quantize_row_q_dot       = quantize_row_q8_1,
-        .vec_dot_q                = ggml_vec_dot_q4_3_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
     [GGML_TYPE_Q5_0] = {
        .dequantize_row_q         = dequantize_row_q5_0,
        .quantize_row_q           = quantize_row_q5_0,
@@ -2748,35 +2884,35 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
 
+        // interleave
+        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
+        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
+
         // load y
         const int8x16_t v1_0l = vld1q_s8(y0->qs);
         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
-        // interleave
-        const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
-        const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
-
 #if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0),
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0),
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
 #else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
 
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
 
         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
@@ -3171,136 +3307,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
 #endif
 }
 
-static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_1;
-
-    assert(n % QK8_1 == 0);
-    assert(nb % 2 == 0);
-    assert(QK8_1 == 2*QK4_3);
-
-    const block_q4_3 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs0 = 0.0f;
-    float summs1 = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
-        const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
-
-        const block_q8_1 * restrict y0 = &y[i + 0];
-
-        summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0;
-        summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1;
-
-        const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, vdupq_n_u8(0x0F)));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-
-        const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
-        const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d);
-#endif
-    }
-
-    *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
-        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
-        const __m256 dx = _mm256_set_m128(d1, d0);
-
-        summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0
-               + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1;
-
-        const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
-        const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-        const __m256i bx = _mm256_set_m128i(bx1, bx0);
-
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[2*i + 0].qs;
-        const uint8_t * restrict x1 = x[2*i + 1].qs;
-        const int8_t  * restrict y0 = y[i].qs;
-
-        const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
-        const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m);
-        const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
-        const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m);
-
-        int sxy_0 = 0;
-        int sxy_1 = 0;
-
-        for (int j = 0; j < QK8_1/4; j++) {
-            const uint8_t v0 = x0[j];
-            const uint8_t v1 = x1[j];
-
-            const int x0_0 = v0 & 0x0F;
-            const int x1_0 = v0 >> 4;
-
-            const int x0_1 = v1 & 0x0F;
-            const int x1_1 = v1 >> 4;
-
-            const int y0_0 = y0[2*j + 0];
-            const int y1_0 = y0[2*j + 1];
-
-            const int y0_1 = y0[2*(j + QK8_1/4) + 0];
-            const int y1_1 = y0[2*(j + QK8_1/4) + 1];
-
-            sxy_0 += x0_0*y0_0 + x1_0*y1_0;
-            sxy_1 += x0_1*y0_1 + x1_1*y1_1;
-        }
-
-        sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1;
-    }
-    *s = sumf;
-#endif
-}
-
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK8_0;
 
@@ -3373,6 +3379,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv);
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_0 * restrict x0 = &x[i];
+        const block_q8_0 * restrict y0 = &y[i];
+
+        const v128_t m4b  = wasm_i8x16_splat(0x0F);
+        const v128_t s16b = wasm_i8x16_splat(0x10);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit and sub 16
+        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
+        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -3413,8 +3485,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_0/2; j++) {
             const uint8_t v0 = x0[j];
 
-            const int x0_0h = ((qh & (
-            const int x1_0h = ((qh & (
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
 
             const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
             const int x1_0 = ((v0 >> 4) | x1_0h) - 16;
@@ -3504,6 +3576,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv) + summs;
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    float summs = 0.0f;
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_1 * restrict x0 = &x[i];
+        const block_q8_1 * restrict y0 = &y[i];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
+
+        const v128_t m4b = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        static bool x = true;
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit
+        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
+        const v128_t v0hf = wasm_v128_or(v0hz, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -3547,8 +3690,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_1/2; j++) {
             const uint8_t v0 = x0[j];
 
-            const int x0_0h = ((qh & (
-            const int x1_0h = ((qh & (
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
 
             const int x0_0 = (v0 & 0x0F) | x0_0h;
             const int x1_0 = (v0 >> 4) | x1_0h;
@@ -3925,7 +4068,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = QK4_0,
     [GGML_TYPE_Q4_1] = QK4_1,
     [GGML_TYPE_Q4_2] = QK4_2,
-    [GGML_TYPE_Q4_3] = QK4_3,
    [GGML_TYPE_Q5_0] = QK5_0,
    [GGML_TYPE_Q5_1] = QK5_1,
    [GGML_TYPE_Q8_0] = QK8_0,
@@ -3942,7 +4084,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
    [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
-    [GGML_TYPE_Q4_3] = sizeof(block_q4_3),
    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3960,7 +4101,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = "q4_0",
    [GGML_TYPE_Q4_1] = "q4_1",
    [GGML_TYPE_Q4_2] = "q4_2",
-    [GGML_TYPE_Q4_3] = "q4_3",
    [GGML_TYPE_Q5_0] = "q5_0",
    [GGML_TYPE_Q5_1] = "q5_1",
    [GGML_TYPE_Q8_0] = "q8_0",
@@ -3977,7 +4117,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = true,
    [GGML_TYPE_Q4_1] = true,
    [GGML_TYPE_Q4_2] = true,
-    [GGML_TYPE_Q4_3] = true,
    [GGML_TYPE_Q5_0] = true,
    [GGML_TYPE_Q5_1] = true,
    [GGML_TYPE_Q8_0] = true,
@@ -4024,6 +4163,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
    "DIAG_MASK_INF",
    "SOFT_MAX",
    "ROPE",
+    "ALIBI",
    "CONV_1D_1S",
    "CONV_1D_2S",
 
@@ -4034,7 +4174,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
    "MAP_BINARY",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@@ -4072,6 +4212,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "diag_mask_inf(x)",
    "soft_max(x)",
    "rope(x)",
+    "alibi(x)",
    "conv_1d_1s(x)",
    "conv_1d_2s(x)",
 
@@ -4082,7 +4223,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "f(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4252,6 +4393,27 @@ bool ggml_is_quantized(enum ggml_type type) {
     return GGML_IS_QUANTIZED[type];
 }
 
+enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
+    enum ggml_type wtype = GGML_TYPE_COUNT;
+
+    switch (ftype) {
+        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
+        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
+        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
+        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
+        case GGML_FTYPE_MOSTLY_Q4_2:          wtype = GGML_TYPE_Q4_2;  break;
+        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
+        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
+        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+    }
+
+    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
+
+    return wtype;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
@@ -4362,12 +4524,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
-
-    #if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS)
         ggml_init_cublas();
-
+#elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-
+#endif
 
         is_first_call = false;
     }
@@ -4448,7 +4609,7 @@ void ggml_free(struct ggml_context * ctx) {
 }
 
 size_t ggml_used_mem(const struct ggml_context * ctx) {
-    return ctx->objects_end->offs + ctx->objects_end->size;
+    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
 }
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
@@ -4561,6 +4722,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_cycles  =*/ 0,
        /*.perf_time_us =*/ 0,
        /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
+        /*.name         =*/ { 0 },
        /*.pad          =*/ { 0 },
     };
 
@@ -4915,6 +5077,15 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
+const char * ggml_get_name(const struct ggml_tensor * tensor) {
+    return tensor->name;
+}
+
+void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
+    strncpy(tensor->name, name, sizeof(tensor->name));
+    tensor->name[sizeof(tensor->name) - 1] = '\0';
+}
+
 struct ggml_tensor * ggml_view_tensor(
        struct ggml_context * ctx,
        const struct ggml_tensor * src) {
@@ -6014,6 +6185,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
     //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
     struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
+    ggml_set_name(b, "n_past");
 
     result->op   = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6071,6 +6243,7 @@ struct ggml_tensor * ggml_rope(
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ggml_set_name(b, "n_past, n_dims, mode");
 
     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6080,6 +6253,37 @@ struct ggml_tensor * ggml_rope(
     return result;
 }
 
+// ggml_alibi
+
+struct ggml_tensor * ggml_alibi(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_head) {
+    GGML_ASSERT(n_past >= 0);
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    // TODO: when implement backward, fix this:
+    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ((int32_t *) b->data)[0] = n_past;
+    ((int32_t *) b->data)[1] = n_head;
+
+    result->op   = GGML_OP_ALIBI;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
 // ggml_conv_1d_1s
 
 struct ggml_tensor * ggml_conv_1d_1s(
@@ -7199,7 +7403,6 @@ static void ggml_compute_forward_add(
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q4_3:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
@@ -8108,7 +8311,7 @@ static void ggml_compute_forward_rms_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8125,7 +8328,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
+        ggml_is_contiguous(src1) &&
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
        return true;
@@ -8133,7 +8337,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
     return false;
 }
-
 #endif
 
 static void ggml_compute_forward_mul_mat_f32(
@@ -8149,7 +8352,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -8206,7 +8409,16 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(
+#if defined(GGML_USE_CUBLAS)
+    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8220,42 +8432,13 @@ static void ggml_compute_forward_mul_mat_f32(
            return;
        }
 
-#if defined(GGML_USE_CUBLAS)
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
-        const int y_ne = ne11 * ne10;
-        const int d_ne = ne11 * ne01;
-
-        size_t x_size, y_size, d_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#endif
-
        for (int64_t i03 = 0; i03 < ne03; i03++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(
-                // copy data to device
-                CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
-                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
-
-                // compute
-                CUBLAS_CHECK(
-                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha, d_X, ne00,
-                                    d_Y, ne10,
-                            &beta,  d_D, ne01));
-
-                // copy data to host
-                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                // zT = y * xT
                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
                        ne11, ne01, ne10,
@@ -8272,12 +8455,6 @@ static void ggml_compute_forward_mul_mat_f32(
 #endif
            }
        }
-#if defined(GGML_USE_CUBLAS)
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
-        ggml_cuda_pool_free(d_X, x_size);
-        ggml_cuda_pool_free(d_Y, y_size);
-        ggml_cuda_pool_free(d_D, d_size);
-#endif
        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
        return;
@@ -8407,7 +8584,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(
+#if defined(GGML_USE_CUBLAS)
+    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -8423,35 +8609,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
            return;
        }
 
-#if defined(GGML_USE_CUBLAS)
-        ggml_fp16_t * const wdata = params->wdata;
-
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
-        const int y_ne = ne11 * ne10;
-        const int d_ne = ne11 * ne01;
-
-        size_t x_size, y_size, d_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#else
-        float * const wdata = params->wdata;
-#endif
        for (int64_t i03 = 0; i03 < ne03; i03++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
-
-                // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
-                {
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne11; ++i01) {
-                        for (int64_t i00 = 0; i00 < ne10; ++i00) {
-                            wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
-                        }
-                    }
-                }
-#else
+                float * const wdata = params->wdata;
                {
                    size_t id = 0;
                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -8459,32 +8619,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                            wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                        }
                    }
-                }
-#endif
-
-#if defined(GGML_USE_CUBLAS)
-                const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
-                const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
 
-
+                    assert(id*sizeof(float) <= params->wsize);
+                }
 
-
-                CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
-                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
-
-                // compute
-                CUBLAS_CHECK(
-                    cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha, d_X, CUDA_R_16F, ne00,
-                                    d_Y, CUDA_R_16F, ne10,
-                            &beta,  d_D, CUDA_R_32F, ne01,
-                            CUBLAS_COMPUTE_32F,
-                            CUBLAS_GEMM_DEFAULT));
-
-                // copy data to host
-                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                const float * x = wdata;
                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -8513,12 +8652,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
            }
        }
 
-#if defined(GGML_USE_CUBLAS)
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
-        ggml_cuda_pool_free(d_X, x_size);
-        ggml_cuda_pool_free(d_Y, y_size);
-        ggml_cuda_pool_free(d_D, d_size);
-#endif
        /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
 
        return;
@@ -8671,7 +8804,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(
+#if defined(GGML_USE_CUBLAS)
+    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8685,48 +8827,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
            return;
        }
 
-#if defined(GGML_USE_CUBLAS)
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
-        const int y_ne = ne11 * ne10;
-        const int d_ne = ne11 * ne01;
-
-        size_t x_size, y_size, d_size, q_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
-
-        void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
-        if (type == GGML_TYPE_Q4_0) {
-            dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_1) {
-            dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_2) {
-            dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_3) {
-            dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_0) {
-            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_1) {
-            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q8_0) {
-            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
-#elif !defined(GGML_USE_CLBLAST)
        float * const wdata = params->wdata;
        dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-#endif
 
        for (int64_t i03 = 0; i03 < ne03; i03++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8734,15 +8836,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(
-                // copy and dequantize on device
-                CUDA_CHECK(
-                    cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
-                        GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream));
-
-                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
-                CUDA_CHECK(cudaGetLastError());
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
                {
@@ -8751,26 +8845,14 @@ static void ggml_compute_forward_mul_mat_q_f32(
                        dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                        id += ne00;
                    }
+
+                    assert(id*sizeof(float) <= params->wsize);
                }
+
                const float * x = wdata;
 #endif
 
-
-#if defined(GGML_USE_CUBLAS)
-                // copy data to device
-                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
-
-                // compute
-                CUBLAS_CHECK(
-                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha, d_X, ne00,
-                                    d_Y, ne10,
-                            &beta,  d_D, ne01));
-
-                // copy data to host
-                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                // zT = y * xT
                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
                        ne11, ne01, ne10,
@@ -8788,13 +8870,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
            }
        }
 
-#if defined(GGML_USE_CUBLAS)
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
-        ggml_cuda_pool_free(d_X, x_size);
-        ggml_cuda_pool_free(d_Y, y_size);
-        ggml_cuda_pool_free(d_D, d_size);
-        ggml_cuda_pool_free(d_Q, q_size);
-#endif
        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
8799
8874
|
|
8800
8875
|
return;
|
@@ -8883,7 +8958,6 @@ static void ggml_compute_forward_mul_mat(
|
|
8883
8958
|
case GGML_TYPE_Q4_0:
|
8884
8959
|
case GGML_TYPE_Q4_1:
|
8885
8960
|
case GGML_TYPE_Q4_2:
|
8886
|
-
case GGML_TYPE_Q4_3:
|
8887
8961
|
case GGML_TYPE_Q5_0:
|
8888
8962
|
case GGML_TYPE_Q5_1:
|
8889
8963
|
case GGML_TYPE_Q8_0:
|
@@ -9115,7 +9189,6 @@ static void ggml_compute_forward_get_rows(
|
|
9115
9189
|
case GGML_TYPE_Q4_0:
|
9116
9190
|
case GGML_TYPE_Q4_1:
|
9117
9191
|
case GGML_TYPE_Q4_2:
|
9118
|
-
case GGML_TYPE_Q4_3:
|
9119
9192
|
case GGML_TYPE_Q5_0:
|
9120
9193
|
case GGML_TYPE_Q5_1:
|
9121
9194
|
case GGML_TYPE_Q8_0:
|
@@ -9300,6 +9373,161 @@ static void ggml_compute_forward_soft_max(
|
|
9300
9373
|
}
|
9301
9374
|
}
|
9302
9375
|
|
9376
|
+
// ggml_compute_forward_alibi
|
9377
|
+
|
9378
|
+
static void ggml_compute_forward_alibi_f32(
|
9379
|
+
const struct ggml_compute_params * params,
|
9380
|
+
const struct ggml_tensor * src0,
|
9381
|
+
const struct ggml_tensor * src1,
|
9382
|
+
struct ggml_tensor * dst) {
|
9383
|
+
assert(params->ith == 0);
|
9384
|
+
assert(src1->type == GGML_TYPE_I32);
|
9385
|
+
assert(ggml_nelements(src1) == 2);
|
9386
|
+
|
9387
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9388
|
+
return;
|
9389
|
+
}
|
9390
|
+
|
9391
|
+
const int n_past = ((int32_t *) src1->data)[0];
|
9392
|
+
const int n_head = ((int32_t *) src1->data)[1];
|
9393
|
+
|
9394
|
+
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
9395
|
+
const int ne1 = src0->ne[1]; // seq_len_without_past
|
9396
|
+
//const int ne2 = src0->ne[2]; // n_head -> this is k
|
9397
|
+
//const int ne3 = src0->ne[3]; // 1 -> bsz
|
9398
|
+
|
9399
|
+
const int n = ggml_nrows(src0);
|
9400
|
+
const int ne2_ne3 = n/ne1; // ne2*ne3
|
9401
|
+
|
9402
|
+
const int nb0 = src0->nb[0];
|
9403
|
+
const int nb1 = src0->nb[1];
|
9404
|
+
const int nb2 = src0->nb[2];
|
9405
|
+
//const int nb3 = src0->nb[3];
|
9406
|
+
|
9407
|
+
assert(nb0 == sizeof(float));
|
9408
|
+
assert(ne1 + n_past == ne0); (void) n_past;
|
9409
|
+
|
9410
|
+
// add alibi to src0 (KQ_scaled)
|
9411
|
+
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
9412
|
+
|
9413
|
+
const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
|
9414
|
+
const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
|
9415
|
+
|
9416
|
+
for (int i = 0; i < ne0; i++) {
|
9417
|
+
for (int j = 0; j < ne1; j++) {
|
9418
|
+
for (int k = 0; k < ne2_ne3; k++) {
|
9419
|
+
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
|
9420
|
+
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
|
9421
|
+
|
9422
|
+
// TODO: k*nb2 or k*nb3
|
9423
|
+
|
9424
|
+
float m_k;
|
9425
|
+
|
9426
|
+
if (k < n_heads_log2_floor) {
|
9427
|
+
m_k = powf(m0, k + 1);
|
9428
|
+
} else {
|
9429
|
+
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
9430
|
+
}
|
9431
|
+
|
9432
|
+
pdst[0] = (j+1) * m_k + src[0];
|
9433
|
+
}
|
9434
|
+
}
|
9435
|
+
}
|
9436
|
+
}
|
9437
|
+
|
9438
|
+
|
9439
|
+
static void ggml_compute_forward_alibi_f16(
|
9440
|
+
const struct ggml_compute_params * params,
|
9441
|
+
const struct ggml_tensor * src0,
|
9442
|
+
const struct ggml_tensor * src1,
|
9443
|
+
struct ggml_tensor * dst) {
|
9444
|
+
assert(params->ith == 0);
|
9445
|
+
assert(src1->type == GGML_TYPE_I32);
|
9446
|
+
assert(ggml_nelements(src1) == 2);
|
9447
|
+
|
9448
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9449
|
+
return;
|
9450
|
+
}
|
9451
|
+
|
9452
|
+
const int n_past = ((int32_t *) src1->data)[0];
|
9453
|
+
const int n_head = ((int32_t *) src1->data)[1];
|
9454
|
+
|
9455
|
+
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
9456
|
+
const int ne1 = src0->ne[1]; // seq_len_without_past
|
9457
|
+
//const int ne2 = src0->ne[2]; // n_head -> this is k
|
9458
|
+
//const int ne3 = src0->ne[3]; // 1 -> bsz
|
9459
|
+
|
9460
|
+
const int n = ggml_nrows(src0);
|
9461
|
+
const int ne2_ne3 = n/ne1; // ne2*ne3
|
9462
|
+
|
9463
|
+
const int nb0 = src0->nb[0];
|
9464
|
+
const int nb1 = src0->nb[1];
|
9465
|
+
const int nb2 = src0->nb[2];
|
9466
|
+
//const int nb3 = src0->nb[3];
|
9467
|
+
|
9468
|
+
assert(nb0 == sizeof(ggml_fp16_t));
|
9469
|
+
assert(ne1 + n_past == ne0); (void) n_past;
|
9470
|
+
|
9471
|
+
// add alibi to src0 (KQ_scaled)
|
9472
|
+
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
9473
|
+
|
9474
|
+
const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
|
9475
|
+
const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
|
9476
|
+
|
9477
|
+
for (int i = 0; i < ne0; i++) {
|
9478
|
+
for (int j = 0; j < ne1; j++) {
|
9479
|
+
for (int k = 0; k < ne2_ne3; k++) {
|
9480
|
+
ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
|
9481
|
+
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
|
9482
|
+
|
9483
|
+
// TODO: k*nb2 or k*nb3
|
9484
|
+
|
9485
|
+
float m_k;
|
9486
|
+
|
9487
|
+
if (k < n_heads_log2_floor) {
|
9488
|
+
m_k = powf(m0, k + 1);
|
9489
|
+
} else {
|
9490
|
+
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
9491
|
+
}
|
9492
|
+
|
9493
|
+
// we return F32
|
9494
|
+
pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
|
9495
|
+
}
|
9496
|
+
}
|
9497
|
+
}
|
9498
|
+
}
|
9499
|
+
|
9500
|
+
static void ggml_compute_forward_alibi(
|
9501
|
+
const struct ggml_compute_params * params,
|
9502
|
+
const struct ggml_tensor * src0,
|
9503
|
+
const struct ggml_tensor * src1,
|
9504
|
+
struct ggml_tensor * dst) {
|
9505
|
+
switch (src0->type) {
|
9506
|
+
case GGML_TYPE_F16:
|
9507
|
+
{
|
9508
|
+
ggml_compute_forward_alibi_f16(params, src0, src1, dst);
|
9509
|
+
} break;
|
9510
|
+
case GGML_TYPE_F32:
|
9511
|
+
{
|
9512
|
+
ggml_compute_forward_alibi_f32(params, src0, src1, dst);
|
9513
|
+
} break;
|
9514
|
+
case GGML_TYPE_Q4_0:
|
9515
|
+
case GGML_TYPE_Q4_1:
|
9516
|
+
case GGML_TYPE_Q4_2:
|
9517
|
+
case GGML_TYPE_Q5_0:
|
9518
|
+
case GGML_TYPE_Q5_1:
|
9519
|
+
case GGML_TYPE_Q8_0:
|
9520
|
+
case GGML_TYPE_Q8_1:
|
9521
|
+
case GGML_TYPE_I8:
|
9522
|
+
case GGML_TYPE_I16:
|
9523
|
+
case GGML_TYPE_I32:
|
9524
|
+
case GGML_TYPE_COUNT:
|
9525
|
+
{
|
9526
|
+
GGML_ASSERT(false);
|
9527
|
+
} break;
|
9528
|
+
}
|
9529
|
+
}
|
9530
|
+
|
9303
9531
|
// ggml_compute_forward_rope
|
9304
9532
|
|
9305
9533
|
static void ggml_compute_forward_rope_f32(
|
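The new GGML_OP_ALIBI kernels above add a per-head linear bias to the scaled attention scores: head k is assigned a slope m_k, and column j of the score matrix receives (j+1)*m_k on top of its existing value. The slope schedule is small enough to show in isolation; the sketch below reuses the exact m0/m1/n_heads_log2_floor recipe from the diff and merely prints the slopes for an assumed 12-head model.

    #include <math.h>
    #include <stdio.h>

    // ALiBi slope for head k out of n_head heads, mirroring the schedule used by
    // ggml_compute_forward_alibi_f32: the first n_heads_log2_floor heads take
    // powers of m0, the remaining heads take odd powers of m1.
    static float alibi_slope(int k, int n_head) {
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

        const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
        const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);

        if (k < n_heads_log2_floor) {
            return powf(m0, k + 1);
        }
        return powf(m1, 2 * (k - n_heads_log2_floor) + 1);
    }

    int main(void) {
        const int n_head = 12; // example head count, not taken from the diff
        for (int k = 0; k < n_head; k++) {
            printf("head %2d: m_k = %.6f\n", k, alibi_slope(k, n_head));
        }
        return 0;
    }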
@@ -10938,6 +11166,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
10938
11166
|
{
|
10939
11167
|
ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
|
10940
11168
|
} break;
|
11169
|
+
case GGML_OP_ALIBI:
|
11170
|
+
{
|
11171
|
+
ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
|
11172
|
+
} break;
|
10941
11173
|
case GGML_OP_CONV_1D_1S:
|
10942
11174
|
{
|
10943
11175
|
ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
|
@@ -11140,6 +11372,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
11140
11372
|
{
|
11141
11373
|
GGML_ASSERT(false); // TODO: not implemented
|
11142
11374
|
} break;
|
11375
|
+
case GGML_OP_ALIBI:
|
11376
|
+
{
|
11377
|
+
GGML_ASSERT(false); // TODO: not implemented
|
11378
|
+
} break;
|
11143
11379
|
case GGML_OP_SILU:
|
11144
11380
|
{
|
11145
11381
|
GGML_ASSERT(false); // TODO: not implemented
|
@@ -11617,15 +11853,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
11617
11853
|
|
11618
11854
|
size_t cur = 0;
|
11619
11855
|
|
11856
|
+
#if defined(GGML_USE_CUBLAS)
|
11857
|
+
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
|
11858
|
+
node->n_tasks = 1; // TODO: this actually is doing nothing
|
11859
|
+
// the threads are still spinning
|
11860
|
+
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
11861
|
+
}
|
11862
|
+
else
|
11863
|
+
#endif
|
11620
11864
|
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
11621
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
|
11865
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
11622
11866
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
11623
11867
|
node->n_tasks = 1; // TODO: this actually is doing nothing
|
11624
11868
|
// the threads are still spinning
|
11869
|
+
// here we need memory just for single 2D matrix from src0
|
11625
11870
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
11626
|
-
//printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
|
11627
|
-
//printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
|
11628
|
-
//printf("cur = %zu\n", cur);
|
11629
11871
|
} else {
|
11630
11872
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
|
11631
11873
|
}
|
@@ -11634,8 +11876,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
11634
11876
|
#endif
|
11635
11877
|
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
11636
11878
|
cur = 0;
|
11879
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
11880
|
+
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
11881
|
+
node->n_tasks = 1;
|
11882
|
+
}
|
11883
|
+
#endif
|
11637
11884
|
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
11638
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
|
11885
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
11639
11886
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
11640
11887
|
node->n_tasks = 1;
|
11641
11888
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
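The planner changes above decide how much scratch memory each mat-mul node needs before the worker threads start: a CUDA-offloaded node asks ggml_cuda_mul_mat_get_wsize, a BLAS-accelerated F16 or quantized node needs room for one dequantized F32 slice of src0, and the plain CPU F16 path instead needs an F16 copy of src1. A toy mirror of that decision follows, with simplified tensor and flag types that are not the real ggml structures.

    #include <stddef.h>
    #include <stdint.h>

    struct toy_tensor {
        int64_t ne[4];        // number of elements per dimension
        int     is_f16;       // element type flags, simplified
        int     is_quantized;
    };

    // Rough mirror of the "cur" computation in the graph planning pass above.
    static size_t mul_mat_work_size(const struct toy_tensor * src0,
                                    const struct toy_tensor * src1,
                                    int use_blas) {
        if ((src0->is_f16 || src0->is_quantized) && use_blas) {
            // BLAS path: one F32 matrix holding a dequantized/converted 2D slice of src0
            return sizeof(float) * (size_t)(src0->ne[0] * src0->ne[1]);
        }
        if (src0->is_f16) {
            // plain CPU path: src1 is converted to F16 into the scratch buffer instead
            return sizeof(uint16_t) *
                   (size_t)(src1->ne[0] * src1->ne[1] * src1->ne[2] * src1->ne[3]);
        }
        // pure F32 x F32 needs no extra scratch here; the quantized CPU fallback
        // has its own sizing, which is omitted from this sketch
        return 0;
    }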
@@ -11673,6 +11920,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
11673
11920
|
{
|
11674
11921
|
node->n_tasks = n_threads;
|
11675
11922
|
} break;
|
11923
|
+
case GGML_OP_ALIBI:
|
11924
|
+
{
|
11925
|
+
node->n_tasks = 1; //TODO
|
11926
|
+
} break;
|
11676
11927
|
case GGML_OP_CONV_1D_1S:
|
11677
11928
|
case GGML_OP_CONV_1D_2S:
|
11678
11929
|
{
|
@@ -12060,10 +12311,16 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
12060
12311
|
snprintf(color, sizeof(color), "white");
|
12061
12312
|
}
|
12062
12313
|
|
12063
|
-
fprintf(fp, " \"%p\" [
|
12064
|
-
style = filled; fillcolor = %s; shape = record;
|
12065
|
-
label=\"
|
12066
|
-
(void *) node, color
|
12314
|
+
fprintf(fp, " \"%p\" [ "
|
12315
|
+
"style = filled; fillcolor = %s; shape = record; "
|
12316
|
+
"label=\"",
|
12317
|
+
(void *) node, color);
|
12318
|
+
|
12319
|
+
if (strlen(node->name) > 0) {
|
12320
|
+
fprintf(fp, "%s |", node->name);
|
12321
|
+
}
|
12322
|
+
|
12323
|
+
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
|
12067
12324
|
i, node->ne[0], node->ne[1],
|
12068
12325
|
GGML_OP_SYMBOL[node->op]);
|
12069
12326
|
|
@@ -12079,18 +12336,26 @@ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
|
|
12079
12336
|
|
12080
12337
|
snprintf(color, sizeof(color), "pink");
|
12081
12338
|
|
12339
|
+
fprintf(fp, " \"%p\" [ "
|
12340
|
+
"style = filled; fillcolor = %s; shape = record; "
|
12341
|
+
"label=\"<x>",
|
12342
|
+
(void *) node, color);
|
12343
|
+
|
12344
|
+
if (strlen(node->name) > 0) {
|
12345
|
+
fprintf(fp, "%s | ", node->name);
|
12346
|
+
}
|
12082
12347
|
if (ggml_nelements(node) == 1) {
|
12083
|
-
|
12084
|
-
|
12085
|
-
|
12086
|
-
|
12087
|
-
|
12088
|
-
|
12089
|
-
style = filled; fillcolor = %s; shape = record; \
|
12090
|
-
label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
|
12091
|
-
(void *) node, color,
|
12092
|
-
i, node->ne[0], node->ne[1]);
|
12348
|
+
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
12349
|
+
fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
|
12350
|
+
}
|
12351
|
+
else {
|
12352
|
+
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
|
12353
|
+
}
|
12093
12354
|
}
|
12355
|
+
else {
|
12356
|
+
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
12357
|
+
}
|
12358
|
+
fprintf(fp, "\"; ]\n");
|
12094
12359
|
}
|
12095
12360
|
|
12096
12361
|
for (int i = 0; i < gb->n_nodes; i++) {
|
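The two ggml_graph_dump_dot hunks above change the Graphviz output so that a tensor's name is printed in front of its record label and so that 1-element leaves show their actual value (integer, or a float in scientific notation) instead of a generic CONST entry. A minimal sketch of the leaf-label logic, using placeholder fields rather than the real ggml_tensor:

    #include <stdio.h>
    #include <string.h>

    // Writes one Graphviz "record" node for a leaf tensor: optional name, then
    // either the scalar value or a CONST entry with the shape. Field names here
    // are placeholders, not the ggml_tensor layout.
    static void dump_leaf(FILE * fp, const void * id, const char * name,
                          int n_elements, int is_integer, long ival, double fval,
                          int idx, long ne0, long ne1) {
        fprintf(fp, "  \"%p\" [ style = filled; fillcolor = pink; shape = record; label=\"<x>", id);

        if (name != NULL && strlen(name) > 0) {
            fprintf(fp, "%s | ", name);
        }
        if (n_elements == 1) {
            if (is_integer) {
                fprintf(fp, "%ld", ival);
            } else {
                fprintf(fp, "%.1e", fval);
            }
        } else {
            fprintf(fp, "CONST %d [%ld, %ld]", idx, ne0, ne1);
        }
        fprintf(fp, "\"; ]\n");
    }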
@@ -12889,29 +13154,6 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
|
|
12889
13154
|
return (n/QK4_2*sizeof(block_q4_2));
|
12890
13155
|
}
|
12891
13156
|
|
12892
|
-
size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) {
|
12893
|
-
assert(k % QK4_3 == 0);
|
12894
|
-
const int nb = k / QK4_3;
|
12895
|
-
|
12896
|
-
for (int j = 0; j < n; j += k) {
|
12897
|
-
block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
|
12898
|
-
|
12899
|
-
quantize_row_q4_3_reference(src + j, y, k);
|
12900
|
-
|
12901
|
-
for (int i = 0; i < nb; i++) {
|
12902
|
-
for (int l = 0; l < QK4_3; l += 2) {
|
12903
|
-
const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
|
12904
|
-
const uint8_t vi1 = y[i].qs[l/2] >> 4;
|
12905
|
-
|
12906
|
-
hist[vi0]++;
|
12907
|
-
hist[vi1]++;
|
12908
|
-
}
|
12909
|
-
}
|
12910
|
-
}
|
12911
|
-
|
12912
|
-
return (n/QK4_3*sizeof(block_q4_3));
|
12913
|
-
}
|
12914
|
-
|
12915
13157
|
size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
12916
13158
|
assert(k % QK5_0 == 0);
|
12917
13159
|
const int nb = k / QK5_0;
|
@@ -12926,8 +13168,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
|
|
12926
13168
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
12927
13169
|
|
12928
13170
|
for (int l = 0; l < QK5_0; l += 2) {
|
12929
|
-
const uint8_t vh0 = ((qh & (
|
12930
|
-
const uint8_t vh1 = ((qh & (
|
13171
|
+
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
|
13172
|
+
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
|
12931
13173
|
|
12932
13174
|
// cast to 16 bins
|
12933
13175
|
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
|
@@ -12956,8 +13198,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
|
|
12956
13198
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
12957
13199
|
|
12958
13200
|
for (int l = 0; l < QK5_1; l += 2) {
|
12959
|
-
const uint8_t vh0 = ((qh & (
|
12960
|
-
const uint8_t vh1 = ((qh & (
|
13201
|
+
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
|
13202
|
+
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
|
12961
13203
|
|
12962
13204
|
// cast to 16 bins
|
12963
13205
|
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
|
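Both quantize hunks above touch the histogram pass for the 5-bit formats: the fifth bit of each weight lives in the packed qh word, and the updated lines extract it with an unsigned 1u shift before OR-ing it back onto the low nibble. A small standalone sketch of that unpacking, assuming the pair-per-byte nibble layout the surrounding code uses:

    #include <stdint.h>

    // Recover the 5-bit quantized values of one 32-element block: the low 4 bits of
    // elements l and l+1 share a byte of qs[], and their 5th bits sit at bit
    // positions l and l+1 of the packed qh word.
    static void unpack_q5_block(const uint8_t qs[16], uint32_t qh, uint8_t out[32]) {
        for (int l = 0; l < 32; l += 2) {
            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; // 5th bit of element l
            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; // 5th bit of element l+1

            out[l + 0] = (uint8_t)((qs[l/2] & 0x0F) | vh0);
            out[l + 1] = (uint8_t)((qs[l/2] >>   4) | vh1);
        }
    }

The / 2 in the histogram lines above then folds these 32 possible values into the 16 buckets noted by the "cast to 16 bins" comment.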
@@ -13014,12 +13256,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
13014
13256
|
block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
|
13015
13257
|
result = ggml_quantize_q4_2(src + start, block, n, n, hist);
|
13016
13258
|
} break;
|
13017
|
-
case GGML_TYPE_Q4_3:
|
13018
|
-
{
|
13019
|
-
GGML_ASSERT(start % QK4_3 == 0);
|
13020
|
-
block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
|
13021
|
-
result = ggml_quantize_q4_3(src + start, block, n, n, hist);
|
13022
|
-
} break;
|
13023
13259
|
case GGML_TYPE_Q5_0:
|
13024
13260
|
{
|
13025
13261
|
GGML_ASSERT(start % QK5_0 == 0);
|