llama_cpp 0.14.7 → 0.15.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +59 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -3
- data/vendor/tmp/llama.cpp/Makefile +42 -18
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +78 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +399 -184
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +302 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +28 -16
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +951 -263
- data/vendor/tmp/llama.cpp/ggml.c +1457 -92
- data/vendor/tmp/llama.cpp/ggml.h +37 -7
- data/vendor/tmp/llama.cpp/llama.cpp +671 -403
- data/vendor/tmp/llama.cpp/llama.h +34 -10
- data/vendor/tmp/llama.cpp/sgemm.cpp +134 -103
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1188 -656
- data/vendor/tmp/llama.cpp/unicode-data.h +4 -3
- data/vendor/tmp/llama.cpp/unicode.cpp +590 -49
- data/vendor/tmp/llama.cpp/unicode.h +6 -3
- metadata +3 -3
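
The bulk of this release's vendored llama.cpp update is in data/vendor/tmp/llama.cpp/ggml.c, which gains a BF16 (bfloat16) tensor type and a new fused-attention op, GGML_OP_FLASH_ATTN_EXT. For orientation before reading the diff: bf16 keeps only the high 16 bits of an IEEE-754 f32, so the conversions the new code leans on come down to a shift plus a rounding step. A minimal standalone sketch follows (illustrative names, not the gem's API; ggml's own versions live in the updated ggml-impl.h, and the rounding/NaN handling shown here is the assumed scheme):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint16_t bits; } bf16_t;   /* mirrors ggml_bf16_t: high half of an f32 */

    /* bf16 -> f32: put the 16 stored bits in the high half ("it just left shifts") */
    static float bf16_to_f32(bf16_t h) {
        uint32_t u = (uint32_t) h.bits << 16;
        float f;
        memcpy(&f, &u, sizeof(f));
        return f;
    }

    /* f32 -> bf16: round to nearest even, keeping NaN representable (assumed scheme) */
    static bf16_t f32_to_bf16(float f) {
        uint32_t u;
        bf16_t h;
        memcpy(&u, &f, sizeof(u));
        if ((u & 0x7fffffff) > 0x7f800000) {              /* NaN: truncate, force quiet bit */
            h.bits = (uint16_t) ((u >> 16) | 64);
            return h;
        }
        h.bits = (uint16_t) ((u + (0x7fff + ((u >> 16) & 1))) >> 16);  /* round half to even */
        return h;
    }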
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -322,7 +322,7 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
-const char * ggml_status_to_string(enum ggml_status status) {
+GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
     switch (status) {
         case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
         case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
@@ -333,16 +333,26 @@ const char * ggml_status_to_string(enum ggml_status status) {
     return "GGML status: unknown";
 }
 
-// note: do not use these inside ggml.c
-// these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
+#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
     return GGML_FP16_TO_FP32(x);
 }
 
 ggml_fp16_t ggml_fp32_to_fp16(float x) {
+#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml
     return GGML_FP32_TO_FP16(x);
 }
 
+float ggml_bf16_to_fp32(ggml_bf16_t x) {
+#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml
+    return GGML_BF16_TO_FP32(x);  // it just left shifts
+}
+
+ggml_bf16_t ggml_fp32_to_bf16(float x) {
+#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
+    return GGML_FP32_TO_BF16(x);
+}
+
 void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
@@ -368,6 +378,49 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
     }
 }
 
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX512F__)
+    for (; i + 16 <= n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#elif defined(__AVX2__)
+    for (; i + 8 <= n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
+    int i = 0;
+#if defined(__AVX512BF16__)
+    for (; i + 32 <= n; i += 32) {
+        _mm512_storeu_ps(
+            (__m512 *)(y + i),
+            (__m512)_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
+                                        _mm512_loadu_ps(x + i)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
 bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
     return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
 }
@@ -503,6 +556,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
 static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
@@ -845,6 +899,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size = sizeof(block_q8_K),
         .is_quantized = true,
         .from_float = quantize_row_q8_K,
+    },
+    [GGML_TYPE_BF16] = {
+        .type_name = "bf16",
+        .blck_size = 1,
+        .type_size = sizeof(ggml_bf16_t),
+        .is_quantized = false,
+        .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
+        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float_reference = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
+        .vec_dot_type = GGML_TYPE_BF16,
+        .nrows = 1,
     }
 };
 
@@ -951,7 +1017,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
 #define GGML_F16_VEC_FMA GGML_F16x8_FMA
 #define GGML_F16_VEC_ADD GGML_F16x8_ADD
 #define GGML_F16_VEC_MUL GGML_F16x8_MUL
@@ -977,7 +1043,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
 #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
 #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
 #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
 #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
 #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
@@ -1046,7 +1112,7 @@ do { \
 
 // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
 // so F16C guard isn't required
-#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
 #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
 
 #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
@@ -1144,7 +1210,7 @@ do { \
 
 #if defined(__F16C__)
 // the _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #else
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
@@ -1480,6 +1546,8 @@ inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
 
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
+inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
 inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
@@ -1498,7 +1566,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
     UNUSED(by);
     UNUSED(bs);
 
-#ifdef GGML_SIMD
+#if defined(GGML_SIMD)
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -1534,6 +1602,70 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
     *s = sumf;
 }
 
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    int i = 0;
+    ggml_float sumf = 0;
+
+#if defined(__AVX512BF16__)
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 64 <= n; i += 64) {
+        c1 = _mm512_dpbf16_ps(c1, (__m512bh)_mm512_loadu_ps((const float *)(x + i)),
+                              (__m512bh)_mm512_loadu_ps((const float *)(y + i)));
+        c2 = _mm512_dpbf16_ps(c2, (__m512bh)_mm512_loadu_ps((const float *)(x + i + 32)),
+                              (__m512bh)_mm512_loadu_ps((const float *)(y + i + 32)));
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#elif defined(__AVX512F__)
+#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#undef LOAD
+#elif defined(__AVX2__)
+#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+    __m256 c1 = _mm256_setzero_ps();
+    __m256 c2 = _mm256_setzero_ps();
+    __m256 c3 = _mm256_setzero_ps();
+    __m256 c4 = _mm256_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
+        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
+        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
+    }
+    __m128 g;
+    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
+                       _mm256_add_ps(c2, c4));
+    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
+                   _mm256_castps256_ps128(c1));
+    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
+    g = _mm_add_ss(g, _mm_movehdup_ps(g));
+    sumf += (ggml_float)_mm_cvtss_f32(g);
+
+#undef LOAD
+#endif
+
+    for (; i < n; ++i) {
+        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
+                             GGML_BF16_TO_FP32(y[i]));
+    }
+    *s = sumf;
+}
+
 static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
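
For reference, the AVX512_BF16, AVX512F, and AVX2 paths in the hunk above all compute the same quantity as the scalar tail loop. A plain-C reference, using the illustrative bf16_to_f32 helper sketched earlier (not the library's code), would be:

    /* Scalar reference for the bf16 dot product; ggml_float is typically double. */
    static float vec_dot_bf16_ref(int n, const bf16_t * x, const bf16_t * y) {
        double sum = 0.0;  /* wide accumulator, mirroring ggml_float */
        for (int i = 0; i < n; ++i) {
            sum += (double) bf16_to_f32(x[i]) * (double) bf16_to_f32(y[i]);
        }
        return (float) sum;
    }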
@@ -1662,6 +1794,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 #endif
 }
 
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
 // xs and vs are byte strides of x and v
 inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
 
@@ -1746,6 +1909,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
 }
 
+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
@@ -1907,6 +2099,14 @@ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_
     *s = sum;
 }
 
+inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_BF16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
 inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     float max = -INFINITY;
@@ -2000,6 +2200,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "LEAKY_RELU",
 
     "FLASH_ATTN",
+    "FLASH_ATTN_EXT",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "SSM_CONV",
@@ -2026,7 +2227,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2090,6 +2291,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "leaky_relu(x)",
 
     "flash_attn(x)",
+    "flash_attn_ext(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",
@@ -2116,7 +2318,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2315,7 +2517,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
     // figure out which node we're on
     uint current_cpu;
     int getcpu_ret = 0;
-#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
     getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
 #else
     // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
@@ -2526,6 +2728,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     switch (ftype) {
        case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
        case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
+       case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break;
        case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
        case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
        case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
@@ -2667,15 +2870,16 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     {
         const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
-        ggml_fp16_t ii;
         for (int i = 0; i < (1 << 16); ++i) {
-            uint16_t ui = i;
-            memcpy(&ii, &ui, sizeof(ii));
-            const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
+            union {
+                uint16_t u16;
+                ggml_fp16_t fp16;
+            } u = {i};
+            float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
             ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
             ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
             ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-            ggml_table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
+            ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
         }
 
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -3139,6 +3343,13 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
                     ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
                 }
             } break;
+        case GGML_TYPE_BF16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+                }
+            } break;
         case GGML_TYPE_F32:
             {
                 assert(tensor->nb[0] == sizeof(float));
@@ -3191,6 +3402,13 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
                     ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
                 }
             } break;
+        case GGML_TYPE_BF16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_bf16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+                }
+            } break;
         case GGML_TYPE_F32:
             {
                 assert(tensor->nb[0] == sizeof(float));
@@ -3258,6 +3476,11 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+            }
         case GGML_TYPE_F32:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3300,6 +3523,11 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+            } break;
         case GGML_TYPE_F32:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3323,6 +3551,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
             return ((int32_t *) data)[0];
         case GGML_TYPE_F16:
             return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+        case GGML_TYPE_BF16:
+            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
         case GGML_TYPE_F32:
             return ((float *) data)[0];
         default:
@@ -3351,6 +3581,10 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
             {
                 ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
             } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+            } break;
         case GGML_TYPE_F32:
             {
                 ((float *)(data))[0] = value;
@@ -3389,6 +3623,11 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+            }
         case GGML_TYPE_F32:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3431,6 +3670,11 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+            } break;
         case GGML_TYPE_F32:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3454,6 +3698,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
             return ((int32_t *) data)[0];
         case GGML_TYPE_F16:
             return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+        case GGML_TYPE_BF16:
+            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
         case GGML_TYPE_F32:
             return ((float *) data)[0];
         default:
@@ -3482,6 +3728,10 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
             {
                 ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
             } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+            } break;
         case GGML_TYPE_F32:
             {
                 ((float *)(data))[0] = value;
@@ -3676,7 +3926,11 @@ static struct ggml_tensor * ggml_add_cast_impl(
     // TODO: support less-strict constraint
     //       GGML_ASSERT(ggml_can_repeat(b, a));
     GGML_ASSERT(ggml_can_repeat_rows(b, a));
-    GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
+
+    // currently only supported for quantized input and f16
+    GGML_ASSERT(ggml_is_quantized(a->type) ||
+                a->type == GGML_TYPE_F16 ||
+                a->type == GGML_TYPE_BF16);
 
     bool is_node = false;
 
@@ -4559,6 +4813,8 @@ struct ggml_tensor * ggml_mul_mat(
 void ggml_mul_mat_set_prec(
         struct ggml_tensor * a,
         enum ggml_prec prec) {
+    GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
+
     const int32_t prec_i32 = (int32_t) prec;
 
     ggml_set_op_params_i32(a, 0, prec_i32);
@@ -5397,17 +5653,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
     GGML_ASSERT(ggml_is_contiguous(a));
 
     if (mask) {
+        GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(ggml_is_matrix(mask));
-        GGML_ASSERT(
+        GGML_ASSERT(mask->ne[0] == a->ne[0]);
+        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
     if (pos) {
         GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F32);
+        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
         GGML_ASSERT(pos->ne[0] == a->ne[0]);
     }
 
+    if (pos && mask) {
+        GGML_ASSERT(pos->type == mask->type);
+    }
+
     if (max_bias > 0.0f) {
         GGML_ASSERT(pos);
     }
@@ -6216,6 +6478,59 @@ struct ggml_tensor * ggml_flash_attn(
     return result;
 }
 
+// ggml_flash_attn_ext
+
+struct ggml_tensor * ggml_flash_attn_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor * q,
+        struct ggml_tensor * k,
+        struct ggml_tensor * v,
+        struct ggml_tensor * mask,
+        float scale) {
+    GGML_ASSERT(ggml_can_mul_mat(k, q));
+    // TODO: check if vT can be multiplied by (k*qT)
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
+                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
+        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
+    }
+
+    bool is_node = false;
+
+    if (q->grad || k->grad || v->grad) {
+        is_node = true;
+    }
+
+    // permute(0, 2, 1, 3)
+    int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_FLASH_ATTN_EXT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = mask;
+
+    return result;
+}
+
+void ggml_flash_attn_ext_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec prec) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+}
+
 // ggml_flash_ff
 
 struct ggml_tensor * ggml_flash_ff(
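
The new op takes (q, k, v, mask, scale) and returns a permuted f32 tensor. A hypothetical caller-side sketch, with the conventional 1/sqrt(head_dim) scale assumed (not dictated by this diff) and the mask padded per the asserts above:

    /* q, k, v, kq_mask already built in ctx; q->ne[0] is the head dimension. */
    const float scale = 1.0f/sqrtf((float) q->ne[0]);
    struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, scale);
    ggml_flash_attn_ext_set_prec(out, GGML_PREC_F32);  /* optionally force f32 accumulation */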
@@ -7092,8 +7407,8 @@ static void ggml_compute_forward_dup_same_cont(
                 ((char *) src0->data + ie0*nb00),
                 (ie1 - ie0) * ggml_type_size(src0->type));
     }
-
 }
+
 static void ggml_compute_forward_dup_f16(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -7367,7 +7682,7 @@ static void ggml_compute_forward_dup_f16(
     }
 }
 
-static void ggml_compute_forward_dup_f32(
+static void ggml_compute_forward_dup_bf16(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
 
@@ -7415,10 +7730,11 @@ static void ggml_compute_forward_dup_f32(
         return;
     }
 
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
     if (ggml_is_contiguous(dst)) {
-        // TODO: simplify
-        if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
+        if (nb00 == sizeof(ggml_bf16_t)) {
+            if (dst->type == GGML_TYPE_BF16) {
                 size_t id = 0;
                 const size_t rs = ne00 * nb00;
                 char * dst_ptr = (char *) dst->data;
@@ -7434,8 +7750,43 @@ static void ggml_compute_forward_dup_f32(
                         id += rs * (ne01 - ir1);
                     }
                 }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
             } else if (type_traits[dst->type].from_float) {
                 ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
+                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
                 size_t id = 0;
                 size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@@ -7445,8 +7796,13 @@ static void ggml_compute_forward_dup_f32(
                     for (int i02 = 0; i02 < ne02; i02++) {
                         id += rs * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+                            }
+
+                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);
@@ -7467,7 +7823,25 @@ static void ggml_compute_forward_dup_f32(
                         id += ne00 * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
                                 dst_ptr[id] = *src0_ptr;
                                 id++;
@@ -7485,9 +7859,9 @@ static void ggml_compute_forward_dup_f32(
                         id += ne00 * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
-                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
                                 id++;
                             }
                         }
@@ -7498,18 +7872,16 @@ static void ggml_compute_forward_dup_f32(
                 GGML_ASSERT(false); // TODO: implement
             }
         }
-
         return;
     }
 
     // dst counters
-
     int64_t i10 = 0;
     int64_t i11 = 0;
     int64_t i12 = 0;
     int64_t i13 = 0;
 
-    if (dst->type == GGML_TYPE_F32) {
+    if (dst->type == GGML_TYPE_BF16) {
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 i10 += ne00 * ir0;
@@ -7530,15 +7902,15 @@ static void ggml_compute_forward_dup_f32(
                         const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                         char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
-                        memcpy(dst_ptr, src0_ptr, sizeof(float));
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));
 
-                        if (++i10 == ne0) {
+                        if (++i10 == ne00) {
                             i10 = 0;
-                            if (++i11 == ne1) {
+                            if (++i11 == ne01) {
                                 i11 = 0;
-                                if (++i12 == ne2) {
+                                if (++i12 == ne02) {
                                     i12 = 0;
-                                    if (++i13 == ne3) {
+                                    if (++i13 == ne03) {
                                         i13 = 0;
                                     }
                                 }
@@ -7582,7 +7954,7 @@ static void ggml_compute_forward_dup_f32(
                         const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                         char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
-                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
 
                         if (++i10 == ne0) {
                             i10 = 0;
@@ -7613,10 +7985,383 @@ static void ggml_compute_forward_dup_f32(
                 }
             }
         }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
-    }
-}
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
+        ggml_compute_forward_dup_same_cont(params, dst);
+        return;
+    }
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    if (ggml_is_contiguous(dst)) {
+        // TODO: simplify
+        if (nb00 == sizeof(float)) {
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                const size_t rs = ne00 * nb00;
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, rs);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else if (type_traits[dst->type].from_float) {
+                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
+
+                size_t id = 0;
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
+            }
+        }
+
+        return;
+    }
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(float));
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_BF16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
+    }
+}
 
 // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
 static void ggml_compute_forward_dup_bytes(
@@ -7786,6 +8531,10 @@ static void ggml_compute_forward_dup(
             {
                 ggml_compute_forward_dup_f16(params, dst);
             } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_dup_bf16(params, dst);
+            } break;
         case GGML_TYPE_F32:
             {
                 ggml_compute_forward_dup_f32(params, dst);
@@ -7968,6 +8717,85 @@ static void ggml_compute_forward_add_f16_f32(
         }
     }
 }
 
+static void ggml_compute_forward_add_bf16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (dst->type == GGML_TYPE_F32) {
+        GGML_ASSERT( nb0 == sizeof(float));
+    }
+    else {
+        GGML_ASSERT(dst->type == GGML_TYPE_BF16);
+        GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    }
+
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        if (dst->type == GGML_TYPE_BF16) {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+                }
+            }
+        } else {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+                }
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ASSERT(false);
+    }
+}
+
 static void ggml_compute_forward_add_f16_f16(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -8024,6 +8852,62 @@ static void ggml_compute_forward_add_f16_f16(
         }
     }
 }
 
+static void ggml_compute_forward_add_bf16_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_BF16);
+    GGML_ASSERT(dst->type == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(ggml_bf16_t)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+            ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+            ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+            for (int i = 0; i < ne0; i++) {
+                dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i]));
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ASSERT(false);
+    }
+}
+
 static void ggml_compute_forward_add_q_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
|
|
8133
9017
|
GGML_ASSERT(false);
|
8134
9018
|
}
|
8135
9019
|
} break;
|
9020
|
+
case GGML_TYPE_BF16:
|
9021
|
+
{
|
9022
|
+
if (src1->type == GGML_TYPE_BF16) {
|
9023
|
+
ggml_compute_forward_add_bf16_bf16(params, dst);
|
9024
|
+
}
|
9025
|
+
else if (src1->type == GGML_TYPE_F32) {
|
9026
|
+
ggml_compute_forward_add_bf16_f32(params, dst);
|
9027
|
+
}
|
9028
|
+
else {
|
9029
|
+
GGML_ASSERT(false);
|
9030
|
+
}
|
9031
|
+
} break;
|
8136
9032
|
case GGML_TYPE_Q4_0:
|
8137
9033
|
case GGML_TYPE_Q4_1:
|
8138
9034
|
case GGML_TYPE_Q5_0:
|
@@ -8346,21 +9242,133 @@ static void ggml_compute_forward_add1_q_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
-
-    // we don't support permuted src0
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
+    ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
+
+    // we don't support permuted src0
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ggml_is_quantized(src0->type));
+    GGML_ASSERT(dst->type == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
+        void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 ));
+
+        assert(ne0 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        dequantize_row_q(src0_row, wdata, ne0);
+        // add src1
+        ggml_vec_acc1_f32(ne0, wdata, v);
+        // quantize row to dst
+        quantize_row_q(wdata, dst_row, ne0);
+    }
+}
+
+static void ggml_compute_forward_add1_bf16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    // scalar to add
+    const float v = *(float *) src1->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1_bf16_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
|
9352
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
9353
|
+
return;
|
9354
|
+
}
|
9355
|
+
|
9356
|
+
// scalar to add
|
9357
|
+
const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data);
|
9358
|
+
|
9359
|
+
const int ith = params->ith;
|
9360
|
+
const int nth = params->nth;
|
9361
|
+
|
9362
|
+
const int nr = ggml_nrows(src0);
|
9363
|
+
|
9364
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
9365
|
+
|
9366
|
+
GGML_ASSERT(src0->type == GGML_TYPE_BF16);
|
9367
|
+
GGML_ASSERT(src1->type == GGML_TYPE_BF16);
|
9368
|
+
GGML_ASSERT(dst->type == GGML_TYPE_BF16);
|
8360
9369
|
|
8361
|
-
GGML_ASSERT(
|
8362
|
-
GGML_ASSERT(
|
8363
|
-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
9370
|
+
GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
|
9371
|
+
GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
|
8364
9372
|
|
8365
9373
|
// rows per thread
|
8366
9374
|
const int dr = (nr + nth - 1)/nth;
|
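
Every kernel in this change splits rows across threads with the same ceil-divide idiom seen here: dr rows per thread, with the last thread's range clamped. A standalone illustration (thread_rows is a hypothetical helper, not a ggml API):

    #include <stdio.h>

    // split nr rows across nth workers; worker ith gets [*ir0, *ir1)
    static void thread_rows(int nr, int nth, int ith, int * ir0, int * ir1) {
        const int dr = (nr + nth - 1)/nth;            // ceil(nr/nth) rows per thread
        *ir0 = dr*ith;                                // first row of this thread
        *ir1 = (*ir0 + dr < nr) ? *ir0 + dr : nr;     // clamp, matching MIN(ir0 + dr, nr)
    }

    int main(void) {
        // e.g. 10 rows over 4 threads -> [0,3) [3,6) [6,9) [9,10)
        for (int ith = 0; ith < 4; ith++) {
            int ir0, ir1;
            thread_rows(10, 4, ith, &ir0, &ir1);
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }
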
@@ -8369,25 +9377,17 @@ static void ggml_compute_forward_add1_q_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
-
     for (int ir = ir0; ir < ir1; ++ir) {
         // src0 and dst are same shape => same indices
         const int i3 = ir/(ne2*ne1);
         const int i2 = (ir - i3*ne2*ne1)/ne1;
         const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
 
-        void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
-        void * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
-
-        assert(ne0 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne0);
-        // add src1
-        ggml_vec_acc1_f32(ne0, wdata, v);
-        // quantize row to dst
-        quantize_row_q(wdata, dst_row, ne0);
+        ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+        }
     }
 }
 
@@ -8415,6 +9415,18 @@ static void ggml_compute_forward_add1(
                     GGML_ASSERT(false);
                 }
             } break;
+        case GGML_TYPE_BF16:
+            {
+                if (src1->type == GGML_TYPE_BF16) {
+                    ggml_compute_forward_add1_bf16_bf16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add1_bf16_f32(params, dst);
+                }
+                else {
+                    GGML_ASSERT(false);
+                }
+            } break;
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
@@ -8543,6 +9555,7 @@ static void ggml_compute_forward_acc(
                 ggml_compute_forward_acc_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
@@ -9064,6 +10077,40 @@ static void ggml_compute_forward_sum_f16(
     ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
 }
 
+static void ggml_compute_forward_sum_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(params->ith == 0);
+    assert(ggml_is_scalar(dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    assert(src0->nb[0] == sizeof(ggml_bf16_t));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
+
+    float sum = 0;
+    float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_bf16_ggf(ne00,
+                    &row_sum,
+                    (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum);
+}
+
 static void ggml_compute_forward_sum(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
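
Note that the accumulator stays in float and only the final scalar is narrowed; summing directly in bf16 would compound rounding error across every add. A sketch of what a row reduction such as ggml_vec_sum_bf16_ggf plausibly does (an assumption; the helper below is illustrative, not the ggml implementation):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint16_t bits; } bf16_t; // stand-in for ggml_bf16_t

    static float bf16_to_f32(bf16_t h) {
        uint32_t u = (uint32_t) h.bits << 16;
        float f;
        memcpy(&f, &u, sizeof(f));
        return f;
    }

    // accumulate a bf16 row in float: widening each element before adding
    // avoids compounding bf16 rounding error across n additions
    static void row_sum_bf16(int n, float * s, const bf16_t * x) {
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
            sum += bf16_to_f32(x[i]);
        }
        *s = sum;
    }
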
@@ -9079,6 +10126,10 @@ static void ggml_compute_forward_sum(
             {
                 ggml_compute_forward_sum_f16(params, dst);
             } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_sum_bf16(params, dst);
+            } break;
         default:
             {
                 GGML_ASSERT(false);
@@ -9353,6 +10404,7 @@ static void ggml_compute_forward_repeat(
 
     switch (src0->type) {
         case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
         case GGML_TYPE_I16:
             {
                 ggml_compute_forward_repeat_f16(params, dst);
@@ -11670,6 +12722,7 @@ static void ggml_compute_forward_set(
                 ggml_compute_forward_set_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
@@ -11844,6 +12897,49 @@ static void ggml_compute_forward_get_rows_f16(
     }
 }
 
+static void ggml_compute_forward_get_rows_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_bf16_t));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        ggml_bf16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+    }
+}
+
 static void ggml_compute_forward_get_rows_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
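
The gather itself is simple once the flattened id index is unrolled: each int32 id selects a source row, which is widened to f32 on the way out. A contiguous-memory sketch of the same operation (hypothetical helper; the kernel above additionally honors arbitrary byte strides):

    #include <stdint.h>
    #include <string.h>

    // dst[i] = widen(src[ids[i]]) for nr row ids, rows nc elements wide
    static void get_rows_bf16_sketch(int64_t nr, int64_t nc,
                                     const uint16_t * src, const int32_t * ids, float * dst) {
        for (int64_t i = 0; i < nr; ++i) {
            const uint16_t * row = src + (int64_t) ids[i]*nc; // pick source row by id
            for (int64_t c = 0; c < nc; ++c) {
                uint32_t u = (uint32_t) row[c] << 16;         // widen bf16 -> f32
                float f;
                memcpy(&f, &u, sizeof(f));
                dst[i*nc + c] = f;
            }
        }
    }
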
@@ -11921,6 +13017,10 @@ static void ggml_compute_forward_get_rows(
             {
                 ggml_compute_forward_get_rows_f16(params, dst);
             } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_get_rows_bf16(params, dst);
+            } break;
         case GGML_TYPE_F32:
         case GGML_TYPE_I32:
             {
@@ -12255,7 +13355,7 @@ static void ggml_compute_forward_soft_max_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
 
     // TODO: is this supposed to be ceil instead of floor?
     //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
@@ -12278,19 +13378,31 @@ static void ggml_compute_forward_soft_max_f32(
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
     // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    float * pos = src2 ? (float *) src2->data : src0->data;
+    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
+    float       * pos_f32 = src2 ? (float       *) src2->data : src0->data;
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
 
         // broadcast the mask across rows
-        float * mp = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
 
         ggml_vec_cpy_f32 (nc, wp, sp);
         ggml_vec_scale_f32(nc, wp, scale);
-        if (mp) {
-            ggml_vec_acc_f32(nc, wp, mp);
+        if (mp_f32) {
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += mp_f32[i];
+                }
+            }
         }
 
         // ALiBi bias
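
Numerically, the rewritten loop still computes a plain masked, scaled softmax per row, w = softmax(scale*x + m), with the mask widened from F16 on the fly when use_f16 is set. A compact f32 reference under those assumptions (softmax_row is illustrative, not a ggml API):

    #include <math.h>

    // reference row softmax with optional additive mask, mirroring the order
    // of operations above: scale, add mask, subtract row max, exp, normalize
    static void softmax_row(int n, float * w, const float * x, const float * mask, float scale) {
        float max = -INFINITY;
        for (int i = 0; i < n; i++) {
            w[i] = x[i]*scale + (mask ? mask[i] : 0.0f);
            if (w[i] > max) max = w[i];
        }
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
            w[i] = expf(w[i] - max);
            sum += w[i];
        }
        for (int i = 0; i < n; i++) {
            w[i] /= sum;
        }
    }
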
@@ -12298,8 +13410,14 @@ static void ggml_compute_forward_soft_max_f32(
             const uint32_t h = (i1/ne01)%ne02; // head
             const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
 
-            for (int i = 0; i < nc; ++i) {
-                wp[i] += slope*pos[i];
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*pos_f32[i];
+                }
             }
         }
 
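
The slope follows the ALiBi schedule: head h gets m0^(h+1) below n_head_log2 and m1^(2(h-n_head_log2)+1) above it. A sketch of the per-head slope, with m0/m1 derived as elsewhere in this function (max_bias is the model's ALiBi parameter; that derivation is not shown in this hunk, so treat it as an assumption):

    #include <math.h>
    #include <stdint.h>

    static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
    }
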
@@ -12598,6 +13716,7 @@ static void ggml_compute_forward_alibi(
             {
                 ggml_compute_forward_alibi_f32(params, dst);
             } break;
+        case GGML_TYPE_BF16:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
@@ -12687,6 +13806,7 @@ static void ggml_compute_forward_clamp(
                 ggml_compute_forward_clamp_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
@@ -14569,6 +15689,198 @@ static void ggml_compute_forward_flash_attn(
     }
 }
 
+// ggml_compute_forward_flash_attn_ext
+
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+
+    GGML_ASSERT(nbq0 == sizeof(float));
+    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nev0 == D);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    if (params->type == GGML_TASK_TYPE_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
+    // loop over n_batch and n_head
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        float S = 0.0f;
+        float M = -INFINITY;
+
+        float       * V32 = (float       *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
+        ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32); // reuse memory
+        ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D);
+
+        memset(V16, 0, D*sizeof(ggml_fp16_t));
+
+        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        // online softmax / attention
+        // loop over n_kv and n_head_kv
+        // ref: https://arxiv.org/pdf/2112.05682.pdf
+        for (int64_t ic = 0; ic < nek1; ++ic) {
+            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            if (mv == -INFINITY) {
+                continue;
+            }
+
+            float s;
+
+            // convert Q to F16 in V32
+            {
+                const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+
+                for (int64_t d = 0; d < D; ++d) {
+                    Q16[d] = GGML_FP32_TO_FP16(pq[d]);
+                }
+            }
+
+            ggml_vec_dot_f16(D,
+                    &s, 0,
+                    (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                    Q16, 0, 1);
+
+            s = s*scale + mv;
+
+            const float Mold = M;
+
+            float ms = 1.0f;
+            float vs = 1.0f;
+
+            if (s > M) {
+                M = s;
+                ms = expf(Mold - M);
+
+                // V = V*expf(Mold - M)
+                ggml_vec_scale_f16(D, V16, ms);
+            } else {
+                vs = expf(s - M);
+            }
+
+            const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+
+            // V += v*expf(s - M)
+            ggml_vec_mad_f16(D, V16, v16, vs);
+
+            S = S*ms + vs;
+        }
+
+        // V /= S
+        for (int64_t d = 0; d < D; ++d) {
+            V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
+        }
+
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // original
+        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
+
+        // permute(0, 2, 1, 3)
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
+    }
+}
+
+static void ggml_compute_forward_flash_attn_ext(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    switch (dst->op_params[1]) {
+        case GGML_PREC_DEFAULT:
+        case GGML_PREC_F32:
+            {
+                // uses F32 accumulators
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_ff
 
 static void ggml_compute_forward_flash_ff_f16(
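
The new kernel never materializes a full attention row: it streams over KV positions keeping a running max M, normalizer S, and accumulator V, rescaling old state by expf(Mold - M) whenever the max moves (the reference linked in the code). Reduced to scalars, the recurrence is the sketch below (online_softmax_mix is illustrative, not a ggml API; assumes n >= 1):

    #include <math.h>

    // online softmax-weighted accumulation over a stream of (score s, value v):
    // the same update as the flash_attn_ext loop above, with scalar values
    static float online_softmax_mix(int n, const float * s, const float * v) {
        float M = -INFINITY; // running max of scores
        float S = 0.0f;      // running sum of exp(s - M)
        float V = 0.0f;      // running sum of exp(s - M) * v
        for (int i = 0; i < n; i++) {
            float ms = 1.0f; // rescale factor for the old accumulators
            float vs = 1.0f; // weight of the new element
            if (s[i] > M) {
                ms = expf(M - s[i]);
                M  = s[i];
                V *= ms;     // V = V*expf(Mold - M)
            } else {
                vs = expf(s[i] - M);
            }
            V += vs*v[i];    // V += v*expf(s - M)
            S  = S*ms + vs;
        }
        return V/S;          // equals sum_i softmax(s)_i * v_i
    }
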
@@ -15588,6 +16900,7 @@ static void ggml_compute_forward_get_rel_pos(
 
     switch (src0->type) {
         case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
             {
                 ggml_compute_forward_get_rel_pos_f16(params, dst);
             } break;
@@ -16376,6 +17689,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, masked, tensor);
             } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
+            } break;
         case GGML_OP_FLASH_FF:
             {
                 ggml_compute_forward_flash_ff(params, tensor);
@@ -17388,6 +18705,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
             {
                 struct ggml_tensor * flash_grad = NULL;
                 if (src0->grad || src1->grad || tensor->src[2]->grad) {
@@ -18160,6 +19478,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
                 n_tasks = n_threads;
             } break;
         case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
             {
                 n_tasks = n_threads;
             } break;
@@ -18446,7 +19765,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    if (ggml_is_quantized(node->type)) {
+                    if (ggml_is_quantized(node->type) ||
+                        // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+                        (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
+                        (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
                 } break;
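
Because F16<->BF16 copies detour through F32, the plan reserves one f32 row per task for such nodes. The sizing is just the arithmetic above; a worked instance:

    #include <stdio.h>

    // scratch the graph plan reserves for a CPY that must pass through F32:
    // one float row per task (node->ne[0] elements each), as computed above
    int main(void) {
        const long long ne0     = 4096; // row length of the tensor being copied
        const int       n_tasks = 8;    // threads working on the node
        const long long cur     = (long long) sizeof(float) * ne0 * n_tasks;
        printf("scratch bytes: %lld\n", cur); // 4 * 4096 * 8 = 131072
        return 0;
    }
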
@@ -18525,7 +19847,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     const int64_t ne10 = node->src[1]->ne[0]; // L
                     const int64_t ne11 = node->src[1]->ne[1]; // Cin
 
-                    if (node->src[0]->type == GGML_TYPE_F16 &&
+                    if ((node->src[0]->type == GGML_TYPE_F16 ||
+                         node->src[0]->type == GGML_TYPE_BF16) &&
                         node->src[1]->type == GGML_TYPE_F32) {
                         cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
                         cur += sizeof(ggml_fp16_t)*ne10*ne11;
@@ -18561,8 +19884,17 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     } else if (node->src[1]->type == GGML_TYPE_F16) {
                         cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
+                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
+                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                     }
                 } break;
+            case GGML_OP_FLASH_ATTN_EXT:
+                {
+                    const int64_t ne00 = node->src[0]->ne[0]; // D
+
+                    cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size
+                } break;
             case GGML_OP_FLASH_FF:
                 {
                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -18571,6 +19903,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     } else if (node->src[1]->type == GGML_TYPE_F16) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
+                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
+                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
                     }
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
@@ -18584,6 +19919,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     } else if (node->src[1]->type == GGML_TYPE_F16) {
                         cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
                     }
                 } break;
 
@@ -19360,7 +20698,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
                 if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
                     fprintf(fp, "%d", ggml_get_i32_1d(node, j));
                 }
-                else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+                else if (node->type == GGML_TYPE_F32  ||
+                         node->type == GGML_TYPE_F16  ||
+                         node->type == GGML_TYPE_BF16) {
                     fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
                 }
                 else {
@@ -20418,6 +21758,12 @@ size_t ggml_quantize_chunk(
                 ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
+        case GGML_TYPE_BF16:
+            {
+                size_t elemsize = sizeof(ggml_bf16_t);
+                ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
+                result = n * elemsize;
+            } break;
         case GGML_TYPE_F32:
             {
                 size_t elemsize = sizeof(float);
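
For BF16 there is no block structure, so "quantizing" a chunk is a single row conversion at an element offset. A direct-use sketch, assuming only the declarations visible in this file:

    #include "ggml.h"

    // narrow n floats starting at element `start`; the bytes written are
    // n * sizeof(ggml_bf16_t), matching `result` above
    void narrow_chunk(const float * src, ggml_bf16_t * dst, int64_t start, int64_t n) {
        ggml_fp32_to_bf16_row(src + start, dst + start, n);
    }
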
@@ -20614,7 +21960,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -20659,7 +22005,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     bool ok = true;
 
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     // read the header
     {
@@ -20696,9 +22042,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the kv pairs
     {
-        ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+        const uint64_t n_kv = ctx->header.n_kv;
 
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
+        // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
+        ctx->header.n_kv = 0;
+        ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+
+        for (uint64_t i = 0; i < n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -20747,7 +22097,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                 return NULL;
                             }
 
-                            kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
+                            kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
 
                             ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                         } break;
@@ -20761,7 +22111,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                 return NULL;
                             }
 
-                            kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
+                            kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
 
                             for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                 ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@@ -20777,6 +22127,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             if (!ok) {
                 break;
             }
+
+            ctx->header.n_kv++;
         }
 
         if (!ok) {
@@ -20788,8 +22140,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
 
     // read the tensor infos
-    {
-        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+    if (ctx->header.n_tensors > 0) {
+        ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -20810,8 +22162,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),   &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
 
+            // TODO: return an error instead of crashing with GGML_ASSERT
             gguf_tensor_info_sanitize(info);
 
+            // make sure there is no duplicated tensor names
+            for (uint64_t j = 0; j < i; ++j) {
+                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
+                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
+                    ok = false;
+                }
+            }
+
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                 fclose(file);
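
The duplicate check is a quadratic scan over the names read so far, which is cheap at typical tensor counts. Isolated, it is just the following (names_unique_upto is illustrative):

    #include <stdio.h>
    #include <string.h>

    // returns 0 and reports if names[i] collides with any earlier name
    static int names_unique_upto(char ** names, size_t i) {
        for (size_t j = 0; j < i; ++j) {
            if (strcmp(names[i], names[j]) == 0) {
                fprintf(stderr, "duplicated tensor name %s\n", names[i]);
                return 0;
            }
        }
        return 1;
    }
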
@@ -20980,7 +22341,7 @@ void gguf_free(struct gguf_context * ctx) {
         GGML_FREE(ctx->infos);
     }
 
-    GGML_ALIGNED_FREE(ctx);
+    GGML_FREE(ctx);
 }
 
 const char * gguf_type_name(enum gguf_type type) {
@@ -21291,7 +22652,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = type;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
     memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }
 
@@ -21301,7 +22662,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
     for (int i = 0; i < n; i++) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n    = strlen(data[i]);
@@ -21328,7 +22689,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
@@ -21348,6 +22709,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
 void gguf_add_tensor(
              struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
+    if (gguf_find_tensor(ctx, tensor->name) != -1) {
+        GGML_ASSERT(false && "duplicated tensor name");
+    }
+
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
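
On the writer side the same invariant is now enforced eagerly: adding a name twice trips the assert instead of producing a file the reader would reject. A defensive-usage sketch, assuming the gguf declarations from this library's ggml.h:

    #include "ggml.h"

    // adding the same tensor twice now trips the assert in gguf_add_tensor,
    // so check first when the caller cannot guarantee uniqueness
    void add_once(struct gguf_context * gctx, const struct ggml_tensor * t) {
        if (gguf_find_tensor(gctx, t->name) == -1) {
            gguf_add_tensor(gctx, t); // safe: name not present yet
        }
    }
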
@@ -21416,7 +22781,7 @@ struct gguf_buf {
 
 static struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
-        /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
+        /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
         /*buf.size =*/ size,
         /*buf.offset =*/ 0,
     };