llama_cpp 0.15.1 → 0.15.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,7 +4,6 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
-#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
 #undef GGML_USE_LLAMAFILE
 #endif
 
+#ifdef GGML_USE_LLAMAFILE
+#include "sgemm.h"
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -109,6 +112,8 @@ typedef void * thread_ret_t;
 
 #endif
 
+typedef pthread_t ggml_thread_t;
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
@@ -160,9 +165,6 @@ void ggml_print_backtrace(void) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
-#define GGML_SILU_FP16
-// #define GGML_CROSS_ENTROPY_EXP_FP16
-// #define GGML_FLASH_ATTN_EXP_FP16
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2
@@ -313,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
 // precomputed quick gelu table for f16 (128 KB)
 static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 
-// precomputed silu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-static ggml_fp16_t ggml_table_exp_f16[1 << 16];
-
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
@@ -410,10 +406,10 @@ void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
     int i = 0;
 #if defined(__AVX512BF16__)
     for (; i + 32 <= n; i += 32) {
-
-        (
-        (
-
+        _mm512_storeu_si512(
+            (__m512i *)(y + i),
+            m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
+                                _mm512_loadu_ps(x + i))));
     }
 #endif
     for (; i < n; i++) {
@@ -875,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     },
     [GGML_TYPE_IQ4_XS] = {
         .type_name                = "iq4_xs",
-#if QK_K == 64
-        .blck_size                = QK4_NL,
-#else
         .blck_size                = QK_K,
-#endif
         .type_size                = sizeof(block_iq4_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
         .from_float               = quantize_row_iq4_xs,
         .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_xs_reference,
         .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
-#if QK_K == 64
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#else
         .vec_dot_type             = GGML_TYPE_Q8_K,
-#endif
         .nrows                    = 1,
     },
     [GGML_TYPE_Q8_K] = {
@@ -1303,6 +1291,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
 #define GGML_F16_VEC_SET1   GGML_F32x4_SET1
 #define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
 // Use vec_xl, not vec_ld, in case the load address is not aligned.
 #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
@@ -1525,6 +1515,195 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
 #define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
 #define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
 
+#elif defined(__loongarch_asx)
+
+#define GGML_SIMD
+
+// F32 LASX
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
+#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
+#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
+#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
+#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
+#define GGML_F32x8_ADD     __lasx_xvfadd_s
+#define GGML_F32x8_MUL     __lasx_xvfmul_s
+#define GGML_F32x8_REDUCE(res, x)                                 \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
+    }                                                             \
+    float *tmp_p = (float *)&x[0];                                \
+    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
+} while (0)
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 LASX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+
+#define GGML_F32Cx8          __m256
+#define GGML_F32Cx8_ZERO     (__m256)__lasx_xvldi(0)
+#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplgr2vr_w((x))
+
+static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return (__m256)__lasx_xvld(tmp, 0);
+}
+static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+    float arr[8];
+
+    __lasx_xvst(y, arr, 0);
+
+    for (int i = 0; i < 8; i++)
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+}
+#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
+
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
+#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx8
+#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
+
+#elif defined(__loongarch_sx)
+
+#define GGML_SIMD
+
+// F32 LSX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    __lsx_vldi(0)
+#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define GGML_F32x4_STORE((x),(y))   __lsx_vst((y), (x), 0)
+#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
+#define GGML_F32x4_ADD     __lsx_vfadd_s
+#define GGML_F32x4_MUL     __lsx_vfmul_s
+#define GGML_F32x4_REDUCE(res, x)                                 \
+{                                                                 \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32);               \
+    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]);              \
+    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp);                    \
+    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                 \
+    tmp = __lsx_vsrli_d((__m128i)t0, 32);                         \
+    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0);                \
+    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp);                    \
+    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0);        \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 LSX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return __lsx_vld(tmp, 0);
+}
+
+static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    __lsx_vst(y, arr, 0);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4              __m128
+#define GGML_F32Cx4_ZERO         __lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x)      __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_LOAD(x)      __lsx_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y)  __lsx_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA          GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD          __lsx_vfadd_s
+#define GGML_F32Cx4_MUL          __lsx_vfmul_s
+#define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
@@ -1534,6 +1713,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif
 
+//
+// ggml context
+//
+
+struct ggml_context {
+    size_t mem_size;
+    void* mem_buffer;
+    bool   mem_buffer_owned;
+    bool   no_alloc;
+    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
+
+    int n_objects;
+
+    struct ggml_object* objects_begin;
+    struct ggml_object* objects_end;
+
+    struct ggml_scratch scratch;
+    struct ggml_scratch scratch_save;
+};
+
+struct ggml_context_container {
+    bool used;
+
+    struct ggml_context context;
+};
+
+struct ggml_compute_state_shared {
+    const struct ggml_cgraph* cgraph;
+    const struct ggml_cplan* cplan;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;
+
+    const int n_threads;
+
+    // synchronization primitives
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
+
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
+    void* abort_callback_data;
+
+    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+};
+
+struct ggml_compute_state {
+    ggml_thread_t thrd;
+    int ith;
+    struct ggml_compute_state_shared* shared;
+    enum ggml_status ec;
+};
+
 //
 // fundamental operations
 //
@@ -1615,10 +1847,10 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
     __m512 c1 = _mm512_setzero_ps();
     __m512 c2 = _mm512_setzero_ps();
     for (; i + 64 <= n; i += 64) {
-        c1 = _mm512_dpbf16_ps(c1, (
-
-        c2 = _mm512_dpbf16_ps(c2, (
-
+        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
+                             m512bh(_mm512_loadu_si512((y + i))));
+        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
+                             m512bh(_mm512_loadu_si512((y + i + 32))));
     }
     sumf += (ggml_float)_mm512_reduce_add_ps(c1);
     sumf += (ggml_float)_mm512_reduce_add_ps(c2);
@@ -1949,6 +2181,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
@@ -2024,52 +2257,291 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 
-
-
-//
-//
-//
-//
+#if defined(__ARM_NEON) && defined(__aarch64__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+                                    vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m512 ggml_v_expf(__m512 x) {
+    const __m512 r = _mm512_set1_ps(0x1.8p23f);
+    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+    const __m512 n = _mm512_sub_ps(z, r);
+    const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                                      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+    const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
+    const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
+    const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
+    const __m512 u = _mm512_mul_ps(b, b);
+    const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                                                     _mm512_set1_ps(0x1.573e2ep-5f)), u,
+                                                     _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                                                     _mm512_set1_ps(0x1.fffdb6p-2f))),
+                                     u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
+    if (_mm512_kortestz(c, c))
+        return _mm512_fmadd_ps(j, k, k);
+    const __m512i g = _mm512_and_si512(
+        _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
+        _mm512_set1_epi32(0x82000000u));
+    const __m512 s1 =
+        _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
+    const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+    const __mmask16 d =
+        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+    return _mm512_mask_blend_ps(
+        d, _mm512_mask_blend_ps(
+              c, _mm512_fmadd_ps(k, j, k),
+              _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
+        _mm512_mul_ps(s1, s1));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m512 ggml_v_silu(__m512 x) {
+    const __m512 one = _mm512_set1_ps(1);
+    const __m512 zero = _mm512_setzero_ps();
+    const __m512 neg_x = _mm512_sub_ps(zero, x);
+    const __m512 exp_neg_x = ggml_v_expf(neg_x);
+    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+    return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX2__) && defined(__FMA__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m256 ggml_v_expf(__m256 x) {
+    const __m256 r = _mm256_set1_ps(0x1.8p23f);
+    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+    const __m256 n = _mm256_sub_ps(z, r);
+    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+    const __m256 k = _mm256_castsi256_ps(
+        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+    const __m256i c = _mm256_castps_si256(
+        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                      _mm256_set1_ps(126), _CMP_GT_OQ));
+    const __m256 u = _mm256_mul_ps(b, b);
+    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
+                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
+                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+        return _mm256_fmadd_ps(j, k, k);
+    const __m256i g = _mm256_and_si256(
+        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+        _mm256_set1_epi32(0x82000000u));
+    const __m256 s1 =
+        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+    const __m256i d = _mm256_castps_si256(
+        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                      _mm256_set1_ps(192), _CMP_GT_OQ));
+    return _mm256_or_ps(
+        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+        _mm256_andnot_ps(
+            _mm256_castsi256_ps(d),
+            _mm256_or_ps(
+                _mm256_and_ps(_mm256_castsi256_ps(c),
+                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m256 ggml_v_silu(__m256 x) {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 neg_x = _mm256_sub_ps(zero, x);
+    const __m256 exp_neg_x = ggml_v_expf(neg_x);
+    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+    return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
 
-#
-
-
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
-    }
-}
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
 #else
-
-
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+#endif
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                    _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {
         y[i] = ggml_silu_f32(x[i]);
     }
 }
+
+static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
 #endif
+    for (; i < n; ++i) {
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
 
 inline static float ggml_silu_backward_f32(float x, float dy) {
     const float s = 1.0f/(1.0f + expf(-x));
     return dy*s*(1.0f + x*(1.0f - s));
 }
 
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        // we did not use x[i] to compute forward silu but its f16 equivalent
-        // take derivative at f16 of x[i]:
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        float usedx = GGML_FP16_TO_FP32(fp16);
-        dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
-    }
-}
-#else
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
     for (int i = 0; i < n; ++i) {
         dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
     }
 }
-#endif
 
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
@@ -2185,7 +2657,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "SOFT_MAX_BACK",
     "ROPE",
     "ROPE_BACK",
-    "ALIBI",
     "CLAMP",
     "CONV_TRANSPOSE_1D",
     "IM2COL",
@@ -2199,9 +2670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ARGSORT",
     "LEAKY_RELU",
 
-    "FLASH_ATTN",
     "FLASH_ATTN_EXT",
-    "FLASH_FF",
     "FLASH_ATTN_BACK",
     "SSM_CONV",
     "SSM_SCAN",
@@ -2227,7 +2696,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2276,7 +2745,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "soft_max_back(x)",
    "rope(x)",
     "rope_back(x)",
-    "alibi(x)",
     "clamp(x)",
     "conv_transpose_1d(x)",
     "im2col(x)",
@@ -2290,9 +2758,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "argsort(x)",
     "leaky_relu(x)",
 
-    "flash_attn(x)",
     "flash_attn_ext(x)",
-    "flash_ff(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",
     "ssm_scan(x)",
@@ -2318,7 +2784,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2331,6 +2797,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "TANH",
     "ELU",
     "RELU",
+    "SIGMOID",
     "GELU",
     "GELU_QUICK",
     "SILU",
@@ -2338,7 +2805,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "HARDSIGMOID",
 };
 
-static_assert(GGML_UNARY_OP_COUNT ==
+static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2380,32 +2847,6 @@ static void ggml_setup_op_has_task_pass(void) {
     }
 }
 
-//
-// ggml context
-//
-
-struct ggml_context {
-    size_t mem_size;
-    void * mem_buffer;
-    bool   mem_buffer_owned;
-    bool   no_alloc;
-    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
-
-    int    n_objects;
-
-    struct ggml_object * objects_begin;
-    struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
-};
-
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
 //
 // NUMA support
 //
@@ -2819,8 +3260,18 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
         (t0->ne[3] == t1->ne[3] );
 }
 
-
-
+bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t0->nb[0] == t1->nb[0] ) &&
+        (t0->nb[1] == t1->nb[1] ) &&
+        (t0->nb[2] == t1->nb[2] ) &&
+        (t0->nb[3] == t1->nb[3] );
+}
+
+// check if t1 can be represented as a repeatition of t0
+static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return ggml_is_empty(t0) ? ggml_is_empty(t1) :
@@ -2878,8 +3329,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
             ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
             ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-            ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-            ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
         }
 
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -3163,6 +3612,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
 
+#ifdef __clang__
+    // temporary until ggml_tensor::backend is removed
+    #pragma clang diagnostic push
+    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_TYPE_CPU,
@@ -3185,6 +3640,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.padding      =*/ { 0 },
     };
 
+#ifdef __clang__
+    #pragma clang diagnostic pop
+#endif
+
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
     //ggml_assert_aligned(result->data);
 
@@ -4563,6 +5022,20 @@ struct ggml_tensor * ggml_leaky_relu(
     return result;
 }
 
+// ggml_sigmoid
+
+struct ggml_tensor * ggml_sigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
+struct ggml_tensor * ggml_sigmoid_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
 // ggml_gelu
 
 struct ggml_tensor * ggml_gelu(
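The new sigmoid op follows the standard ggml unary builder pattern. A minimal usage sketch, assuming a caller-managed context and input tensor (names and sizes here are illustrative, not part of the diff):

    // hedged sketch: build a graph node y = sigmoid(x) with the new API
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor  * x   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8); // filled by the caller
    struct ggml_tensor  * y   = ggml_sigmoid(ctx, x);                      // y[i] = 1/(1 + exp(-x[i]))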
@@ -5646,7 +6119,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias,
         bool                  inplace) {
@@ -5660,18 +6132,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
-    if (pos) {
-        GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
-        GGML_ASSERT(pos->ne[0] == a->ne[0]);
-    }
-
-    if (pos && mask) {
-        GGML_ASSERT(pos->type == mask->type);
-    }
-
     if (max_bias > 0.0f) {
-        GGML_ASSERT(
+        GGML_ASSERT(mask);
     }
 
     bool is_node = false;
@@ -5689,7 +6151,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
-    result->src[2] = pos;
 
     return result;
 }
@@ -5697,23 +6158,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL,
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL,
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
 }
 
 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias) {
-    return ggml_soft_max_impl(ctx, a, mask,
+    return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }
 
 // ggml_soft_max_back
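With the `pos` tensor gone, ALiBi is driven entirely by `max_bias` and `ggml_soft_max_ext` shrinks to five arguments. A hedged sketch of the updated call (the tensor and variable names are illustrative):

    // softmax over attention scores: scale = 1/sqrt(head_dim); max_bias = 0.0f disables ALiBi
    struct ggml_tensor * kq_soft = ggml_soft_max_ext(ctx, kq, kq_mask,
                                                     1.0f/sqrtf((float) n_embd_head), 0.0f);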
@@ -5759,6 +6219,7 @@ static struct ggml_tensor * ggml_rope_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
@@ -5772,10 +6233,17 @@ static struct ggml_tensor * ggml_rope_impl(
         float                 xpos_base,
         bool                  xpos_down,
         bool                  inplace) {
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
 
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -5799,6 +6267,7 @@ static struct ggml_tensor * ggml_rope_impl(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }
@@ -5811,7 +6280,7 @@ struct ggml_tensor * ggml_rope(
         int                   mode,
         int                   n_ctx) {
     return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
+        ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
     );
 }
 
@@ -5823,14 +6292,15 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   mode,
         int                   n_ctx) {
     return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
+        ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
     );
 }
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_rope_ext(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
@@ -5842,15 +6312,16 @@ struct ggml_tensor * ggml_rope_custom(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
+        ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
     );
 }
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_rope_ext_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
@@ -5862,19 +6333,49 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
+        ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
     );
 }
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_rope_custom(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx,
+        int                   n_orig_ctx,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+    );
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
        struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         int                   n_dims,
-
-
-
+        int                   mode,
+        int                   n_ctx,
+        int                   n_orig_ctx,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+    );
 }
 
 // ggml_rope_back
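`ggml_rope_ext` is the old custom variant plus the optional frequency-factor tensor `c` (NULL when unused), with `ggml_rope_custom` kept as a thin wrapper. A sketch with illustrative parameter values:

    // extended RoPE; q is the tensor to rotate, pos the I32 position vector, c may be NULL
    struct ggml_tensor * q_rot = ggml_rope_ext(
        ctx, q, pos, /*c =*/ NULL,
        n_rot, /*mode =*/ 0, /*n_ctx =*/ 0, n_orig_ctx,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);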
@@ -5883,6 +6384,7 @@ struct ggml_tensor * ggml_rope_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
@@ -5898,6 +6400,7 @@ struct ggml_tensor * ggml_rope_back(
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
+    GGML_ASSERT(c == NULL && "freq factors not implemented yet");
 
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
@@ -5928,37 +6431,6 @@ struct ggml_tensor * ggml_rope_back(
     return result;
 }
 
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_head,
-        float                 bias_max) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int32_t op_params[3] = { n_past, n_head };
-    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, op_params, sizeof(op_params));
-
-    result->op   = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 // ggml_clamp
 
 struct ggml_tensor * ggml_clamp(
@@ -6308,7 +6780,10 @@ struct ggml_tensor * ggml_pool_2d(
 static struct ggml_tensor * ggml_upscale_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int
+        int                   ne0,
+        int                   ne1,
+        int                   ne2,
+        int                   ne3) {
     bool is_node = false;
 
     if (a->grad) {
@@ -6316,19 +6791,45 @@ static struct ggml_tensor * ggml_upscale_impl(
         is_node = true;
     }
 
+    GGML_ASSERT(a->ne[0] <= ne0);
+    GGML_ASSERT(a->ne[1] <= ne1);
+    GGML_ASSERT(a->ne[2] <= ne2);
+    GGML_ASSERT(a->ne[3] <= ne3);
+
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-
-
-
+            ne0,
+            ne1,
+            ne2,
+            ne3
+            );
 
     result->op = GGML_OP_UPSCALE;
-
+
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
     return result;
 }
 
+struct ggml_tensor * ggml_upscale(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   scale_factor) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
+}
+
+struct ggml_tensor * ggml_upscale_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   ne0,
+        int                   ne1,
+        int                   ne2,
+        int                   ne3) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
+}
+
+// ggml_pad
+
 struct ggml_tensor * ggml_pad(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
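The old scale-factor form is preserved on top of the generic implementation; the two calls below are equivalent ways to double a tensor's first two dimensions (names are illustrative):

    struct ggml_tensor * a2 = ggml_upscale(ctx, a, 2);                                        // ne0*2, ne1*2
    struct ggml_tensor * ax = ggml_upscale_ext(ctx, a, a->ne[0]*2, a->ne[1]*2, a->ne[2], a->ne[3]);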
@@ -6353,12 +6854,7 @@ struct ggml_tensor * ggml_pad(
     return result;
 }
 
-
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   scale_factor) {
-    return ggml_upscale_impl(ctx, a, scale_factor);
-}
+// ggml_arange
 
 struct ggml_tensor * ggml_arange(
     struct ggml_context * ctx,
@@ -6380,6 +6876,8 @@ struct ggml_tensor * ggml_arange(
     return result;
 }
 
+// ggml_timestep_embedding
+
 struct ggml_tensor * ggml_timestep_embedding(
         struct ggml_context * ctx,
         struct ggml_tensor  * timesteps,
@@ -6446,38 +6944,6 @@ struct ggml_tensor * ggml_top_k(
     return result;
 }
 
-// ggml_flash_attn
-
-struct ggml_tensor * ggml_flash_attn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        bool                  masked) {
-    GGML_ASSERT(ggml_can_mul_mat(k, q));
-    // TODO: check if vT can be multiplied by (k*qT)
-
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
-
-    int32_t t = masked ? 1 : 0;
-    ggml_set_op_params(result, &t, sizeof(t));
-
-    result->op   = GGML_OP_FLASH_ATTN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = q;
-    result->src[1] = k;
-    result->src[2] = v;
-
-    return result;
-}
-
 // ggml_flash_attn_ext
 
 struct ggml_tensor * ggml_flash_attn_ext(
@@ -6486,9 +6952,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
         struct ggml_tensor  * k,
         struct ggml_tensor  * v,
         struct ggml_tensor  * mask,
-        float                 scale
+        float                 scale,
+        float                 max_bias) {
     GGML_ASSERT(ggml_can_mul_mat(k, q));
     // TODO: check if vT can be multiplied by (k*qT)
+
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6966,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
     }
 
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(mask);
+    }
+
     bool is_node = false;
 
     if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6980,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_FLASH_ATTN_EXT;
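`scale` and `max_bias` now travel together in the op params, and a non-zero `max_bias` requires a mask. A hedged call sketch (tensor names are illustrative):

    // flash attention with ALiBi disabled (max_bias = 0.0f)
    struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
                                                   1.0f/sqrtf((float) n_embd_head), 0.0f);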
@@ -6528,39 +7000,7 @@ void ggml_flash_attn_ext_set_prec(
 
     const int32_t prec_i32 = (int32_t) prec;
 
-    ggml_set_op_params_i32(a,
-}
-
-// ggml_flash_ff
-
-struct ggml_tensor * ggml_flash_ff(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b0,
-        struct ggml_tensor  * b1,
-        struct ggml_tensor  * c0,
-        struct ggml_tensor  * c1) {
-    GGML_ASSERT(ggml_can_mul_mat(b0, a));
-    // TODO: more checks
-
-    bool is_node = false;
-
-    if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
-
-    result->op   = GGML_OP_FLASH_FF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b0;
-    result->src[2] = b1;
-    result->src[3] = c0;
-    result->src[4] = c1;
-
-    return result;
+    ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
 }
 
 // ggml_flash_attn_back
@@ -6572,6 +7012,8 @@ struct ggml_tensor * ggml_flash_attn_back(
         struct ggml_tensor  * v,
         struct ggml_tensor  * d,
         bool                  masked) {
+    GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
+
     GGML_ASSERT(ggml_can_mul_mat(k, q));
     // TODO: check if vT can be multiplied by (k*qT)
 
@@ -10892,6 +11334,52 @@ static void ggml_compute_forward_relu(
     }
 }

+// ggml_compute_forward_sigmoid
+
+static void ggml_compute_forward_sigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_gelu

 static void ggml_compute_forward_gelu_f32(
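Note: the per-row work is delegated to ggml_vec_sigmoid_f32, which is defined alongside the other vector helpers rather than in this hunk. A scalar reference for the semantics it is expected to implement (a sketch, not the shipped vectorized code):

    #include <math.h>

    // y[i] = logistic(x[i]); reference semantics for ggml_vec_sigmoid_f32
    static void vec_sigmoid_f32_ref(const int n, float * y, const float * x) {
        for (int i = 0; i < n; ++i) {
            y[i] = 1.0f / (1.0f + expf(-x[i]));
        }
    }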
@@ -11742,80 +12230,171 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
 }
 #endif

-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
+static void ggml_compute_forward_mul_mat_one_chunk(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const int64_t num_rows_per_vec_dot,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end) {

     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
     GGML_TENSOR_BINARY_OP_LOCALS

-    const int ith = params->ith;
-    const int nth = params->nth;
-
     const enum ggml_type type = src0->type;

     const bool src1_cont = ggml_is_contiguous(src1);

-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
-    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-    int64_t           const vec_dot_num_rows      = type_traits[type].nrows;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
+    ggml_vec_dot_t const vec_dot      = type_traits[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;

     // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;

-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
+    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);

-#if defined(GGML_USE_CLBLAST)
-    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
-            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
+    // threads with no work simply yield (not sure if it helps)
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
         return;
     }
-#endif

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane      = ne01*ne00;
-        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
+    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);

-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);

-                        const void      *       x        = (char *)  src0->data    + i02*nb02 + i03*nb03;
-                        float           * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                        ggml_to_float_t   const to_float = type_traits[type].to_float;
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int64_t i13 = (ir1 / (ne12 * ne1));
+                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst,
+              struct ggml_compute_state * state) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+    int64_t           const vec_dot_num_rows      = type_traits[type].nrows;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+    UNUSED(r2);
+    UNUSED(r3);
+
+    // nb01 >= nb00 - src0 is not transposed
+    //   compute by src0 rows
+
+#if defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
+        const int64_t ne_plane      = ne01*ne00;
+        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
+
+        if (params->type == GGML_TASK_TYPE_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void      *       x        = (char *)  src0->data    + i02*nb02 + i03*nb03;
+                        float           * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                        ggml_to_float_t   const to_float = type_traits[type].to_float;

                         for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                             to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
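Note: each chunk worker addresses dst rows through a flattened index ir1 over (ne1, ne12, ne13). A worked example of the decomposition used above, with hypothetical shapes:

    // hypothetical: ne1 = 32, ne12 = 4  =>  ne12*ne1 = 128
    // ir1 = 150 decomposes into i13 = 1, i12 = 0, i11 = 22:
    const int64_t ne1 = 32, ne12 = 4, ir1 = 150;
    const int64_t i13 = ir1 / (ne12 * ne1);               // 150 / 128      = 1
    const int64_t i12 = (ir1 - i13*ne12*ne1) / ne1;       // 22  / 32       = 0
    const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);   // 22 - 0*32      = 22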
@@ -11865,6 +12444,8 @@ static void ggml_compute_forward_mul_mat(
 #endif

 #if GGML_USE_LLAMAFILE
+    const bool src1_cont = ggml_is_contiguous(src1);
+
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
@@ -11890,6 +12471,8 @@ UseGgmlGemm1:;
         if (ith != 0) {
             return;
         }
+        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+        atomic_store(&state->shared->current_chunk, nth);
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11914,11 +12497,11 @@ UseGgmlGemm1:;
         return;
     }

-    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
+        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -11939,98 +12522,87 @@ UseGgmlGemm1:;
 UseGgmlGemm2:;
 #endif

-    const int64_t nr0 = ne01;          // src0 rows
-    const int64_t nr1 = ne1*ne2*ne3;   // src1 rows
-
-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
-
-    // distribute the thread work across the inner or outer loop based on which one is larger
-
-    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-    const int64_t ith0 = ith % nth0;
-    const int64_t ith1 = ith / nth0;
-
-    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-    const int64_t ir010 = dr0*ith0;
-    const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-    const int64_t ir110 = dr1*ith1;
-    const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-    // threads with no work simply yield (not sure if it helps)
-    if (ir010 >= ir011 || ir110 >= ir111) {
-        sched_yield();
-        return;
-    }
+#ifdef GGML_PERF
+    int chunks_executed = 0;
+    UNUSED(chunks_executed);
+#endif

-    assert(ne12 % ne02 == 0);
-    assert(ne13 % ne03 == 0);
+    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
+    const int64_t nr0 = ne0;

-    // block-tiling attempt
-    const int64_t blck_0 = 16;
-    const int64_t blck_1 = 16;
+    // This is the size of the rest of the dimensions of the result
+    const int64_t nr1 = ne1 * ne2 * ne3;

     // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t nrc = vec_dot_num_rows;
+    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
     // TODO: currently the mmla kernels support only even numbered rows/cols.
     // this check can be removed once they are extended to support odd numbered rows/cols too
     if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        nrc = 1;
+        num_rows_per_vec_dot = 1;
     }

-    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;

-    // attempt to reduce false-sharing (does not seem to make a difference)
-    // 16 * 2, accounting for mmla kernels
-    float tmp[32];
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }

-    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
-                const int64_t i13 = (ir1/(ne12*ne1));
-                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
-                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
+    // distribute the work across the inner or outer loop based on which one is larger
+    // The number of chunks in the 0/1 dim.
+    // CEIL(nr0/chunk_size)
+    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

-                // broadcast src0 into src1
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
+    //   Also, chunking by thread was measured to have perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
+    //   In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
+        // distribute the thread work across the inner or outer loop based on which one is larger
+        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+    }

-                const int64_t i1 = i11;
-                const int64_t i2 = i12;
-                const int64_t i3 = i13;
+    // The number of elements in each chunk
+    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

-                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+    //if (ith == 0)
+    //    printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d.  Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);

-                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                //       the original src1 data pointer, so we should index using the indices directly
-                // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                const char * src1_col = (const char *) wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                     : (i11*nb11 + i12*nb12 + i13*nb13));
-                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;

-                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                //}
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int64_t ith0 = current_chunk % nchunk0;
+        const int64_t ith1 = current_chunk / nchunk0;

-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
-                }
+        const int64_t ir0_start = dr0 * ith0;
+        const int64_t ir0_end = MIN(ir0_start + dr0, nr0);

-                for (int cn = 0; cn < nrc; ++cn) {
-                    memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
-            }
+        const int64_t ir1_start = dr1 * ith1;
+        const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+
+#ifdef GGML_PERF
+        chunks_executed++;
+#endif
+
+        if (nth >= nchunk0 * nchunk1) {
+            break;
         }
+
+        current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
     }
+
+#ifdef GGML_PERF
+    // These numbers are useful when trying to measure how well the threading scheduling works.
+    //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
+    //float time = (ggml_perf_time_us() - t0);
+    //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
+#endif
 }

 // ggml_compute_forward_mul_mat_id
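Note: the scheduling introduced here is a plain work-stealing loop. Every thread processes the chunk matching its own index first, then claims further chunks from a shared atomic counter that thread 0 pre-seeded to nth in the INIT hunk above. A standalone sketch of the pattern (process_chunk is a hypothetical stand-in for ggml_compute_forward_mul_mat_one_chunk):

    #include <stdatomic.h>

    extern void process_chunk(int chunk); // hypothetical per-chunk worker

    // current_chunk must be initialized to nth before the workers start
    static void worker(atomic_int * current_chunk, int ith, int nth, int nchunks) {
        int chunk = ith;
        while (chunk < nchunks) {
            process_chunk(chunk);
            if (nth >= nchunks) {
                break; // at most one chunk per thread; nothing left to steal
            }
            chunk = atomic_fetch_add(current_chunk, 1);
        }
    }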
@@ -13333,7 +13905,6 @@ static void ggml_compute_forward_soft_max_f32(

     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];

     assert(ggml_is_contiguous(dst));
     assert(ggml_are_same_shape(src0, dst));
@@ -13359,8 +13930,8 @@ static void ggml_compute_forward_soft_max_f32(

     // TODO: is this supposed to be ceil instead of floor?
     //   https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head_kv   = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
+    const uint32_t n_head      = ne02;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@@ -13377,13 +13948,13 @@ static void ggml_compute_forward_soft_max_f32(

     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;

-    // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
-    float       * pos_f32 = src2 ? (float       *) src2->data : src0->data;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

     for (int i1 = ir0; i1 < ir1; i1++) {
+        // ALiBi
+        const uint32_t h = (i1/ne01)%ne02; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *)  dst->data + i1*dst->nb[1]);

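Note: the slope expression folded into the loop above implements the standard ALiBi head slopes. Written out as a helper (a sketch; with the canonical max_bias = 8.0f and n_head a power of two it reduces to slope(h) = 2^(-8*(h+1)/n_head)):

    #include <math.h>

    static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
        if (max_bias <= 0.0f) {
            return 1.0f; // bias disabled
        }
        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1)
                               : powf(m1, 2*(h - n_head_log2) + 1);
    }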
@@ -13396,27 +13967,11 @@ static void ggml_compute_forward_soft_max_f32(
             if (mp_f32) {
                 if (use_f16) {
                     for (int i = 0; i < nc; ++i) {
-                        wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
-                    }
-                } else {
-                    for (int i = 0; i < nc; ++i) {
-                        wp[i] += mp_f32[i];
-                    }
-                }
-            }
-
-            // ALiBi bias
-            if (max_bias > 0.0f) {
-                const uint32_t h = (i1/ne01)%ne02; // head
-                const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
-
-                if (use_f16) {
-                    for (int i = 0; i < nc; ++i) {
-                        wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+                        wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
                     }
                 } else {
                     for (int i = 0; i < nc; ++i) {
-                        wp[i] += slope*pos_f32[i];
+                        wp[i] += slope*mp_f32[i];
                     }
                 }
             }
@@ -13431,22 +13986,7 @@ static void ggml_compute_forward_soft_max_f32(
         float max = -INFINITY;
         ggml_vec_max_f32(nc, &max, wp);

-        ggml_float sum = 0.0;
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            if (wp[i] == -INFINITY) {
-                dp[i] = 0.0f;
-            } else {
-                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                dp[i] = val;
-            }
-        }
-
+        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
         assert(sum > 0.0);

         sum = 1.0/sum;
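Note: ggml_vec_soft_max_f32 replaces the deleted inline loop (and its f16 exp table) with a single call. Its presumed reference semantics, matching what the removed code computed:

    #include <math.h>

    // dp[i] = exp(wp[i] - max), with -INFINITY mapping to 0; returns the sum
    static double vec_soft_max_f32_ref(const int n, float * dp, const float * wp, float max) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            const float val = (wp[i] == -INFINITY) ? 0.0f : expf(wp[i] - max);
            dp[i] = val;
            sum  += (double) val;
        }
        return sum;
    }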
@@ -13578,68 +14118,9 @@ static void ggml_compute_forward_soft_max_back(
     }
 }

-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n       = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int64_t i = 0; i < ne0; i++) {
-            for (int64_t j = 0; j < ne1; j++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
+// ggml_compute_forward_clamp

-static void ggml_compute_forward_alibi_f16(
+static void ggml_compute_forward_clamp_f32(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst) {

@@ -13651,71 +14132,48 @@ static void ggml_compute_forward_alibi_f16(
         return;
     }

-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+    float min;
+    float max;
+    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
+    const int ith = params->ith;
+    const int nth = params->nth;

     const int n = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+    const int nc = src0->ne[0];

-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];

-    for (int k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];

-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));

-        for (int i = 0; i < ne0; i++) {
-            for (int j = 0; j < ne1; j++) {
-                ggml_fp16_t * const src  = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float       *       pdst =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
+    for (int j = ith; j < n; j += nth) {
+        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
+        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);

-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
+        for (int i = 0; i < nc; i++) {
+            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
         }
     }
 }

-static void ggml_compute_forward_alibi(
+static void ggml_compute_forward_clamp(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {

     const struct ggml_tensor * src0 = dst->src[0];

     switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, dst);
-            } break;
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_alibi_f32(params, dst);
+                ggml_compute_forward_clamp_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
         case GGML_TYPE_BF16:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -13750,102 +14208,12 @@ static void ggml_compute_forward_alibi(
     }
 }

-// ggml_compute_forward_clamp
-
-static void ggml_compute_forward_clamp_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
+// ggml_compute_forward_rope

-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    float min;
-    float max;
-    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    for (int j = ith; j < n; j += nth) {
-        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
-        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
-
-        for (int i = 0; i < nc; i++) {
-            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
-        }
-    }
-}
-
-static void ggml_compute_forward_clamp(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_clamp_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_rope
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
-    return 1 - MIN(1, MAX(0, y));
-}
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+    return 1 - MIN(1, MAX(0, y));
+}

 // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
 // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
@@ -13905,6 +14273,7 @@ static void ggml_compute_forward_rope_f32(

     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];

     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -13964,6 +14333,17 @@ static void ggml_compute_forward_rope_f32(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;

+    const float * freq_factors = NULL;
+    if (is_neox) {
+        if (src2 != NULL) {
+            GGML_ASSERT(src2->type == GGML_TYPE_F32);
+            GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+    }
+
     // backward process uses inverse rotation by cos and sin.
     // cos and sin build a rotation matrix, where the inverse is the transpose.
     // this essentially just switches the sign of sin.
@@ -14040,10 +14420,11 @@ static void ggml_compute_forward_rope_f32(

                         // simplified from `(ib * n_dims + ic) * inv_ndims`
                         float cur_rot = inv_ndims * ic - ib;
+                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;

                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                            theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
                         sin_theta *= sin_sign;
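Note: when src2 ("freq_factors") is present, each rotation pair's base angle is divided by its per-frequency factor before the YaRN correction is applied. An illustrative computation (the values here are made up):

    const float theta_base  = 10000.0f; // example accumulated angle for this pair
    const float freq_factor = 4.0f;     // e.g. a long-context frequency stretch
    const float theta       = theta_base / freq_factor; // what rope_yarn() now receives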
@@ -14076,6 +14457,7 @@ static void ggml_compute_forward_rope_f32(
     }
 }

+// TODO: deduplicate f16/f32 code
 static void ggml_compute_forward_rope_f16(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst,
@@ -14083,6 +14465,7 @@ static void ggml_compute_forward_rope_f16(

     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];

     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -14135,6 +14518,17 @@ static void ggml_compute_forward_rope_f16(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;

+    const float * freq_factors = NULL;
+    if (is_neox) {
+        if (src2 != NULL) {
+            GGML_ASSERT(src2->type == GGML_TYPE_F32);
+            GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+    }
+
     // backward process uses inverse rotation by cos and sin.
     // cos and sin build a rotation matrix, where the inverse is the transpose.
     // this essentially just switches the sign of sin.
@@ -14207,10 +14601,11 @@ static void ggml_compute_forward_rope_f16(

                         // simplified from `(ib * n_dims + ic) * inv_ndims`
                         float cur_rot = inv_ndims * ic - ib;
+                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;

                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                            theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
                         sin_theta *= sin_sign;
@@ -14972,25 +15367,28 @@ static void ggml_compute_forward_upscale_f32(
         return;
     }

-    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

     const int ith = params->ith;
     const int nth = params->nth;

     GGML_TENSOR_UNARY_OP_LOCALS

-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];

     // TODO: optimize

     for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
+        const int64_t i03 = i3 / sf3;
         for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
+            const int64_t i02 = i2 / sf2;
             for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / scale_factor;
+                const int64_t i01 = i1 / sf1;
                 for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / scale_factor;
+                    const int64_t i00 = i0 / sf0;

                     const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                     float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
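Note: the upscale kernel now derives one scale factor per axis from the dst/src shapes instead of a single integer op param, so non-uniform and fractional upscaling fall out naturally. For one axis, the nearest-neighbour mapping looks like this (a sketch; width 3 upscaled to 8):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        const float sf0 = 8.0f / 3.0f;               // dst width / src width
        for (int64_t i0 = 0; i0 < 8; ++i0) {
            const int64_t i00 = (int64_t)(i0 / sf0); // 0 0 0 1 1 1 2 2
            printf("dst %lld <- src %lld\n", (long long) i0, (long long) i00);
        }
        return 0;
    }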
@@ -15020,6 +15418,7 @@ static void ggml_compute_forward_upscale(
     }
 }

+
 // ggml_compute_forward_pad

 static void ggml_compute_forward_pad_f32(
@@ -15200,487 +15599,42 @@ static void ggml_compute_forward_argsort_f32(
     const int ith = params->ith;
     const int nth = params->nth;

-    const int64_t nr = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
-
-    for (int64_t i = ith; i < nr; i += nth) {
-        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
-        const float * src_data = (float *)((char *) src0->data + i*nb01);
-
-        for (int64_t j = 0; j < ne0; j++) {
-            dst_data[j] = j;
-        }
-
-        // C doesn't have a functional sort, so we do a bubble sort instead
-        for (int64_t j = 0; j < ne0; j++) {
-            for (int64_t k = j + 1; k < ne0; k++) {
-                if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-                        (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-                    int32_t tmp = dst_data[j];
-                    dst_data[j] = dst_data[k];
-                    dst_data[k] = tmp;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_argsort(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_argsort_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_attn
-
-static void ggml_compute_forward_flash_attn_f32(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
-        for (int64_t ic = 0; ic < masked_begin; ++ic) {
-            // k indices
-            const int ik3 = iq3;
-            const int ik2 = iq2 % nek2;
-            const int ik1 = ic;
-
-            // S indices
-            const int i1 = ik1;
-
-            ggml_vec_dot_f32(neq0,
-                    S + i1, 0,
-                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-        }
-
-        // scale
-        ggml_vec_scale_f32(masked_begin, S, scale);
-
-        for (int64_t i = masked_begin; i < M; i++) {
-            S[i] = -INFINITY;
-        }
-
-        // softmax
-        // exclude known -INF S[..] values from max and loop
-        // dont forget to set their SW values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(masked_begin, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SS[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(masked_begin, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < masked_begin; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        for (int64_t ic = 0; ic < nev1; ++ic) {
-            // dst indices
-            const int i1 = iq1;
-            const int i2 = iq2;
-            const int i3 = iq3;
-
-            // v indices
-            const int iv2 = iq2 % nev2;
-            const int iv3 = iq3;
-
-            ggml_vec_dot_f32(masked_begin,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
-                    (float *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                    S, 0, 1);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_f16(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-            for (int64_t ic = 0; ic < nek1; ++ic) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16(neq0,
-                        S + i1, 0,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16_unroll(neq0, nbk1,
-                        S + i1,
-                        ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-            }
-        }
-
-        // scale
-        ggml_vec_scale_f32(nek1, S, scale);
-
-        if (masked) {
-            for (int64_t i = P; i < M; i++) {
-                if (i > P + iq1) {
-                    S[i] = -INFINITY;
-                }
-            }
-        }
-
-        // softmax
-        // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
-        // dont forget to set their S values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(M, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(M, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < M; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
+    const int64_t nr = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const float * src_data = (float *)((char *) src0->data + i*nb01);

-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
         }

-        // todo: exclude known zero S[..] values from dot product
-        if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
-            for (int64_t ic = 0; ic < nev1; ++ic) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16(nev0,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                        S16, 0, 1);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16_unroll(nev0, nbv1,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                        ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S16);
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                        (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
             }
         }
     }
 }

-static void ggml_compute_forward_flash_attn(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
+static void ggml_compute_forward_argsort(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {

-    const struct ggml_tensor * q = dst->src[0];
+    const struct ggml_tensor * src0 = dst->src[0];

-    switch (q->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_attn_f16(params, masked, dst);
-            } break;
+    switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_flash_attn_f32(params, masked, dst);
+                ggml_compute_forward_argsort_f32(params, dst);
             } break;
         default:
             {
@@ -15719,9 +15673,10 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne2 == N);

-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));

     GGML_ASSERT(neq0 == D);
     GGML_ASSERT(nek0 == D);
@@ -15763,8 +15718,22 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    float scale = 1.0f;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    enum ggml_type    const k_vec_dot_type = type_traits[k->type].vec_dot_type;
+    ggml_from_float_t const q_to_vec_dot   = type_traits[k_vec_dot_type].from_float;
+    ggml_vec_dot_t    const kq_vec_dot     = type_traits[k->type].vec_dot;
+    ggml_to_float_t   const v_to_float     = type_traits[v->type].to_float;
 
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
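The four function pointers pulled from type_traits are what let this kernel consume quantized K and V directly: Q is converted once into the format K's dot kernel expects (k_vec_dot_type), K rows are consumed in place by kq_vec_dot, and V rows are expanded to F32 on demand by v_to_float. A rough sketch of the dispatch pattern under assumed, simplified signatures (the real trait tables and typedefs live elsewhere in ggml):

    #include <stdint.h>

    // simplified stand-ins for ggml's per-type trait table
    typedef void (*from_float_t)(const float * x, void * y, int64_t n);
    typedef void (*vec_dot_t)   (int n, float * s, const void * x, const void * y);

    struct type_ops {
        int          vec_dot_type; // format the second vec_dot operand must use
        from_float_t from_float;   // convert one F32 row into this format
        vec_dot_t    vec_dot;      // dot product against a row in vec_dot_type format
    };

    // dot an F32 query row against a K row stored in K's native (possibly
    // quantized) format; q_scratch must fit the converted row
    static float row_dot(const struct type_ops * traits, int k_type,
                         const float * q, const void * k_row,
                         void * q_scratch, int n) {
        const struct type_ops * kt = &traits[k_type];
        // convert Q into the format K's vec_dot expects; in the kernel above
        // this conversion is hoisted out of the loop over K rows
        traits[kt->vec_dot_type].from_float(q, q_scratch, n);
        float s;
        kt->vec_dot(n, &s, k_row, q_scratch);
        return s;
    }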
@@ -15773,14 +15742,22 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-        float S = 0.0f;
-        float M = -INFINITY;
+        const uint32_t h = iq2; // head index
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+        float S = 0.0f;      // sum
+        float M = -INFINITY; // maximum KQ value
 
-        float       * V32 = (float *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
-        ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32 + 0*D); // reuse memory
-        ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + 1*D);
+        float       * VKQ32 = (float *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator
+        float       * V32   =                 (VKQ32 + 1*D); // (temporary) FP32 V buffer
+        ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator
+        ggml_fp16_t * Q_q   = (ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16
 
-        memset(V16, 0, D*sizeof(ggml_fp16_t));
+        if (v->type == GGML_TYPE_F16) {
+            memset(VKQ16, 0, D*sizeof(ggml_fp16_t));
+        } else {
+            memset(VKQ32, 0, D*sizeof(float));
+        }
 
         const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
 
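The per-head slope implements ALiBi's geometric bias schedule: the first n_head_log2 heads step down by the ratio m0 = 2^(-max_bias/n_head_log2), and any remaining heads interleave between them using m1 = 2^(-max_bias/2/n_head_log2) with odd exponents. A small worked example of the formula above (the values are picked for illustration):

    #include <math.h>
    #include <stdio.h>

    // per-head ALiBi slope, matching the expression in the hunk above
    static float alibi_slope(float max_bias, unsigned n_head_log2,
                             float m0, float m1, unsigned h) {
        if (max_bias <= 0.0f) return 1.0f;
        return h < n_head_log2 ? powf(m0, h + 1)
                               : powf(m1, 2*(h - n_head_log2) + 1);
    }

    int main(void) {
        const float    max_bias    = 8.0f;
        const unsigned n_head      = 12;
        const unsigned n_head_log2 = 1u << (unsigned) floor(log2(n_head)); // 8
        const float    m0 = powf(2.0f, -(max_bias       ) / n_head_log2); // 2^-1   = 0.5
        const float    m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); // 2^-0.5 ~ 0.707
        for (unsigned h = 0; h < n_head; ++h) {
            printf("head %2u: slope %.4f\n", h,
                   alibi_slope(max_bias, n_head_log2, m0, m1, h));
        }
        return 0;
    }

With these numbers, heads 0..7 get slopes 0.5, 0.25, ..., 2^-8, and heads 8..11 get 0.707^1, 0.707^3, 0.707^5, 0.707^7, which fall between the values of the first group.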
@@ -15792,61 +15769,79 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iv3 = iq3 / rv3;
         const int iv2 = iq2 / rv2;
 
+        const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+        q_to_vec_dot(pq, Q_q, D);
+
         // online softmax / attention
         // loop over n_kv and n_head_kv
         // ref: https://arxiv.org/pdf/2112.05682.pdf
         for (int64_t ic = 0; ic < nek1; ++ic) {
-            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
             }
 
-            float s;
+            float s; // KQ value
 
-            // convert Q to F16
-            {
-                const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+            const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
+            kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1);
 
-                for (int64_t d = 0; d < D; ++d) {
-                    Q16[d] = GGML_FP32_TO_FP16(pq[d]);
-                }
-            }
+            s = s*scale + mv; // scale KQ value and apply mask
 
-            ggml_vec_dot_f16(D,
-                    &s, 0,
-                    (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                    Q16, 0, 1);
+            const float Mold = M;
 
-            s = s*scale + mv;
+            float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
+            float vs = 1.0f; // post-softmax KQ value, expf(s - M)
 
-            const float Mold = M;
+            const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
 
-            float ms = 1.0f;
-            float vs = 1.0f;
+            if (v->type == GGML_TYPE_F16) {
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
 
-            if (s > M) {
-                M = s;
-                ms = expf(Mold - M);
-
-                // V = V*expf(Mold - M)
-                ggml_vec_scale_f16(D, V16, ms);
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f16(D, VKQ16, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
 
+                // V += v*expf(s - M)
+                ggml_vec_mad_f16(D, VKQ16, (const ggml_fp16_t *) v_data, vs);
             } else {
-                vs = expf(s - M);
-            }
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
 
-            const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f32(D, VKQ32, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
 
-            // V += v*expf(s - M)
-            ggml_vec_mad_f16(D, V16, v16, vs);
+                v_to_float(v_data, V32, D);
 
-            S = S*ms + vs;
+                // V += v*expf(s - M)
+                ggml_vec_mad_f32(D, VKQ32, V32, vs);
+            }
 
+            S = S*ms + vs; // scale and increment sum with partial sum
         }
 
-        // V /= S
-        for (int64_t d = 0; d < D; ++d) {
-            V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
-        }
+        if (v->type == GGML_TYPE_F16) {
+            for (int64_t d = 0; d < D; ++d) {
+                VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
+            }
+        }
+
+        // V /= S
+        const float S_inv = 1.0f/S;
+        ggml_vec_scale_f32(D, VKQ32, S_inv);
 
         // dst indices
         const int i1 = iq1;
         const int i2 = iq2;
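The inner loop above is the streaming ("online") softmax from the referenced paper: a single pass over the KV positions keeps a running maximum M and running sum S, and rescales the accumulator by expf(Mold - M) whenever a larger score arrives, so the full logits row is never materialized. A scalar sketch of the same recurrence (illustrative names, one value per position instead of a D-wide row):

    #include <math.h>

    // one-pass softmax-weighted average: returns sum_i softmax(s)_i * v[i],
    // using the rescaling trick from the online-softmax loop above
    static float online_softmax_avg(const float * s, const float * v, int n) {
        float M   = -INFINITY; // running maximum of the scores
        float S   = 0.0f;      // running sum of expf(s_i - M)
        float acc = 0.0f;      // running weighted sum, scaled by expf(-M)

        for (int i = 0; i < n; ++i) {
            float ms = 1.0f;   // rescale factor when a new maximum appears
            float vs = 1.0f;   // expf(s[i] - M) for the current element
            if (s[i] > M) {
                ms   = expf(M - s[i]); // exp(Mold - Mnew)
                M    = s[i];
                acc *= ms;     // re-normalize everything accumulated so far
            } else {
                vs = expf(s[i] - M);
            }
            acc += vs * v[i];
            S = S*ms + vs;
        }
        return acc / S;        // final normalization, as in "V /= S" above
    }

The F16 and F32 branches in the kernel apply exactly this recurrence with ggml_vec_scale_* and ggml_vec_mad_* over D-element accumulators.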
@@ -15856,7 +15851,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
 
         // permute(0, 2, 1, 3)
-        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
     }
 }
@@ -15867,7 +15862,7 @@ static void ggml_compute_forward_flash_attn_ext(
         const struct ggml_tensor * v,
         const struct ggml_tensor * mask,
         struct ggml_tensor * dst) {
-    switch (dst->op_params[1]) {
+    switch (dst->op_params[2]) {
         case GGML_PREC_DEFAULT:
         case GGML_PREC_F32:
             {
@@ -15881,165 +15876,6 @@ static void ggml_compute_forward_flash_attn_ext(
     }
 }
 
-// ggml_compute_forward_flash_ff
-
-static void ggml_compute_forward_flash_ff_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a  = dst->src[0]; // F16
-    const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
-    const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
-    const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
-    const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, nea,  a,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nba,  a,   nb)
-    GGML_TENSOR_LOCALS(int64_t, neb0, b0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb0, b0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, neb1, b1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb1, b1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec0, c0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc0, c0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec1, c1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc1, c1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,   dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,   dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = nea0;
-    //const int64_t N = nea1;
-    const int64_t M = neb01;
-
-    GGML_ASSERT(ne0 == nea0);
-    GGML_ASSERT(ne1 == nea1);
-    GGML_ASSERT(ne2 == nea2);
-
-    GGML_ASSERT(nba0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb10 == sizeof(float));
-    GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbc10 == sizeof(float));
-
-    GGML_ASSERT(neb00 == D);
-    GGML_ASSERT(neb01 == M);
-    GGML_ASSERT(neb10 == M);
-    GGML_ASSERT(neb11 == 1);
-
-    GGML_ASSERT(nec00 == M);
-    GGML_ASSERT(nec01 == D);
-    GGML_ASSERT(nec10 == D);
-    GGML_ASSERT(nec11 == 1);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by a rows using ggml_vec_dot_f32
-
-    // total rows in a
-    const int nr = nea1*nea2*nea3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // a indices
-        const int ia3 = ir/(nea2*nea1);
-        const int ia2 = (ir - ia3*nea2*nea1)/nea1;
-        const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
-
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
-
-        for (int64_t ic = 0; ic < neb01; ++ic) {
-            // b0 indices
-            const int ib03 = ia3;
-            const int ib02 = ia2;
-            const int ib01 = ic;
-
-            // S indices
-            const int i1 = ib01;
-
-            ggml_vec_dot_f16(nea0,
-                    S + i1, 0,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
-                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)), 0, 1);
-        }
-
-        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
-        //ggml_vec_gelu_f32(neb01, S, S);
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        ggml_vec_gelu_f16(neb01, S16, S16);
-
-        {
-            // dst indices
-            const int i1 = ia1;
-            const int i2 = ia2;
-            const int i3 = ia3;
-
-            for (int64_t ic = 0; ic < nec01; ++ic) {
-                ggml_vec_dot_f16(neb01,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
-                        S16, 0, 1);
-            }
-
-            ggml_vec_add_f32(nec01,
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) c1->data);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_ff(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * b0 = dst->src[1];
-
-    switch (b0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_ff_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false); // TODO
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_flash_attn_back
 
 static void ggml_compute_forward_flash_attn_back_f32(
@@ -16221,38 +16057,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                     vvexpf(SM, SM, &Mup);
                     ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-                    uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                    ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                    for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                        if (i >= masked_begin) {
-                            break;
-                        }
-                        float * SR =  S + i;
-                        float * SW = SM + i;
-
-                        for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                            if (i + j >= masked_begin) {
-                                break;
-                            } else if (SR[j] == -INFINITY) {
-                                SW[j] = 0.0f;
-                            } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                                const float val = expf(SR[j] - max);
-#else
-                                ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                                memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                                sump[j] += (ggml_float)val;
-                                SW[j] = val;
-                            }
-                        }
-                    }
-
-                    for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                        sum += sump[i];
-                    }
+                    sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
 #endif
                 }
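The same replacement lands in the two cross-entropy kernels further down: three hand-rolled exp-and-sum loops collapse into one shared helper. Judging from its call sites (sum = ggml_vec_soft_max_f32(n, y, x, max)), the helper plausibly writes y[i] = expf(x[i] - max) and returns the accumulated sum; a scalar reference under that assumption (the real implementation lives in ggml's vector ops and is likely SIMD-accelerated):

    #include <math.h>

    // assumed behavior of ggml_vec_soft_max_f32: y[i] = expf(x[i] - max),
    // returning the sum of all y[i] in double (as ggml_float usually is)
    static double vec_soft_max_f32_ref(const int n, float * y,
                                       const float * x, const float max) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            const float val = expf(x[i] - max);
            y[i] = val;
            sum += (double) val;
        }
        return sum;
    }

Note that expf(-INFINITY - max) evaluates to 0.0f, so masked positions fall out naturally and the old explicit -INFINITY branch is no longer needed.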
@@ -16834,6 +16639,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_relu(params, dst);
             } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
         case GGML_UNARY_OP_GELU:
             {
                 ggml_compute_forward_gelu(params, dst);
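The sigmoid unary op is new in this release; the forward kernel itself is wired up elsewhere in this patch and is not shown in this hunk. As an elementwise sketch of what such a kernel computes (names here are illustrative):

    #include <math.h>

    // logistic sigmoid over one contiguous row; the real kernel applies
    // this per row with the usual ith/nth thread split
    static void vec_sigmoid_f32(const int n, float * y, const float * x) {
        for (int i = 0; i < n; ++i) {
            y[i] = 1.0f / (1.0f + expf(-x[i]));
        }
    }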
@@ -17274,35 +17083,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             assert(!isnan(s1[i]));
         }
 #endif
-        // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
 
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                const float s = s0[i] - max;
-                const float val = expf(s);
-#else
-                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                sum += (ggml_float)val;
-                st[i] = val;
-            }
-        }
+        // soft_max
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
+        assert(sum > 0.0);
+        sum = (1.0 - eps) / sum;
 
-        assert(sum > 0.0);
-        // sum = 1.0/sum;
-        }
         // avoid log(0) by rescaling from [0..1] to [eps..1]
-        sum = (1.0 - eps) / sum;
         ggml_vec_scale_f32(nc, st, sum);
         ggml_vec_add1_f32(nc, st, st, eps);
         ggml_vec_log_f32(nc, st, st);
@@ -17392,32 +17181,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #endif
 
     // soft_max
-    ggml_float sum = 0.0;
-    {
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, s0);
-
-        uint16_t scvt; UNUSED(scvt);
-        for (int i = 0; i < nc; i++) {
-            if (s0[i] == -INFINITY) {
-                ds0[i] = 0.0f;
-            } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                const float s = s0[i] - max;
-                const float val = expf(s);
-#else
-                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                sum += (ggml_float)val;
-                ds0[i] = val;
-            }
-        }
-
-        assert(sum > 0.0);
-        sum = (1.0 - eps)/sum;
-    }
+    float max = -INFINITY;
+    ggml_vec_max_f32(nc, &max, s0);
+    ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
+    assert(sum > 0.0);
+    sum = (1.0 - eps) / sum;
 
     // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
     ggml_vec_scale_f32(nc, ds0, sum);
@@ -17454,7 +17222,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 
 /////////////////////////////////
 
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
     GGML_ASSERT(params);
 
     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17552,7 +17320,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor);
+                ggml_compute_forward_mul_mat(params, tensor, state);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -17630,10 +17398,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope_back(params, tensor);
             } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
         case GGML_OP_CLAMP:
             {
                 ggml_compute_forward_clamp(params, tensor);
@@ -17682,21 +17446,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_leaky_relu(params, tensor);
             } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                const int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                const bool masked = t != 0;
-                ggml_compute_forward_flash_attn(params, masked, tensor);
-            } break;
         case GGML_OP_FLASH_ATTN_EXT:
             {
                 ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
             } break;
-        case GGML_OP_FLASH_FF:
-            {
-                ggml_compute_forward_flash_ff(params, tensor);
-            } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
                 int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -18066,6 +17819,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg
 static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
     struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
+    struct ggml_tensor * src2 = tensor->src[2];
 
     switch (tensor->op) {
         case GGML_OP_DUP:
@@ -18597,6 +18351,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_rope_back(ctx,
                                 tensor->grad,
                                 src1,
+                                src2,
                                 n_dims,
                                 mode,
                                 n_ctx,
@@ -18636,6 +18391,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_rope_impl(ctx,
                                 tensor->grad,
                                 src1,
+                                src2,
                                 n_dims,
                                 mode,
                                 n_ctx,
@@ -18652,10 +18408,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_CLAMP:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -18704,7 +18456,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_FLASH_ATTN:
         case GGML_OP_FLASH_ATTN_EXT:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -18721,7 +18472,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             masked);
                 }
 
-                struct ggml_tensor * src2 = tensor->src[2];
                 const int64_t elem_q = ggml_nelements(src0);
                 const int64_t elem_k = ggml_nelements(src1);
                 const int64_t elem_v = ggml_nelements(src2);
@@ -18759,10 +18509,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
-        case GGML_OP_FLASH_FF:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
                 GGML_ASSERT(false); // not supported
@@ -18826,6 +18572,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             zero_table);
                 }
             } break;
+            case GGML_UNARY_OP_SIGMOID:
+                {
+                    GGML_ASSERT(false); // TODO: not implemented
+                } break;
             case GGML_UNARY_OP_GELU:
                 {
                     GGML_ASSERT(false); // TODO: not implemented
@@ -19172,8 +18922,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19199,8 +18947,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19280,31 +19026,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan  * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
-
-    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-    enum ggml_status ec;
-};
-
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
     int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
     int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19355,6 +19076,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
                 case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                     {
@@ -19428,10 +19150,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
            } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
         case GGML_OP_CLAMP:
             {
                 n_tasks = 1; //TODO
@@ -19477,15 +19195,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_FLASH_ATTN:
         case GGML_OP_FLASH_ATTN_EXT:
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_FLASH_FF:
-            {
-                n_tasks = n_threads;
-            } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
                 n_tasks = n_threads;
@@ -19580,6 +19293,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
         * node_n = atomic_load(&state->shared->node_n);
         if (* node_n != last_node_n) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning. It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
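The same PAUSE hint is added to the task-phase spin loop in the next hunk. The pattern in isolation: spin on an atomic until it changes, telling the core it is busy-waiting so it can relax its pipeline and yield resources to an SMT sibling. A sketch (ggml's loops wrap this in extra logic, including an occasional yield, so this is just the core idea):

    #include <stdatomic.h>
    #if defined(__SSE3__)
    #include <immintrin.h>
    #endif

    // spin until *flag changes from `last`, hinting the CPU that this is a
    // busy-wait; PAUSE reduces power and pipeline-flush costs while spinning
    static int spin_wait_change(atomic_int * flag, int last) {
        int cur;
        while ((cur = atomic_load(flag)) == last) {
    #if defined(__SSE3__)
            _mm_pause();
    #endif
        }
        return cur;
    }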
@@ -19594,6 +19311,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
 
         * task_phase = atomic_load(&state->shared->node_task);
         if (* task_phase != last_task_phase) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning. It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
@@ -19633,7 +19354,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);
         }
@@ -19653,17 +19374,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /* INIT */
             if (GGML_OP_HAS_INIT[node->op]) {
                 params.type = GGML_TASK_TYPE_INIT;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
 
             // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
             // they do something more efficient than spinning (?)
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
 
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.type = GGML_TASK_TYPE_FINALIZE;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
 
             ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19702,7 +19423,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
         }
 
@@ -19723,7 +19444,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
         }
 
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19874,39 +19595,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                     cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
                 } break;
-            case GGML_OP_FLASH_ATTN:
-                {
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    }
-                } break;
             case GGML_OP_FLASH_ATTN_EXT:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // D
 
-                    cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size/thread
-                } break;
-            case GGML_OP_FLASH_FF:
-                {
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    }
+                    cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
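The new work-size formula matches the per-thread scratch layout carved out of wdata by the kernel earlier in this patch: a D-wide FP32 VKQ accumulator plus the scratch buffers aliased at VKQ32 + 1*D and VKQ32 + 2*D, i.e. three FP32 head-rows per thread where the old kernel reserved two. For a concrete feel for the numbers (values picked for illustration):

    #include <stdio.h>

    int main(void) {
        const long D       = 128; // head size (ne00)
        const long n_tasks = 8;   // threads
        // three FP32 buffers of head size per thread, as in the plan above
        const long cur = 3 * sizeof(float) * D * n_tasks;
        printf("%ld bytes (%.1f KiB)\n", cur, cur / 1024.0); // 12288 bytes (12.0 KiB)
        return 0;
    }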
@@ -19974,6 +19667,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.node_task           =*/ GGML_TASK_TYPE_FINALIZE,
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
+        /*.current_chunk       =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
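current_chunk is the shared counter behind the new state argument threaded through ggml_compute_forward into ggml_compute_forward_mul_mat: instead of a fixed per-thread row split, threads claim chunks of the output from an atomic counter, so a thread that finishes early simply picks up more work. A minimal sketch of that scheduling idea (the names and chunking policy here are illustrative, not ggml's exact code):

    #include <stdatomic.h>
    #include <stdint.h>

    // illustrative work item: process rows [i0, i1) of some shared problem
    static void process_rows(int64_t i0, int64_t i1) { (void) i0; (void) i1; }

    // each worker claims whole chunks from a shared atomic counter until the
    // row range is exhausted; fast threads naturally claim more chunks
    static void worker(atomic_int * current_chunk, int64_t nr, int64_t chunk_size) {
        for (;;) {
            const int64_t chunk = atomic_fetch_add(current_chunk, 1);
            const int64_t i0    = chunk*chunk_size;
            if (i0 >= nr) {
                break;
            }
            const int64_t i1 = (i0 + chunk_size < nr) ? i0 + chunk_size : nr;
            process_rows(i0, i1);
        }
    }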
@@ -21747,11 +21441,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_S:  result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_M:  result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#if QK_K == 64
-        case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#else
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#endif
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -23028,6 +22718,14 @@ int ggml_cpu_has_avx512_vnni(void) {
 #endif
 }
 
+int ggml_cpu_has_avx512_bf16(void) {
+#if defined(__AVX512BF16__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
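Like the other ggml_cpu_has_* predicates around it, the new helper reports how the library was compiled rather than probing the running CPU. Assuming it is exported from ggml.h like its siblings, usage is simply:

    #include <stdio.h>

    // compile-time feature flags surfaced as runtime-queryable functions
    extern int ggml_cpu_has_avx512_bf16(void);
    extern int ggml_cpu_has_avx512_vnni(void);

    int main(void) {
        printf("AVX512-BF16: %d\n", ggml_cpu_has_avx512_bf16());
        printf("AVX512-VNNI: %d\n", ggml_cpu_has_avx512_vnni());
        return 0;
    }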