llama_cpp 0.15.1 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
#include "ggml-impl.h"
|
5
5
|
#include "ggml-quants.h"
|
6
6
|
#include "ggml.h"
|
7
|
-
#include "sgemm.h"
|
8
7
|
|
9
8
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
10
9
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
@@ -37,6 +36,10 @@
|
|
37
36
|
#undef GGML_USE_LLAMAFILE
|
38
37
|
#endif
|
39
38
|
|
39
|
+
#ifdef GGML_USE_LLAMAFILE
|
40
|
+
#include "sgemm.h"
|
41
|
+
#endif
|
42
|
+
|
40
43
|
#if defined(_MSC_VER)
|
41
44
|
// disable "possible loss of data" to avoid hundreds of casts
|
42
45
|
// we should just be careful :)
|
@@ -109,6 +112,8 @@ typedef void * thread_ret_t;
|
|
109
112
|
|
110
113
|
#endif
|
111
114
|
|
115
|
+
typedef pthread_t ggml_thread_t;
|
116
|
+
|
112
117
|
#ifdef GGML_USE_CPU_HBM
|
113
118
|
#include <hbwmalloc.h>
|
114
119
|
#endif
|
@@ -160,9 +165,6 @@ void ggml_print_backtrace(void) {
|
|
160
165
|
#define GGML_DEBUG 0
|
161
166
|
#define GGML_GELU_FP16
|
162
167
|
#define GGML_GELU_QUICK_FP16
|
163
|
-
#define GGML_SILU_FP16
|
164
|
-
// #define GGML_CROSS_ENTROPY_EXP_FP16
|
165
|
-
// #define GGML_FLASH_ATTN_EXP_FP16
|
166
168
|
|
167
169
|
#define GGML_SOFT_MAX_UNROLL 4
|
168
170
|
#define GGML_VEC_DOT_UNROLL 2
|
@@ -313,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
|
313
315
|
// precomputed quick gelu table for f16 (128 KB)
|
314
316
|
static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
315
317
|
|
316
|
-
// precomputed silu table for f16 (128 KB)
|
317
|
-
static ggml_fp16_t ggml_table_silu_f16[1 << 16];
|
318
|
-
|
319
|
-
// precomputed exp table for f16 (128 KB)
|
320
|
-
static ggml_fp16_t ggml_table_exp_f16[1 << 16];
|
321
|
-
|
322
318
|
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
323
319
|
float ggml_table_f32_f16[1 << 16];
|
324
320
|
|
@@ -410,10 +406,10 @@ void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
|
|
410
406
|
int i = 0;
|
411
407
|
#if defined(__AVX512BF16__)
|
412
408
|
for (; i + 32 <= n; i += 32) {
|
413
|
-
|
414
|
-
(
|
415
|
-
(
|
416
|
-
|
409
|
+
_mm512_storeu_si512(
|
410
|
+
(__m512i *)(y + i),
|
411
|
+
m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
|
412
|
+
_mm512_loadu_ps(x + i))));
|
417
413
|
}
|
418
414
|
#endif
|
419
415
|
for (; i < n; i++) {
|
@@ -875,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
875
871
|
},
|
876
872
|
[GGML_TYPE_IQ4_XS] = {
|
877
873
|
.type_name = "iq4_xs",
|
878
|
-
#if QK_K == 64
|
879
|
-
.blck_size = QK4_NL,
|
880
|
-
#else
|
881
874
|
.blck_size = QK_K,
|
882
|
-
#endif
|
883
875
|
.type_size = sizeof(block_iq4_xs),
|
884
876
|
.is_quantized = true,
|
885
877
|
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
886
878
|
.from_float = quantize_row_iq4_xs,
|
887
879
|
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
888
880
|
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
889
|
-
#if QK_K == 64
|
890
|
-
.vec_dot_type = GGML_TYPE_Q8_0,
|
891
|
-
#else
|
892
881
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
893
|
-
#endif
|
894
882
|
.nrows = 1,
|
895
883
|
},
|
896
884
|
[GGML_TYPE_Q8_K] = {
|
@@ -1303,6 +1291,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
1303
1291
|
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
|
1304
1292
|
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
|
1305
1293
|
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
|
1294
|
+
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
|
1295
|
+
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
|
1306
1296
|
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
1307
1297
|
// Use vec_xl, not vec_ld, in case the load address is not aligned.
|
1308
1298
|
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
|
@@ -1525,6 +1515,195 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
|
1525
1515
|
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
1526
1516
|
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
1527
1517
|
|
1518
|
+
#elif defined(__loongarch_asx)
|
1519
|
+
|
1520
|
+
#define GGML_SIMD
|
1521
|
+
|
1522
|
+
// F32 LASX
|
1523
|
+
#define GGML_F32_STEP 32
|
1524
|
+
#define GGML_F32_EPR 8
|
1525
|
+
|
1526
|
+
#define GGML_F32x8 __m256
|
1527
|
+
#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
|
1528
|
+
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
|
1529
|
+
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
|
1530
|
+
#define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
|
1531
|
+
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
|
1532
|
+
#define GGML_F32x8_ADD __lasx_xvfadd_s
|
1533
|
+
#define GGML_F32x8_MUL __lasx_xvfmul_s
|
1534
|
+
#define GGML_F32x8_REDUCE(res, x) \
|
1535
|
+
do { \
|
1536
|
+
int offset = GGML_F32_ARR >> 1; \
|
1537
|
+
for (int i = 0; i < offset; ++i) { \
|
1538
|
+
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
1539
|
+
} \
|
1540
|
+
offset >>= 1; \
|
1541
|
+
for (int i = 0; i < offset; ++i) { \
|
1542
|
+
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
1543
|
+
} \
|
1544
|
+
offset >>= 1; \
|
1545
|
+
for (int i = 0; i < offset; ++i) { \
|
1546
|
+
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
1547
|
+
} \
|
1548
|
+
float *tmp_p = (float *)&x[0]; \
|
1549
|
+
res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
|
1550
|
+
} while (0)
|
1551
|
+
// TODO: is this optimal ?
|
1552
|
+
|
1553
|
+
#define GGML_F32_VEC GGML_F32x8
|
1554
|
+
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
|
1555
|
+
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
|
1556
|
+
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
|
1557
|
+
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
|
1558
|
+
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
|
1559
|
+
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
|
1560
|
+
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
|
1561
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
|
1562
|
+
|
1563
|
+
// F16 LASX
|
1564
|
+
|
1565
|
+
#define GGML_F16_STEP 32
|
1566
|
+
#define GGML_F16_EPR 8
|
1567
|
+
|
1568
|
+
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
1569
|
+
|
1570
|
+
#define GGML_F32Cx8 __m256
|
1571
|
+
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
1572
|
+
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
|
1573
|
+
|
1574
|
+
static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
|
1575
|
+
float tmp[8];
|
1576
|
+
|
1577
|
+
for (int i = 0; i < 8; i++) {
|
1578
|
+
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
1579
|
+
}
|
1580
|
+
|
1581
|
+
return (__m256)__lasx_xvld(tmp, 0);
|
1582
|
+
}
|
1583
|
+
static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
1584
|
+
float arr[8];
|
1585
|
+
|
1586
|
+
__lasx_xvst(y, arr, 0);
|
1587
|
+
|
1588
|
+
for (int i = 0; i < 8; i++)
|
1589
|
+
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
1590
|
+
}
|
1591
|
+
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
|
1592
|
+
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
|
1593
|
+
|
1594
|
+
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
1595
|
+
#define GGML_F32Cx8_ADD __lasx_xvfadd_s
|
1596
|
+
#define GGML_F32Cx8_MUL __lasx_xvfmul_s
|
1597
|
+
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
|
1598
|
+
|
1599
|
+
#define GGML_F16_VEC GGML_F32Cx8
|
1600
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
|
1601
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
|
1602
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
|
1603
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
|
1604
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
|
1605
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
|
1606
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
|
1607
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
|
1608
|
+
|
1609
|
+
#elif defined(__loongarch_sx)
|
1610
|
+
|
1611
|
+
#define GGML_SIMD
|
1612
|
+
|
1613
|
+
// F32 LSX
|
1614
|
+
|
1615
|
+
#define GGML_F32_STEP 32
|
1616
|
+
#define GGML_F32_EPR 4
|
1617
|
+
|
1618
|
+
#define GGML_F32x4 __m128
|
1619
|
+
#define GGML_F32x4_ZERO __lsx_vldi(0)
|
1620
|
+
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
1621
|
+
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
|
1622
|
+
#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
|
1623
|
+
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
|
1624
|
+
#define GGML_F32x4_ADD __lsx_vfadd_s
|
1625
|
+
#define GGML_F32x4_MUL __lsx_vfmul_s
|
1626
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
1627
|
+
{ \
|
1628
|
+
int offset = GGML_F32_ARR >> 1; \
|
1629
|
+
for (int i = 0; i < offset; ++i) { \
|
1630
|
+
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
|
1631
|
+
} \
|
1632
|
+
offset >>= 1; \
|
1633
|
+
for (int i = 0; i < offset; ++i) { \
|
1634
|
+
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
|
1635
|
+
} \
|
1636
|
+
offset >>= 1; \
|
1637
|
+
for (int i = 0; i < offset; ++i) { \
|
1638
|
+
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
|
1639
|
+
} \
|
1640
|
+
__m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
|
1641
|
+
tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
|
1642
|
+
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
1643
|
+
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
|
1644
|
+
tmp = __lsx_vsrli_d((__m128i)t0, 32); \
|
1645
|
+
tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
|
1646
|
+
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
1647
|
+
res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
|
1648
|
+
}
|
1649
|
+
|
1650
|
+
#define GGML_F32_VEC GGML_F32x4
|
1651
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
1652
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
1653
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
1654
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
1655
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
1656
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
1657
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
1658
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
1659
|
+
|
1660
|
+
// F16 LSX
|
1661
|
+
|
1662
|
+
#define GGML_F16_STEP 32
|
1663
|
+
#define GGML_F16_EPR 4
|
1664
|
+
|
1665
|
+
static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
|
1666
|
+
float tmp[4];
|
1667
|
+
|
1668
|
+
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
1669
|
+
tmp[1] = GGML_FP16_TO_FP32(x[1]);
|
1670
|
+
tmp[2] = GGML_FP16_TO_FP32(x[2]);
|
1671
|
+
tmp[3] = GGML_FP16_TO_FP32(x[3]);
|
1672
|
+
|
1673
|
+
return __lsx_vld(tmp, 0);
|
1674
|
+
}
|
1675
|
+
|
1676
|
+
static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
1677
|
+
float arr[4];
|
1678
|
+
|
1679
|
+
__lsx_vst(y, arr, 0);
|
1680
|
+
|
1681
|
+
x[0] = GGML_FP32_TO_FP16(arr[0]);
|
1682
|
+
x[1] = GGML_FP32_TO_FP16(arr[1]);
|
1683
|
+
x[2] = GGML_FP32_TO_FP16(arr[2]);
|
1684
|
+
x[3] = GGML_FP32_TO_FP16(arr[3]);
|
1685
|
+
}
|
1686
|
+
|
1687
|
+
#define GGML_F32Cx4 __m128
|
1688
|
+
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
|
1689
|
+
#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
1690
|
+
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
|
1691
|
+
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
|
1692
|
+
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
1693
|
+
#define GGML_F32Cx4_ADD __lsx_vfadd_s
|
1694
|
+
#define GGML_F32Cx4_MUL __lsx_vfmul_s
|
1695
|
+
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
1696
|
+
|
1697
|
+
#define GGML_F16_VEC GGML_F32Cx4
|
1698
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
1699
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
1700
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
1701
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
|
1702
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
1703
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
1704
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
1705
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
1706
|
+
|
1528
1707
|
#endif
|
1529
1708
|
|
1530
1709
|
// GGML_F32_ARR / GGML_F16_ARR
|
@@ -1534,6 +1713,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
|
1534
1713
|
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
1535
1714
|
#endif
|
1536
1715
|
|
1716
|
+
//
|
1717
|
+
// ggml context
|
1718
|
+
//
|
1719
|
+
|
1720
|
+
struct ggml_context {
|
1721
|
+
size_t mem_size;
|
1722
|
+
void* mem_buffer;
|
1723
|
+
bool mem_buffer_owned;
|
1724
|
+
bool no_alloc;
|
1725
|
+
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
1726
|
+
|
1727
|
+
int n_objects;
|
1728
|
+
|
1729
|
+
struct ggml_object* objects_begin;
|
1730
|
+
struct ggml_object* objects_end;
|
1731
|
+
|
1732
|
+
struct ggml_scratch scratch;
|
1733
|
+
struct ggml_scratch scratch_save;
|
1734
|
+
};
|
1735
|
+
|
1736
|
+
struct ggml_context_container {
|
1737
|
+
bool used;
|
1738
|
+
|
1739
|
+
struct ggml_context context;
|
1740
|
+
};
|
1741
|
+
|
1742
|
+
struct ggml_compute_state_shared {
|
1743
|
+
const struct ggml_cgraph* cgraph;
|
1744
|
+
const struct ggml_cplan* cplan;
|
1745
|
+
|
1746
|
+
int64_t perf_node_start_cycles;
|
1747
|
+
int64_t perf_node_start_time_us;
|
1748
|
+
|
1749
|
+
const int n_threads;
|
1750
|
+
|
1751
|
+
// synchronization primitives
|
1752
|
+
atomic_int n_active; // num active threads
|
1753
|
+
atomic_int node_n; // active graph node
|
1754
|
+
atomic_int node_task; // active graph node task phase
|
1755
|
+
|
1756
|
+
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
1757
|
+
void* abort_callback_data;
|
1758
|
+
|
1759
|
+
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
1760
|
+
};
|
1761
|
+
|
1762
|
+
struct ggml_compute_state {
|
1763
|
+
ggml_thread_t thrd;
|
1764
|
+
int ith;
|
1765
|
+
struct ggml_compute_state_shared* shared;
|
1766
|
+
enum ggml_status ec;
|
1767
|
+
};
|
1768
|
+
|
1537
1769
|
//
|
1538
1770
|
// fundamental operations
|
1539
1771
|
//
|
@@ -1615,10 +1847,10 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
|
|
1615
1847
|
__m512 c1 = _mm512_setzero_ps();
|
1616
1848
|
__m512 c2 = _mm512_setzero_ps();
|
1617
1849
|
for (; i + 64 <= n; i += 64) {
|
1618
|
-
c1 = _mm512_dpbf16_ps(c1, (
|
1619
|
-
|
1620
|
-
c2 = _mm512_dpbf16_ps(c2, (
|
1621
|
-
|
1850
|
+
c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
|
1851
|
+
m512bh(_mm512_loadu_si512((y + i))));
|
1852
|
+
c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
|
1853
|
+
m512bh(_mm512_loadu_si512((y + i + 32))));
|
1622
1854
|
}
|
1623
1855
|
sumf += (ggml_float)_mm512_reduce_add_ps(c1);
|
1624
1856
|
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
|
@@ -1949,6 +2181,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
|
|
1949
2181
|
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
|
1950
2182
|
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
1951
2183
|
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
|
2184
|
+
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
|
1952
2185
|
// TODO: optimize performance
|
1953
2186
|
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
1954
2187
|
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
@@ -2024,52 +2257,291 @@ inline static float ggml_silu_f32(float x) {
|
|
2024
2257
|
return x/(1.0f + expf(-x));
|
2025
2258
|
}
|
2026
2259
|
|
2027
|
-
|
2028
|
-
|
2029
|
-
//
|
2030
|
-
//
|
2031
|
-
//
|
2032
|
-
//
|
2260
|
+
#if defined(__ARM_NEON) && defined(__aarch64__)
|
2261
|
+
|
2262
|
+
// adapted from arm limited optimized routine
|
2263
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2264
|
+
// numbers above 88.38 will flush to infinity
|
2265
|
+
// numbers beneath -103.97 will flush to zero
|
2266
|
+
inline static float32x4_t ggml_v_expf(float32x4_t x) {
|
2267
|
+
const float32x4_t r = vdupq_n_f32(0x1.8p23f);
|
2268
|
+
const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
|
2269
|
+
const float32x4_t n = vsubq_f32(z, r);
|
2270
|
+
const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
|
2271
|
+
vdupq_n_f32(0x1.7f7d1cp-20f));
|
2272
|
+
const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
|
2273
|
+
const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
|
2274
|
+
const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
|
2275
|
+
const float32x4_t u = vmulq_f32(b, b);
|
2276
|
+
const float32x4_t j = vfmaq_f32(
|
2277
|
+
vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
|
2278
|
+
vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
|
2279
|
+
vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
|
2280
|
+
if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
|
2281
|
+
return vfmaq_f32(k, j, k);
|
2282
|
+
const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
|
2283
|
+
const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
|
2284
|
+
const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
|
2285
|
+
return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
|
2286
|
+
vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
|
2287
|
+
}
|
2288
|
+
|
2289
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2290
|
+
inline static float32x4_t ggml_v_silu(float32x4_t x) {
|
2291
|
+
const float32x4_t one = vdupq_n_f32(1.0f);
|
2292
|
+
const float32x4_t zero = vdupq_n_f32(0.0f);
|
2293
|
+
const float32x4_t neg_x = vsubq_f32(zero, x);
|
2294
|
+
const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
|
2295
|
+
const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
|
2296
|
+
return vdivq_f32(x, one_plus_exp_neg_x);
|
2297
|
+
}
|
2298
|
+
|
2299
|
+
#elif defined(__AVX512F__) && defined(__AVX512DQ__)
|
2300
|
+
|
2301
|
+
// adapted from arm limited optimized routine
|
2302
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2303
|
+
// numbers above 88.38 will flush to infinity
|
2304
|
+
// numbers beneath -103.97 will flush to zero
|
2305
|
+
inline static __m512 ggml_v_expf(__m512 x) {
|
2306
|
+
const __m512 r = _mm512_set1_ps(0x1.8p23f);
|
2307
|
+
const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
|
2308
|
+
const __m512 n = _mm512_sub_ps(z, r);
|
2309
|
+
const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
|
2310
|
+
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
|
2311
|
+
const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
|
2312
|
+
const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
|
2313
|
+
const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
|
2314
|
+
const __m512 u = _mm512_mul_ps(b, b);
|
2315
|
+
const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
|
2316
|
+
_mm512_set1_ps(0x1.573e2ep-5f)), u,
|
2317
|
+
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
|
2318
|
+
_mm512_set1_ps(0x1.fffdb6p-2f))),
|
2319
|
+
u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
|
2320
|
+
if (_mm512_kortestz(c, c))
|
2321
|
+
return _mm512_fmadd_ps(j, k, k);
|
2322
|
+
const __m512i g = _mm512_and_si512(
|
2323
|
+
_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
|
2324
|
+
_mm512_set1_epi32(0x82000000u));
|
2325
|
+
const __m512 s1 =
|
2326
|
+
_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
|
2327
|
+
const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
|
2328
|
+
const __mmask16 d =
|
2329
|
+
_mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
|
2330
|
+
return _mm512_mask_blend_ps(
|
2331
|
+
d, _mm512_mask_blend_ps(
|
2332
|
+
c, _mm512_fmadd_ps(k, j, k),
|
2333
|
+
_mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
|
2334
|
+
_mm512_mul_ps(s1, s1));
|
2335
|
+
}
|
2336
|
+
|
2337
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2338
|
+
inline static __m512 ggml_v_silu(__m512 x) {
|
2339
|
+
const __m512 one = _mm512_set1_ps(1);
|
2340
|
+
const __m512 zero = _mm512_setzero_ps();
|
2341
|
+
const __m512 neg_x = _mm512_sub_ps(zero, x);
|
2342
|
+
const __m512 exp_neg_x = ggml_v_expf(neg_x);
|
2343
|
+
const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
|
2344
|
+
return _mm512_div_ps(x, one_plus_exp_neg_x);
|
2345
|
+
}
|
2346
|
+
|
2347
|
+
#elif defined(__AVX2__) && defined(__FMA__)
|
2348
|
+
|
2349
|
+
// adapted from arm limited optimized routine
|
2350
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2351
|
+
// numbers above 88.38 will flush to infinity
|
2352
|
+
// numbers beneath -103.97 will flush to zero
|
2353
|
+
inline static __m256 ggml_v_expf(__m256 x) {
|
2354
|
+
const __m256 r = _mm256_set1_ps(0x1.8p23f);
|
2355
|
+
const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
|
2356
|
+
const __m256 n = _mm256_sub_ps(z, r);
|
2357
|
+
const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
|
2358
|
+
_mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
|
2359
|
+
const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
|
2360
|
+
const __m256 k = _mm256_castsi256_ps(
|
2361
|
+
_mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
|
2362
|
+
const __m256i c = _mm256_castps_si256(
|
2363
|
+
_mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
|
2364
|
+
_mm256_set1_ps(126), _CMP_GT_OQ));
|
2365
|
+
const __m256 u = _mm256_mul_ps(b, b);
|
2366
|
+
const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
|
2367
|
+
_mm256_set1_ps(0x1.573e2ep-5f)), u,
|
2368
|
+
_mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
|
2369
|
+
_mm256_set1_ps(0x1.fffdb6p-2f))),
|
2370
|
+
u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
|
2371
|
+
if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
|
2372
|
+
return _mm256_fmadd_ps(j, k, k);
|
2373
|
+
const __m256i g = _mm256_and_si256(
|
2374
|
+
_mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
|
2375
|
+
_mm256_set1_epi32(0x82000000u));
|
2376
|
+
const __m256 s1 =
|
2377
|
+
_mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
|
2378
|
+
const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
|
2379
|
+
const __m256i d = _mm256_castps_si256(
|
2380
|
+
_mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
|
2381
|
+
_mm256_set1_ps(192), _CMP_GT_OQ));
|
2382
|
+
return _mm256_or_ps(
|
2383
|
+
_mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
|
2384
|
+
_mm256_andnot_ps(
|
2385
|
+
_mm256_castsi256_ps(d),
|
2386
|
+
_mm256_or_ps(
|
2387
|
+
_mm256_and_ps(_mm256_castsi256_ps(c),
|
2388
|
+
_mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
|
2389
|
+
_mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
|
2390
|
+
}
|
2391
|
+
|
2392
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2393
|
+
inline static __m256 ggml_v_silu(__m256 x) {
|
2394
|
+
const __m256 one = _mm256_set1_ps(1);
|
2395
|
+
const __m256 zero = _mm256_setzero_ps();
|
2396
|
+
const __m256 neg_x = _mm256_sub_ps(zero, x);
|
2397
|
+
const __m256 exp_neg_x = ggml_v_expf(neg_x);
|
2398
|
+
const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
|
2399
|
+
return _mm256_div_ps(x, one_plus_exp_neg_x);
|
2400
|
+
}
|
2401
|
+
|
2402
|
+
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
|
2033
2403
|
|
2034
|
-
#
|
2035
|
-
|
2036
|
-
|
2037
|
-
for (int i = 0; i < n; ++i) {
|
2038
|
-
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
2039
|
-
memcpy(&t, &fp16, sizeof(uint16_t));
|
2040
|
-
y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
|
2041
|
-
}
|
2042
|
-
}
|
2404
|
+
#if defined(__FMA__)
|
2405
|
+
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
|
2406
|
+
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
|
2043
2407
|
#else
|
2044
|
-
|
2045
|
-
|
2408
|
+
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
|
2409
|
+
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
|
2410
|
+
#endif
|
2411
|
+
|
2412
|
+
// adapted from arm limited optimized routine
|
2413
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2414
|
+
// numbers above 88.38 will flush to infinity
|
2415
|
+
// numbers beneath -103.97 will flush to zero
|
2416
|
+
inline static __m128 ggml_v_expf(__m128 x) {
|
2417
|
+
const __m128 r = _mm_set1_ps(0x1.8p23f);
|
2418
|
+
const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
|
2419
|
+
const __m128 n = _mm_sub_ps(z, r);
|
2420
|
+
const __m128 b =
|
2421
|
+
NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
|
2422
|
+
const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
|
2423
|
+
const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
|
2424
|
+
const __m128i c =
|
2425
|
+
_mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
|
2426
|
+
const __m128 u = _mm_mul_ps(b, b);
|
2427
|
+
const __m128 j =
|
2428
|
+
MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
|
2429
|
+
MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
|
2430
|
+
u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
|
2431
|
+
if (!_mm_movemask_epi8(c))
|
2432
|
+
return MADD128(j, k, k);
|
2433
|
+
const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
|
2434
|
+
_mm_set1_epi32(0x82000000u));
|
2435
|
+
const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
|
2436
|
+
const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
|
2437
|
+
const __m128i d =
|
2438
|
+
_mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
|
2439
|
+
return _mm_or_ps(
|
2440
|
+
_mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
|
2441
|
+
_mm_andnot_ps(_mm_castsi128_ps(d),
|
2442
|
+
_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
|
2443
|
+
_mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
|
2444
|
+
}
|
2445
|
+
|
2446
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2447
|
+
inline static __m128 ggml_v_silu(__m128 x) {
|
2448
|
+
const __m128 one = _mm_set1_ps(1);
|
2449
|
+
const __m128 zero = _mm_setzero_ps();
|
2450
|
+
const __m128 neg_x = _mm_sub_ps(zero, x);
|
2451
|
+
const __m128 exp_neg_x = ggml_v_expf(neg_x);
|
2452
|
+
const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
|
2453
|
+
return _mm_div_ps(x, one_plus_exp_neg_x);
|
2454
|
+
}
|
2455
|
+
|
2456
|
+
#endif // __ARM_NEON / __AVX2__ / __SSE2__
|
2457
|
+
|
2458
|
+
static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
|
2459
|
+
int i = 0;
|
2460
|
+
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
2461
|
+
for (; i + 15 < n; i += 16) {
|
2462
|
+
_mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
|
2463
|
+
}
|
2464
|
+
#elif defined(__AVX2__) && defined(__FMA__)
|
2465
|
+
for (; i + 7 < n; i += 8) {
|
2466
|
+
_mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
|
2467
|
+
}
|
2468
|
+
#elif defined(__SSE2__)
|
2469
|
+
for (; i + 3 < n; i += 4) {
|
2470
|
+
_mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
|
2471
|
+
}
|
2472
|
+
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
2473
|
+
for (; i + 3 < n; i += 4) {
|
2474
|
+
vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
|
2475
|
+
}
|
2476
|
+
#endif
|
2477
|
+
for (; i < n; ++i) {
|
2046
2478
|
y[i] = ggml_silu_f32(x[i]);
|
2047
2479
|
}
|
2048
2480
|
}
|
2481
|
+
|
2482
|
+
static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
|
2483
|
+
int i = 0;
|
2484
|
+
ggml_float sum = 0;
|
2485
|
+
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
2486
|
+
for (; i + 15 < n; i += 16) {
|
2487
|
+
__m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
|
2488
|
+
_mm512_set1_ps(max)));
|
2489
|
+
_mm512_storeu_ps(y + i, val);
|
2490
|
+
sum += (ggml_float)_mm512_reduce_add_ps(val);
|
2491
|
+
}
|
2492
|
+
#elif defined(__AVX2__) && defined(__FMA__)
|
2493
|
+
for (; i + 7 < n; i += 8) {
|
2494
|
+
__m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
|
2495
|
+
_mm256_set1_ps(max)));
|
2496
|
+
_mm256_storeu_ps(y + i, val);
|
2497
|
+
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
|
2498
|
+
_mm256_castps256_ps128(val));
|
2499
|
+
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
|
2500
|
+
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
|
2501
|
+
sum += (ggml_float)_mm_cvtss_f32(val2);
|
2502
|
+
}
|
2503
|
+
#elif defined(__SSE2__)
|
2504
|
+
for (; i + 3 < n; i += 4) {
|
2505
|
+
__m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
|
2506
|
+
_mm_set1_ps(max)));
|
2507
|
+
_mm_storeu_ps(y + i, val);
|
2508
|
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
2509
|
+
val = _mm_add_ps(val, _mm_movehl_ps(val, val));
|
2510
|
+
val = _mm_add_ss(val, _mm_movehdup_ps(val));
|
2511
|
+
#else
|
2512
|
+
__m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
|
2513
|
+
val = _mm_add_ps(val, tmp);
|
2514
|
+
tmp = _mm_movehl_ps(tmp, val);
|
2515
|
+
val = _mm_add_ss(val, tmp);
|
2516
|
+
#endif
|
2517
|
+
sum += (ggml_float)_mm_cvtss_f32(val);
|
2518
|
+
}
|
2519
|
+
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
2520
|
+
for (; i + 3 < n; i += 4) {
|
2521
|
+
float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
2522
|
+
vdupq_n_f32(max)));
|
2523
|
+
vst1q_f32(y + i, val);
|
2524
|
+
sum += (ggml_float)vaddvq_f32(val);
|
2525
|
+
}
|
2049
2526
|
#endif
|
2527
|
+
for (; i < n; ++i) {
|
2528
|
+
float val = expf(x[i] - max);
|
2529
|
+
sum += (ggml_float)val;
|
2530
|
+
y[i] = val;
|
2531
|
+
}
|
2532
|
+
return sum;
|
2533
|
+
}
|
2050
2534
|
|
2051
2535
|
inline static float ggml_silu_backward_f32(float x, float dy) {
|
2052
2536
|
const float s = 1.0f/(1.0f + expf(-x));
|
2053
2537
|
return dy*s*(1.0f + x*(1.0f - s));
|
2054
2538
|
}
|
2055
2539
|
|
2056
|
-
#ifdef GGML_SILU_FP16
|
2057
|
-
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
|
2058
|
-
for (int i = 0; i < n; ++i) {
|
2059
|
-
// we did not use x[i] to compute forward silu but its f16 equivalent
|
2060
|
-
// take derivative at f16 of x[i]:
|
2061
|
-
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
2062
|
-
float usedx = GGML_FP16_TO_FP32(fp16);
|
2063
|
-
dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
|
2064
|
-
}
|
2065
|
-
}
|
2066
|
-
#else
|
2067
2540
|
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
|
2068
2541
|
for (int i = 0; i < n; ++i) {
|
2069
2542
|
dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
|
2070
2543
|
}
|
2071
2544
|
}
|
2072
|
-
#endif
|
2073
2545
|
|
2074
2546
|
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
|
2075
2547
|
#ifndef GGML_USE_ACCELERATE
|
@@ -2185,7 +2657,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
2185
2657
|
"SOFT_MAX_BACK",
|
2186
2658
|
"ROPE",
|
2187
2659
|
"ROPE_BACK",
|
2188
|
-
"ALIBI",
|
2189
2660
|
"CLAMP",
|
2190
2661
|
"CONV_TRANSPOSE_1D",
|
2191
2662
|
"IM2COL",
|
@@ -2199,9 +2670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
2199
2670
|
"ARGSORT",
|
2200
2671
|
"LEAKY_RELU",
|
2201
2672
|
|
2202
|
-
"FLASH_ATTN",
|
2203
2673
|
"FLASH_ATTN_EXT",
|
2204
|
-
"FLASH_FF",
|
2205
2674
|
"FLASH_ATTN_BACK",
|
2206
2675
|
"SSM_CONV",
|
2207
2676
|
"SSM_SCAN",
|
@@ -2227,7 +2696,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
2227
2696
|
"CROSS_ENTROPY_LOSS_BACK",
|
2228
2697
|
};
|
2229
2698
|
|
2230
|
-
static_assert(GGML_OP_COUNT ==
|
2699
|
+
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
2231
2700
|
|
2232
2701
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
2233
2702
|
"none",
|
@@ -2276,7 +2745,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
2276
2745
|
"soft_max_back(x)",
|
2277
2746
|
"rope(x)",
|
2278
2747
|
"rope_back(x)",
|
2279
|
-
"alibi(x)",
|
2280
2748
|
"clamp(x)",
|
2281
2749
|
"conv_transpose_1d(x)",
|
2282
2750
|
"im2col(x)",
|
@@ -2290,9 +2758,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
2290
2758
|
"argsort(x)",
|
2291
2759
|
"leaky_relu(x)",
|
2292
2760
|
|
2293
|
-
"flash_attn(x)",
|
2294
2761
|
"flash_attn_ext(x)",
|
2295
|
-
"flash_ff(x)",
|
2296
2762
|
"flash_attn_back(x)",
|
2297
2763
|
"ssm_conv(x)",
|
2298
2764
|
"ssm_scan(x)",
|
@@ -2318,7 +2784,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
2318
2784
|
"cross_entropy_loss_back(x,y)",
|
2319
2785
|
};
|
2320
2786
|
|
2321
|
-
static_assert(GGML_OP_COUNT ==
|
2787
|
+
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
2322
2788
|
|
2323
2789
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
2324
2790
|
|
@@ -2331,6 +2797,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
|
2331
2797
|
"TANH",
|
2332
2798
|
"ELU",
|
2333
2799
|
"RELU",
|
2800
|
+
"SIGMOID",
|
2334
2801
|
"GELU",
|
2335
2802
|
"GELU_QUICK",
|
2336
2803
|
"SILU",
|
@@ -2338,7 +2805,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
|
2338
2805
|
"HARDSIGMOID",
|
2339
2806
|
};
|
2340
2807
|
|
2341
|
-
static_assert(GGML_UNARY_OP_COUNT ==
|
2808
|
+
static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
|
2342
2809
|
|
2343
2810
|
|
2344
2811
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
@@ -2380,32 +2847,6 @@ static void ggml_setup_op_has_task_pass(void) {
|
|
2380
2847
|
}
|
2381
2848
|
}
|
2382
2849
|
|
2383
|
-
//
|
2384
|
-
// ggml context
|
2385
|
-
//
|
2386
|
-
|
2387
|
-
struct ggml_context {
|
2388
|
-
size_t mem_size;
|
2389
|
-
void * mem_buffer;
|
2390
|
-
bool mem_buffer_owned;
|
2391
|
-
bool no_alloc;
|
2392
|
-
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
2393
|
-
|
2394
|
-
int n_objects;
|
2395
|
-
|
2396
|
-
struct ggml_object * objects_begin;
|
2397
|
-
struct ggml_object * objects_end;
|
2398
|
-
|
2399
|
-
struct ggml_scratch scratch;
|
2400
|
-
struct ggml_scratch scratch_save;
|
2401
|
-
};
|
2402
|
-
|
2403
|
-
struct ggml_context_container {
|
2404
|
-
bool used;
|
2405
|
-
|
2406
|
-
struct ggml_context context;
|
2407
|
-
};
|
2408
|
-
|
2409
2850
|
//
|
2410
2851
|
// NUMA support
|
2411
2852
|
//
|
@@ -2819,8 +3260,18 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
2819
3260
|
(t0->ne[3] == t1->ne[3] );
|
2820
3261
|
}
|
2821
3262
|
|
2822
|
-
|
2823
|
-
|
3263
|
+
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
3264
|
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3265
|
+
|
3266
|
+
return
|
3267
|
+
(t0->nb[0] == t1->nb[0] ) &&
|
3268
|
+
(t0->nb[1] == t1->nb[1] ) &&
|
3269
|
+
(t0->nb[2] == t1->nb[2] ) &&
|
3270
|
+
(t0->nb[3] == t1->nb[3] );
|
3271
|
+
}
|
3272
|
+
|
3273
|
+
// check if t1 can be represented as a repeatition of t0
|
3274
|
+
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
2824
3275
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
2825
3276
|
|
2826
3277
|
return ggml_is_empty(t0) ? ggml_is_empty(t1) :
|
@@ -2878,8 +3329,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
2878
3329
|
float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
|
2879
3330
|
ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
2880
3331
|
ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
|
2881
|
-
ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
|
2882
|
-
ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
|
2883
3332
|
}
|
2884
3333
|
|
2885
3334
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
@@ -3163,6 +3612,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
3163
3612
|
|
3164
3613
|
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
|
3165
3614
|
|
3615
|
+
#ifdef __clang__
|
3616
|
+
// temporary until ggml_tensor::backend is removed
|
3617
|
+
#pragma clang diagnostic push
|
3618
|
+
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
3619
|
+
#endif
|
3620
|
+
|
3166
3621
|
*result = (struct ggml_tensor) {
|
3167
3622
|
/*.type =*/ type,
|
3168
3623
|
/*.backend =*/ GGML_BACKEND_TYPE_CPU,
|
@@ -3185,6 +3640,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
3185
3640
|
/*.padding =*/ { 0 },
|
3186
3641
|
};
|
3187
3642
|
|
3643
|
+
#ifdef __clang__
|
3644
|
+
#pragma clang diagnostic pop
|
3645
|
+
#endif
|
3646
|
+
|
3188
3647
|
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
|
3189
3648
|
//ggml_assert_aligned(result->data);
|
3190
3649
|
|
@@ -4563,6 +5022,20 @@ struct ggml_tensor * ggml_leaky_relu(
|
|
4563
5022
|
return result;
|
4564
5023
|
}
|
4565
5024
|
|
5025
|
+
// ggml_sigmoid
|
5026
|
+
|
5027
|
+
struct ggml_tensor * ggml_sigmoid(
|
5028
|
+
struct ggml_context * ctx,
|
5029
|
+
struct ggml_tensor * a) {
|
5030
|
+
return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
|
5031
|
+
}
|
5032
|
+
|
5033
|
+
struct ggml_tensor * ggml_sigmoid_inplace(
|
5034
|
+
struct ggml_context * ctx,
|
5035
|
+
struct ggml_tensor * a) {
|
5036
|
+
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
|
5037
|
+
}
|
5038
|
+
|
4566
5039
|
// ggml_gelu
|
4567
5040
|
|
4568
5041
|
struct ggml_tensor * ggml_gelu(
|
@@ -5646,7 +6119,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5646
6119
|
struct ggml_context * ctx,
|
5647
6120
|
struct ggml_tensor * a,
|
5648
6121
|
struct ggml_tensor * mask,
|
5649
|
-
struct ggml_tensor * pos,
|
5650
6122
|
float scale,
|
5651
6123
|
float max_bias,
|
5652
6124
|
bool inplace) {
|
@@ -5660,18 +6132,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5660
6132
|
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
|
5661
6133
|
}
|
5662
6134
|
|
5663
|
-
if (pos) {
|
5664
|
-
GGML_ASSERT(ggml_is_vector(pos));
|
5665
|
-
GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
|
5666
|
-
GGML_ASSERT(pos->ne[0] == a->ne[0]);
|
5667
|
-
}
|
5668
|
-
|
5669
|
-
if (pos && mask) {
|
5670
|
-
GGML_ASSERT(pos->type == mask->type);
|
5671
|
-
}
|
5672
|
-
|
5673
6135
|
if (max_bias > 0.0f) {
|
5674
|
-
GGML_ASSERT(
|
6136
|
+
GGML_ASSERT(mask);
|
5675
6137
|
}
|
5676
6138
|
|
5677
6139
|
bool is_node = false;
|
@@ -5689,7 +6151,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5689
6151
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5690
6152
|
result->src[0] = a;
|
5691
6153
|
result->src[1] = mask;
|
5692
|
-
result->src[2] = pos;
|
5693
6154
|
|
5694
6155
|
return result;
|
5695
6156
|
}
|
@@ -5697,23 +6158,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5697
6158
|
struct ggml_tensor * ggml_soft_max(
|
5698
6159
|
struct ggml_context * ctx,
|
5699
6160
|
struct ggml_tensor * a) {
|
5700
|
-
return ggml_soft_max_impl(ctx, a, NULL,
|
6161
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
|
5701
6162
|
}
|
5702
6163
|
|
5703
6164
|
struct ggml_tensor * ggml_soft_max_inplace(
|
5704
6165
|
struct ggml_context * ctx,
|
5705
6166
|
struct ggml_tensor * a) {
|
5706
|
-
return ggml_soft_max_impl(ctx, a, NULL,
|
6167
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
|
5707
6168
|
}
|
5708
6169
|
|
5709
6170
|
struct ggml_tensor * ggml_soft_max_ext(
|
5710
6171
|
struct ggml_context * ctx,
|
5711
6172
|
struct ggml_tensor * a,
|
5712
6173
|
struct ggml_tensor * mask,
|
5713
|
-
struct ggml_tensor * pos,
|
5714
6174
|
float scale,
|
5715
6175
|
float max_bias) {
|
5716
|
-
return ggml_soft_max_impl(ctx, a, mask,
|
6176
|
+
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
|
5717
6177
|
}
|
5718
6178
|
|
5719
6179
|
// ggml_soft_max_back
|
@@ -5759,6 +6219,7 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
5759
6219
|
struct ggml_context * ctx,
|
5760
6220
|
struct ggml_tensor * a,
|
5761
6221
|
struct ggml_tensor * b,
|
6222
|
+
struct ggml_tensor * c,
|
5762
6223
|
int n_dims,
|
5763
6224
|
int mode,
|
5764
6225
|
int n_ctx,
|
@@ -5772,10 +6233,17 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
5772
6233
|
float xpos_base,
|
5773
6234
|
bool xpos_down,
|
5774
6235
|
bool inplace) {
|
6236
|
+
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
|
6237
|
+
|
5775
6238
|
GGML_ASSERT(ggml_is_vector(b));
|
5776
6239
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
5777
6240
|
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
5778
6241
|
|
6242
|
+
if (c) {
|
6243
|
+
GGML_ASSERT(c->type == GGML_TYPE_F32);
|
6244
|
+
GGML_ASSERT(c->ne[0] >= n_dims / 2);
|
6245
|
+
}
|
6246
|
+
|
5779
6247
|
bool is_node = false;
|
5780
6248
|
|
5781
6249
|
if (a->grad) {
|
@@ -5799,6 +6267,7 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
5799
6267
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5800
6268
|
result->src[0] = a;
|
5801
6269
|
result->src[1] = b;
|
6270
|
+
result->src[2] = c;
|
5802
6271
|
|
5803
6272
|
return result;
|
5804
6273
|
}
|
@@ -5811,7 +6280,7 @@ struct ggml_tensor * ggml_rope(
|
|
5811
6280
|
int mode,
|
5812
6281
|
int n_ctx) {
|
5813
6282
|
return ggml_rope_impl(
|
5814
|
-
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
|
6283
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
|
5815
6284
|
);
|
5816
6285
|
}
|
5817
6286
|
|
@@ -5823,14 +6292,15 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
5823
6292
|
int mode,
|
5824
6293
|
int n_ctx) {
|
5825
6294
|
return ggml_rope_impl(
|
5826
|
-
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
|
6295
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
|
5827
6296
|
);
|
5828
6297
|
}
|
5829
6298
|
|
5830
|
-
struct ggml_tensor *
|
6299
|
+
struct ggml_tensor * ggml_rope_ext(
|
5831
6300
|
struct ggml_context * ctx,
|
5832
6301
|
struct ggml_tensor * a,
|
5833
6302
|
struct ggml_tensor * b,
|
6303
|
+
struct ggml_tensor * c,
|
5834
6304
|
int n_dims,
|
5835
6305
|
int mode,
|
5836
6306
|
int n_ctx,
|
@@ -5842,15 +6312,16 @@ struct ggml_tensor * ggml_rope_custom(
|
|
5842
6312
|
float beta_fast,
|
5843
6313
|
float beta_slow) {
|
5844
6314
|
return ggml_rope_impl(
|
5845
|
-
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
6315
|
+
ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
5846
6316
|
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
|
5847
6317
|
);
|
5848
6318
|
}
|
5849
6319
|
|
5850
|
-
struct ggml_tensor *
|
6320
|
+
struct ggml_tensor * ggml_rope_ext_inplace(
|
5851
6321
|
struct ggml_context * ctx,
|
5852
6322
|
struct ggml_tensor * a,
|
5853
6323
|
struct ggml_tensor * b,
|
6324
|
+
struct ggml_tensor * c,
|
5854
6325
|
int n_dims,
|
5855
6326
|
int mode,
|
5856
6327
|
int n_ctx,
|
@@ -5862,19 +6333,49 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
5862
6333
|
float beta_fast,
|
5863
6334
|
float beta_slow) {
|
5864
6335
|
return ggml_rope_impl(
|
5865
|
-
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
6336
|
+
ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
5866
6337
|
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
|
5867
6338
|
);
|
5868
6339
|
}
|
5869
6340
|
|
5870
|
-
struct ggml_tensor *
|
6341
|
+
struct ggml_tensor * ggml_rope_custom(
|
6342
|
+
struct ggml_context * ctx,
|
6343
|
+
struct ggml_tensor * a,
|
6344
|
+
struct ggml_tensor * b,
|
6345
|
+
int n_dims,
|
6346
|
+
int mode,
|
6347
|
+
int n_ctx,
|
6348
|
+
int n_orig_ctx,
|
6349
|
+
float freq_base,
|
6350
|
+
float freq_scale,
|
6351
|
+
float ext_factor,
|
6352
|
+
float attn_factor,
|
6353
|
+
float beta_fast,
|
6354
|
+
float beta_slow) {
|
6355
|
+
return ggml_rope_impl(
|
6356
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
6357
|
+
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
|
6358
|
+
);
|
6359
|
+
}
|
6360
|
+
|
6361
|
+
struct ggml_tensor * ggml_rope_custom_inplace(
|
5871
6362
|
struct ggml_context * ctx,
|
5872
6363
|
struct ggml_tensor * a,
|
5873
6364
|
struct ggml_tensor * b,
|
5874
6365
|
int n_dims,
|
5875
|
-
|
5876
|
-
|
5877
|
-
|
6366
|
+
int mode,
|
6367
|
+
int n_ctx,
|
6368
|
+
int n_orig_ctx,
|
6369
|
+
float freq_base,
|
6370
|
+
float freq_scale,
|
6371
|
+
float ext_factor,
|
6372
|
+
float attn_factor,
|
6373
|
+
float beta_fast,
|
6374
|
+
float beta_slow) {
|
6375
|
+
return ggml_rope_impl(
|
6376
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
6377
|
+
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
|
6378
|
+
);
|
5878
6379
|
}
|
5879
6380
|
|
5880
6381
|
// ggml_rope_back
|
@@ -5883,6 +6384,7 @@ struct ggml_tensor * ggml_rope_back(
|
|
5883
6384
|
struct ggml_context * ctx,
|
5884
6385
|
struct ggml_tensor * a,
|
5885
6386
|
struct ggml_tensor * b,
|
6387
|
+
struct ggml_tensor * c,
|
5886
6388
|
int n_dims,
|
5887
6389
|
int mode,
|
5888
6390
|
int n_ctx,
|
@@ -5898,6 +6400,7 @@ struct ggml_tensor * ggml_rope_back(
|
|
5898
6400
|
GGML_ASSERT(ggml_is_vector(b));
|
5899
6401
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
5900
6402
|
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
6403
|
+
GGML_ASSERT(c == NULL && "freq factors not implemented yet");
|
5901
6404
|
|
5902
6405
|
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
|
5903
6406
|
|
@@ -5928,37 +6431,6 @@ struct ggml_tensor * ggml_rope_back(
|
|
5928
6431
|
return result;
|
5929
6432
|
}
|
5930
6433
|
|
5931
|
-
// ggml_alibi
|
5932
|
-
|
5933
|
-
struct ggml_tensor * ggml_alibi(
|
5934
|
-
struct ggml_context * ctx,
|
5935
|
-
struct ggml_tensor * a,
|
5936
|
-
int n_past,
|
5937
|
-
int n_head,
|
5938
|
-
float bias_max) {
|
5939
|
-
GGML_ASSERT(n_past >= 0);
|
5940
|
-
bool is_node = false;
|
5941
|
-
|
5942
|
-
if (a->grad) {
|
5943
|
-
GGML_ASSERT(false); // TODO: implement backward
|
5944
|
-
is_node = true;
|
5945
|
-
}
|
5946
|
-
|
5947
|
-
// TODO: when implement backward, fix this:
|
5948
|
-
//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5949
|
-
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
5950
|
-
|
5951
|
-
int32_t op_params[3] = { n_past, n_head };
|
5952
|
-
memcpy(op_params + 2, &bias_max, sizeof(float));
|
5953
|
-
ggml_set_op_params(result, op_params, sizeof(op_params));
|
5954
|
-
|
5955
|
-
result->op = GGML_OP_ALIBI;
|
5956
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5957
|
-
result->src[0] = a;
|
5958
|
-
|
5959
|
-
return result;
|
5960
|
-
}
|
5961
|
-
|
5962
6434
|
// ggml_clamp
|
5963
6435
|
|
5964
6436
|
struct ggml_tensor * ggml_clamp(
|
@@ -6308,7 +6780,10 @@ struct ggml_tensor * ggml_pool_2d(
|
|
6308
6780
|
static struct ggml_tensor * ggml_upscale_impl(
|
6309
6781
|
struct ggml_context * ctx,
|
6310
6782
|
struct ggml_tensor * a,
|
6311
|
-
int
|
6783
|
+
int ne0,
|
6784
|
+
int ne1,
|
6785
|
+
int ne2,
|
6786
|
+
int ne3) {
|
6312
6787
|
bool is_node = false;
|
6313
6788
|
|
6314
6789
|
if (a->grad) {
|
@@ -6316,19 +6791,45 @@ static struct ggml_tensor * ggml_upscale_impl(
|
|
6316
6791
|
is_node = true;
|
6317
6792
|
}
|
6318
6793
|
|
6794
|
+
GGML_ASSERT(a->ne[0] <= ne0);
|
6795
|
+
GGML_ASSERT(a->ne[1] <= ne1);
|
6796
|
+
GGML_ASSERT(a->ne[2] <= ne2);
|
6797
|
+
GGML_ASSERT(a->ne[3] <= ne3);
|
6798
|
+
|
6319
6799
|
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
6320
|
-
|
6321
|
-
|
6322
|
-
|
6800
|
+
ne0,
|
6801
|
+
ne1,
|
6802
|
+
ne2,
|
6803
|
+
ne3
|
6804
|
+
);
|
6323
6805
|
|
6324
6806
|
result->op = GGML_OP_UPSCALE;
|
6325
|
-
|
6807
|
+
|
6326
6808
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6327
6809
|
result->src[0] = a;
|
6328
6810
|
|
6329
6811
|
return result;
|
6330
6812
|
}
|
6331
6813
|
|
6814
|
+
struct ggml_tensor * ggml_upscale(
|
6815
|
+
struct ggml_context * ctx,
|
6816
|
+
struct ggml_tensor * a,
|
6817
|
+
int scale_factor) {
|
6818
|
+
return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
|
6819
|
+
}
|
6820
|
+
|
6821
|
+
struct ggml_tensor * ggml_upscale_ext(
|
6822
|
+
struct ggml_context * ctx,
|
6823
|
+
struct ggml_tensor * a,
|
6824
|
+
int ne0,
|
6825
|
+
int ne1,
|
6826
|
+
int ne2,
|
6827
|
+
int ne3) {
|
6828
|
+
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
|
6829
|
+
}
|
6830
|
+
|
6831
|
+
// ggml_pad
|
6832
|
+
|
6332
6833
|
struct ggml_tensor * ggml_pad(
|
6333
6834
|
struct ggml_context * ctx,
|
6334
6835
|
struct ggml_tensor * a,
|
@@ -6353,12 +6854,7 @@ struct ggml_tensor * ggml_pad(
|
|
6353
6854
|
return result;
|
6354
6855
|
}
|
6355
6856
|
|
6356
|
-
|
6357
|
-
struct ggml_context * ctx,
|
6358
|
-
struct ggml_tensor * a,
|
6359
|
-
int scale_factor) {
|
6360
|
-
return ggml_upscale_impl(ctx, a, scale_factor);
|
6361
|
-
}
|
6857
|
+
// ggml_arange
|
6362
6858
|
|
6363
6859
|
struct ggml_tensor * ggml_arange(
|
6364
6860
|
struct ggml_context * ctx,
|
@@ -6380,6 +6876,8 @@ struct ggml_tensor * ggml_arange(
|
|
6380
6876
|
return result;
|
6381
6877
|
}
|
6382
6878
|
|
6879
|
+
// ggml_timestep_embedding
|
6880
|
+
|
6383
6881
|
struct ggml_tensor * ggml_timestep_embedding(
|
6384
6882
|
struct ggml_context * ctx,
|
6385
6883
|
struct ggml_tensor * timesteps,
|
@@ -6446,38 +6944,6 @@ struct ggml_tensor * ggml_top_k(
|
|
6446
6944
|
return result;
|
6447
6945
|
}
|
6448
6946
|
|
6449
|
-
// ggml_flash_attn
|
6450
|
-
|
6451
|
-
struct ggml_tensor * ggml_flash_attn(
|
6452
|
-
struct ggml_context * ctx,
|
6453
|
-
struct ggml_tensor * q,
|
6454
|
-
struct ggml_tensor * k,
|
6455
|
-
struct ggml_tensor * v,
|
6456
|
-
bool masked) {
|
6457
|
-
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
6458
|
-
// TODO: check if vT can be multiplied by (k*qT)
|
6459
|
-
|
6460
|
-
bool is_node = false;
|
6461
|
-
|
6462
|
-
if (q->grad || k->grad || v->grad) {
|
6463
|
-
is_node = true;
|
6464
|
-
}
|
6465
|
-
|
6466
|
-
//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
|
6467
|
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
|
6468
|
-
|
6469
|
-
int32_t t = masked ? 1 : 0;
|
6470
|
-
ggml_set_op_params(result, &t, sizeof(t));
|
6471
|
-
|
6472
|
-
result->op = GGML_OP_FLASH_ATTN;
|
6473
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6474
|
-
result->src[0] = q;
|
6475
|
-
result->src[1] = k;
|
6476
|
-
result->src[2] = v;
|
6477
|
-
|
6478
|
-
return result;
|
6479
|
-
}
|
6480
|
-
|
6481
6947
|
// ggml_flash_attn_ext
|
6482
6948
|
|
6483
6949
|
struct ggml_tensor * ggml_flash_attn_ext(
|
@@ -6486,9 +6952,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|
6486
6952
|
struct ggml_tensor * k,
|
6487
6953
|
struct ggml_tensor * v,
|
6488
6954
|
struct ggml_tensor * mask,
|
6489
|
-
float scale
|
6955
|
+
float scale,
|
6956
|
+
float max_bias) {
|
6490
6957
|
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
6491
6958
|
// TODO: check if vT can be multiplied by (k*qT)
|
6959
|
+
|
6492
6960
|
if (mask) {
|
6493
6961
|
GGML_ASSERT(ggml_is_contiguous(mask));
|
6494
6962
|
GGML_ASSERT(mask->ne[2] == 1);
|
@@ -6498,6 +6966,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|
6498
6966
|
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
|
6499
6967
|
}
|
6500
6968
|
|
6969
|
+
if (max_bias > 0.0f) {
|
6970
|
+
GGML_ASSERT(mask);
|
6971
|
+
}
|
6972
|
+
|
6501
6973
|
bool is_node = false;
|
6502
6974
|
|
6503
6975
|
if (q->grad || k->grad || v->grad) {
|
@@ -6508,7 +6980,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|
6508
6980
|
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
|
6509
6981
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
6510
6982
|
|
6511
|
-
float params[] = { scale };
|
6983
|
+
float params[] = { scale, max_bias };
|
6512
6984
|
ggml_set_op_params(result, params, sizeof(params));
|
6513
6985
|
|
6514
6986
|
result->op = GGML_OP_FLASH_ATTN_EXT;
|
@@ -6528,39 +7000,7 @@ void ggml_flash_attn_ext_set_prec(
|
|
6528
7000
|
|
6529
7001
|
const int32_t prec_i32 = (int32_t) prec;
|
6530
7002
|
|
6531
|
-
ggml_set_op_params_i32(a,
|
6532
|
-
}
|
6533
|
-
|
6534
|
-
// ggml_flash_ff
|
6535
|
-
|
6536
|
-
struct ggml_tensor * ggml_flash_ff(
|
6537
|
-
struct ggml_context * ctx,
|
6538
|
-
struct ggml_tensor * a,
|
6539
|
-
struct ggml_tensor * b0,
|
6540
|
-
struct ggml_tensor * b1,
|
6541
|
-
struct ggml_tensor * c0,
|
6542
|
-
struct ggml_tensor * c1) {
|
6543
|
-
GGML_ASSERT(ggml_can_mul_mat(b0, a));
|
6544
|
-
// TODO: more checks
|
6545
|
-
|
6546
|
-
bool is_node = false;
|
6547
|
-
|
6548
|
-
if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
|
6549
|
-
is_node = true;
|
6550
|
-
}
|
6551
|
-
|
6552
|
-
//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
6553
|
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
|
6554
|
-
|
6555
|
-
result->op = GGML_OP_FLASH_FF;
|
6556
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6557
|
-
result->src[0] = a;
|
6558
|
-
result->src[1] = b0;
|
6559
|
-
result->src[2] = b1;
|
6560
|
-
result->src[3] = c0;
|
6561
|
-
result->src[4] = c1;
|
6562
|
-
|
6563
|
-
return result;
|
7003
|
+
ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
|
6564
7004
|
}
|
6565
7005
|
|
6566
7006
|
// ggml_flash_attn_back
|
@@ -6572,6 +7012,8 @@ struct ggml_tensor * ggml_flash_attn_back(
|
|
6572
7012
|
struct ggml_tensor * v,
|
6573
7013
|
struct ggml_tensor * d,
|
6574
7014
|
bool masked) {
|
7015
|
+
GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
|
7016
|
+
|
6575
7017
|
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
6576
7018
|
// TODO: check if vT can be multiplied by (k*qT)
|
6577
7019
|
|
@@ -10892,6 +11334,52 @@ static void ggml_compute_forward_relu(
|
|
10892
11334
|
}
|
10893
11335
|
}
|
10894
11336
|
|
11337
|
+
// ggml_compute_forward_sigmoid
|
11338
|
+
|
11339
|
+
static void ggml_compute_forward_sigmoid_f32(
|
11340
|
+
const struct ggml_compute_params * params,
|
11341
|
+
struct ggml_tensor * dst) {
|
11342
|
+
|
11343
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
11344
|
+
|
11345
|
+
assert(params->ith == 0);
|
11346
|
+
assert(ggml_are_same_shape(src0, dst));
|
11347
|
+
|
11348
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11349
|
+
return;
|
11350
|
+
}
|
11351
|
+
|
11352
|
+
const int n = ggml_nrows(src0);
|
11353
|
+
const int nc = src0->ne[0];
|
11354
|
+
|
11355
|
+
assert(dst->nb[0] == sizeof(float));
|
11356
|
+
assert(src0->nb[0] == sizeof(float));
|
11357
|
+
|
11358
|
+
for (int i = 0; i < n; i++) {
|
11359
|
+
ggml_vec_sigmoid_f32(nc,
|
11360
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
11361
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
11362
|
+
}
|
11363
|
+
}
|
11364
|
+
|
11365
|
+
static void ggml_compute_forward_sigmoid(
|
11366
|
+
const struct ggml_compute_params * params,
|
11367
|
+
struct ggml_tensor * dst) {
|
11368
|
+
|
11369
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
11370
|
+
|
11371
|
+
switch (src0->type) {
|
11372
|
+
case GGML_TYPE_F32:
|
11373
|
+
{
|
11374
|
+
ggml_compute_forward_sigmoid_f32(params, dst);
|
11375
|
+
} break;
|
11376
|
+
default:
|
11377
|
+
{
|
11378
|
+
GGML_ASSERT(false);
|
11379
|
+
} break;
|
11380
|
+
}
|
11381
|
+
}
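ggml_vec_sigmoid_f32 itself is not shown in this excerpt; presumably it applies the logistic function element-wise, one row at a time, which is all the new op needs. A self-contained sketch under that assumption (the helper below is illustrative, not the library's definition):

#include <math.h>
#include <stdio.h>

// assumed behaviour of ggml_vec_sigmoid_f32: y[i] = 1 / (1 + exp(-x[i]))
static void vec_sigmoid_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = 1.0f / (1.0f + expf(-x[i]));
    }
}

int main(void) {
    const float x[4] = { -2.0f, -0.5f, 0.0f, 3.0f };
    float y[4];
    vec_sigmoid_f32(4, y, x);
    for (int i = 0; i < 4; ++i) {
        printf("sigmoid(%+.1f) = %.4f\n", x[i], y[i]);
    }
    return 0; // expected: ~0.1192, ~0.3775, 0.5000, ~0.9526
}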
|
11382
|
+
|
10895
11383
|
// ggml_compute_forward_gelu
|
10896
11384
|
|
10897
11385
|
static void ggml_compute_forward_gelu_f32(
|
@@ -11742,80 +12230,171 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
|
11742
12230
|
}
|
11743
12231
|
#endif
|
11744
12232
|
|
11745
|
-
static void
|
11746
|
-
|
11747
|
-
|
12233
|
+
static void ggml_compute_forward_mul_mat_one_chunk(
|
12234
|
+
const struct ggml_compute_params * params,
|
12235
|
+
struct ggml_tensor * dst,
|
12236
|
+
const int64_t num_rows_per_vec_dot,
|
12237
|
+
const int64_t ir0_start,
|
12238
|
+
const int64_t ir0_end,
|
12239
|
+
const int64_t ir1_start,
|
12240
|
+
const int64_t ir1_end) {
|
11748
12241
|
|
11749
12242
|
const struct ggml_tensor * src0 = dst->src[0];
|
11750
12243
|
const struct ggml_tensor * src1 = dst->src[1];
|
11751
12244
|
|
11752
|
-
int64_t t0 = ggml_perf_time_us();
|
11753
|
-
UNUSED(t0);
|
11754
|
-
|
11755
12245
|
GGML_TENSOR_BINARY_OP_LOCALS
|
11756
12246
|
|
11757
|
-
const int ith = params->ith;
|
11758
|
-
const int nth = params->nth;
|
11759
|
-
|
11760
12247
|
const enum ggml_type type = src0->type;
|
11761
12248
|
|
11762
12249
|
const bool src1_cont = ggml_is_contiguous(src1);
|
11763
12250
|
|
11764
|
-
ggml_vec_dot_t const vec_dot
|
11765
|
-
enum ggml_type const vec_dot_type
|
11766
|
-
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
11767
|
-
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
11768
|
-
|
11769
|
-
GGML_ASSERT(ne0 == ne01);
|
11770
|
-
GGML_ASSERT(ne1 == ne11);
|
11771
|
-
GGML_ASSERT(ne2 == ne12);
|
11772
|
-
GGML_ASSERT(ne3 == ne13);
|
11773
|
-
|
11774
|
-
// we don't support permuted src0 or src1
|
11775
|
-
GGML_ASSERT(nb00 == ggml_type_size(type));
|
11776
|
-
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
11777
|
-
|
11778
|
-
// dst cannot be transposed or permuted
|
11779
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
11780
|
-
GGML_ASSERT(nb0 <= nb1);
|
11781
|
-
GGML_ASSERT(nb1 <= nb2);
|
11782
|
-
GGML_ASSERT(nb2 <= nb3);
|
12251
|
+
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
12252
|
+
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
11783
12253
|
|
11784
12254
|
// broadcast factors
|
11785
|
-
const int64_t r2 = ne12/ne02;
|
11786
|
-
const int64_t r3 = ne13/ne03;
|
12255
|
+
const int64_t r2 = ne12 / ne02;
|
12256
|
+
const int64_t r3 = ne13 / ne03;
|
11787
12257
|
|
11788
|
-
//
|
11789
|
-
// compute by src0 rows
|
12258
|
+
//printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
|
11790
12259
|
|
11791
|
-
|
11792
|
-
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
11793
|
-
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
11794
|
-
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
11795
|
-
}
|
12260
|
+
// threads with no work simply yield (not sure if it helps)
|
12261
|
+
if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
|
11796
12262
|
return;
|
11797
12263
|
}
|
11798
|
-
#endif
|
11799
12264
|
|
11800
|
-
|
11801
|
-
|
11802
|
-
const int64_t ne_plane = ne01*ne00;
|
11803
|
-
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
11804
|
-
UNUSED(desired_wsize);
|
12265
|
+
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
12266
|
+
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
11805
12267
|
|
11806
|
-
|
11807
|
-
|
11808
|
-
assert(params->wsize >= desired_wsize);
|
11809
|
-
// parallelize by src0 rows
|
11810
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
11811
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
11812
|
-
// broadcast src0 into src1 across 2nd,3rd dimension
|
11813
|
-
const int64_t i03 = i13/r3;
|
11814
|
-
const int64_t i02 = i12/r2;
|
12268
|
+
assert(ne12 % ne02 == 0);
|
12269
|
+
assert(ne13 % ne03 == 0);
|
11815
12270
|
|
11816
|
-
|
11817
|
-
|
11818
|
-
|
12271
|
+
// block-tiling attempt
|
12272
|
+
const int64_t blck_0 = 16;
|
12273
|
+
const int64_t blck_1 = 16;
|
12274
|
+
|
12275
|
+
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
12276
|
+
|
12277
|
+
// attempt to reduce false-sharing (does not seem to make a difference)
|
12278
|
+
// 16 * 2, accounting for mmla kernels
|
12279
|
+
float tmp[32];
|
12280
|
+
|
12281
|
+
for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
|
12282
|
+
for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
|
12283
|
+
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
|
12284
|
+
const int64_t i13 = (ir1 / (ne12 * ne1));
|
12285
|
+
const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
|
12286
|
+
const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
|
12287
|
+
|
12288
|
+
// broadcast src0 into src1
|
12289
|
+
const int64_t i03 = i13 / r3;
|
12290
|
+
const int64_t i02 = i12 / r2;
|
12291
|
+
|
12292
|
+
const int64_t i1 = i11;
|
12293
|
+
const int64_t i2 = i12;
|
12294
|
+
const int64_t i3 = i13;
|
12295
|
+
|
12296
|
+
const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
|
12297
|
+
|
12298
|
+
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
12299
|
+
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
12300
|
+
// the original src1 data pointer, so we should index using the indices directly
|
12301
|
+
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
12302
|
+
const char * src1_col = (const char*)wdata +
|
12303
|
+
(src1_cont || src1->type != vec_dot_type
|
12304
|
+
? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
|
12305
|
+
: (i11 * nb11 + i12 * nb12 + i13 * nb13));
|
12306
|
+
float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
|
12307
|
+
|
12308
|
+
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
|
12309
|
+
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
12310
|
+
//}
|
12311
|
+
|
12312
|
+
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
|
12313
|
+
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
|
12314
|
+
}
|
12315
|
+
|
12316
|
+
for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
|
12317
|
+
memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
|
12318
|
+
}
|
12319
|
+
}
|
12320
|
+
}
|
12321
|
+
}
|
12322
|
+
}
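The chunk kernel above addresses destination rows through a single flattened index ir1 and recovers the per-dimension indices with a division/remainder chain, while the broadcast ratios r2/r3 let several src1 batches share one src0 plane. A small standalone sketch of just that index arithmetic, with made-up shape values:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical shapes: ne1 rows per matrix, ne12/ne13 batch dims of src1/dst
    const int64_t ne1 = 4, ne12 = 2, ne13 = 2;
    const int64_t ne02 = 1, ne03 = 1;              // src0 batch dims (broadcast source)
    const int64_t r2 = ne12 / ne02, r3 = ne13 / ne03;

    for (int64_t ir1 = 0; ir1 < ne1 * ne12 * ne13; ++ir1) {
        // same arithmetic as in ggml_compute_forward_mul_mat_one_chunk
        const int64_t i13 = ir1 / (ne12 * ne1);
        const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
        const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);

        // broadcast src0 into src1: several src1 batches can reuse one src0 plane
        const int64_t i03 = i13 / r3;
        const int64_t i02 = i12 / r2;

        printf("ir1=%2lld -> row i11=%lld, batch (i12=%lld, i13=%lld), src0 plane (i02=%lld, i03=%lld)\n",
               (long long) ir1, (long long) i11, (long long) i12, (long long) i13,
               (long long) i02, (long long) i03);
    }
    return 0;
}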
|
12323
|
+
|
12324
|
+
static void ggml_compute_forward_mul_mat(
|
12325
|
+
const struct ggml_compute_params * params,
|
12326
|
+
struct ggml_tensor * dst,
|
12327
|
+
struct ggml_compute_state * state) {
|
12328
|
+
|
12329
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
12330
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
12331
|
+
|
12332
|
+
int64_t t0 = ggml_perf_time_us();
|
12333
|
+
UNUSED(t0);
|
12334
|
+
|
12335
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
12336
|
+
|
12337
|
+
const int ith = params->ith;
|
12338
|
+
const int nth = params->nth;
|
12339
|
+
|
12340
|
+
const enum ggml_type type = src0->type;
|
12341
|
+
|
12342
|
+
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
12343
|
+
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
12344
|
+
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
12345
|
+
|
12346
|
+
GGML_ASSERT(ne0 == ne01);
|
12347
|
+
GGML_ASSERT(ne1 == ne11);
|
12348
|
+
GGML_ASSERT(ne2 == ne12);
|
12349
|
+
GGML_ASSERT(ne3 == ne13);
|
12350
|
+
|
12351
|
+
// we don't support permuted src0 or src1
|
12352
|
+
GGML_ASSERT(nb00 == ggml_type_size(type));
|
12353
|
+
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
12354
|
+
|
12355
|
+
// dst cannot be transposed or permuted
|
12356
|
+
GGML_ASSERT(nb0 == sizeof(float));
|
12357
|
+
GGML_ASSERT(nb0 <= nb1);
|
12358
|
+
GGML_ASSERT(nb1 <= nb2);
|
12359
|
+
GGML_ASSERT(nb2 <= nb3);
|
12360
|
+
|
12361
|
+
// broadcast factors
|
12362
|
+
const int64_t r2 = ne12 / ne02;
|
12363
|
+
const int64_t r3 = ne13 / ne03;
|
12364
|
+
UNUSED(r2);
|
12365
|
+
UNUSED(r3);
|
12366
|
+
|
12367
|
+
// nb01 >= nb00 - src0 is not transposed
|
12368
|
+
// compute by src0 rows
|
12369
|
+
|
12370
|
+
#if defined(GGML_USE_CLBLAST)
|
12371
|
+
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
12372
|
+
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
12373
|
+
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
12374
|
+
}
|
12375
|
+
return;
|
12376
|
+
}
|
12377
|
+
#endif
|
12378
|
+
|
12379
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12380
|
+
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
12381
|
+
const int64_t ne_plane = ne01*ne00;
|
12382
|
+
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
12383
|
+
UNUSED(desired_wsize);
|
12384
|
+
|
12385
|
+
if (params->type == GGML_TASK_TYPE_INIT) {
|
12386
|
+
if (type != GGML_TYPE_F32) {
|
12387
|
+
assert(params->wsize >= desired_wsize);
|
12388
|
+
// parallelize by src0 rows
|
12389
|
+
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
12390
|
+
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
12391
|
+
// broadcast src0 into src1 across 2nd,3rd dimension
|
12392
|
+
const int64_t i03 = i13/r3;
|
12393
|
+
const int64_t i02 = i12/r2;
|
12394
|
+
|
12395
|
+
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
12396
|
+
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
12397
|
+
ggml_to_float_t const to_float = type_traits[type].to_float;
|
11819
12398
|
|
11820
12399
|
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
11821
12400
|
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
@@ -11865,6 +12444,8 @@ static void ggml_compute_forward_mul_mat(
|
|
11865
12444
|
#endif
|
11866
12445
|
|
11867
12446
|
#if GGML_USE_LLAMAFILE
|
12447
|
+
const bool src1_cont = ggml_is_contiguous(src1);
|
12448
|
+
|
11868
12449
|
if (src1_cont) {
|
11869
12450
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
11870
12451
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
@@ -11890,6 +12471,8 @@ UseGgmlGemm1:;
|
|
11890
12471
|
if (ith != 0) {
|
11891
12472
|
return;
|
11892
12473
|
}
|
12474
|
+
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
|
12475
|
+
atomic_store(&state->shared->current_chunk, nth);
|
11893
12476
|
if (src1->type != vec_dot_type) {
|
11894
12477
|
char * wdata = params->wdata;
|
11895
12478
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
@@ -11914,11 +12497,11 @@ UseGgmlGemm1:;
|
|
11914
12497
|
return;
|
11915
12498
|
}
|
11916
12499
|
|
11917
|
-
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
11918
|
-
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
11919
|
-
|
11920
12500
|
#if GGML_USE_LLAMAFILE
|
11921
12501
|
if (src1->type != vec_dot_type) {
|
12502
|
+
const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
12503
|
+
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
12504
|
+
|
11922
12505
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
11923
12506
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
11924
12507
|
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
@@ -11939,98 +12522,87 @@ UseGgmlGemm1:;
|
|
11939
12522
|
UseGgmlGemm2:;
|
11940
12523
|
#endif
|
11941
12524
|
|
11942
|
-
|
11943
|
-
|
11944
|
-
|
11945
|
-
|
11946
|
-
|
11947
|
-
// distribute the thread work across the inner or outer loop based on which one is larger
|
11948
|
-
|
11949
|
-
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
11950
|
-
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
11951
|
-
|
11952
|
-
const int64_t ith0 = ith % nth0;
|
11953
|
-
const int64_t ith1 = ith / nth0;
|
11954
|
-
|
11955
|
-
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
|
11956
|
-
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
|
11957
|
-
|
11958
|
-
const int64_t ir010 = dr0*ith0;
|
11959
|
-
const int64_t ir011 = MIN(ir010 + dr0, nr0);
|
11960
|
-
|
11961
|
-
const int64_t ir110 = dr1*ith1;
|
11962
|
-
const int64_t ir111 = MIN(ir110 + dr1, nr1);
|
11963
|
-
|
11964
|
-
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
|
11965
|
-
|
11966
|
-
// threads with no work simply yield (not sure if it helps)
|
11967
|
-
if (ir010 >= ir011 || ir110 >= ir111) {
|
11968
|
-
sched_yield();
|
11969
|
-
return;
|
11970
|
-
}
|
12525
|
+
#ifdef GGML_PERF
|
12526
|
+
int chunks_executed = 0;
|
12527
|
+
UNUSED(chunks_executed);
|
12528
|
+
#endif
|
11971
12529
|
|
11972
|
-
|
11973
|
-
|
12530
|
+
// This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
|
12531
|
+
const int64_t nr0 = ne0;
|
11974
12532
|
|
11975
|
-
//
|
11976
|
-
const int64_t
|
11977
|
-
const int64_t blck_1 = 16;
|
12533
|
+
// This is the size of the rest of the dimensions of the result
|
12534
|
+
const int64_t nr1 = ne1 * ne2 * ne3;
|
11978
12535
|
|
11979
12536
|
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
11980
|
-
int64_t
|
12537
|
+
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
|
11981
12538
|
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
11982
12539
|
// this check can be removed once they are extended to support odd numbered rows/cols too
|
11983
12540
|
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
11984
|
-
|
12541
|
+
num_rows_per_vec_dot = 1;
|
11985
12542
|
}
|
11986
12543
|
|
11987
|
-
|
12544
|
+
// Now select a reasonable chunk size.
|
12545
|
+
int chunk_size = 16;
|
11988
12546
|
|
11989
|
-
//
|
11990
|
-
|
11991
|
-
|
12547
|
+
// We need to step up the size if it's small
|
12548
|
+
if (nr0 == 1 || nr1 == 1) {
|
12549
|
+
chunk_size = 64;
|
12550
|
+
}
|
11992
12551
|
|
11993
|
-
|
11994
|
-
|
11995
|
-
|
11996
|
-
|
11997
|
-
|
11998
|
-
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
12552
|
+
// distribute the work across the inner or outer loop based on which one is larger
|
12553
|
+
// The number of chunks in the 0/1 dim.
|
12554
|
+
// CEIL(nr0/chunk_size)
|
12555
|
+
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
|
12556
|
+
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
11999
12557
|
|
12000
|
-
|
12001
|
-
|
12002
|
-
|
12558
|
+
// If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
|
12559
|
+
// Also, chunking by thread was measured to perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
|
12560
|
+
// In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
|
12561
|
+
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
|
12562
|
+
// distribute the thread work across the inner or outer loop based on which one is larger
|
12563
|
+
nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
12564
|
+
nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
12565
|
+
}
|
12003
12566
|
|
12004
|
-
|
12005
|
-
|
12006
|
-
|
12567
|
+
// The number of elements in each chunk
|
12568
|
+
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
12569
|
+
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
12007
12570
|
|
12008
|
-
|
12571
|
+
//if (ith == 0)
|
12572
|
+
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
|
12009
12573
|
|
12010
|
-
|
12011
|
-
|
12012
|
-
// the original src1 data pointer, so we should index using the indices directly
|
12013
|
-
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
12014
|
-
const char * src1_col = (const char *) wdata +
|
12015
|
-
(src1_cont || src1->type != vec_dot_type
|
12016
|
-
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
12017
|
-
: (i11*nb11 + i12*nb12 + i13*nb13));
|
12018
|
-
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
12574
|
+
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
12575
|
+
int current_chunk = ith;
|
12019
12576
|
|
12020
|
-
|
12021
|
-
|
12022
|
-
|
12577
|
+
while (current_chunk < nchunk0 * nchunk1) {
|
12578
|
+
const int64_t ith0 = current_chunk % nchunk0;
|
12579
|
+
const int64_t ith1 = current_chunk / nchunk0;
|
12023
12580
|
|
12024
|
-
|
12025
|
-
|
12026
|
-
}
|
12581
|
+
const int64_t ir0_start = dr0 * ith0;
|
12582
|
+
const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
|
12027
12583
|
|
12028
|
-
|
12029
|
-
|
12030
|
-
|
12031
|
-
|
12584
|
+
const int64_t ir1_start = dr1 * ith1;
|
12585
|
+
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
|
12586
|
+
|
12587
|
+
ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
|
12588
|
+
|
12589
|
+
#ifdef GGML_PERF
|
12590
|
+
chunks_executed++;
|
12591
|
+
#endif
|
12592
|
+
|
12593
|
+
if (nth >= nchunk0 * nchunk1) {
|
12594
|
+
break;
|
12032
12595
|
}
|
12596
|
+
|
12597
|
+
current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
|
12033
12598
|
}
|
12599
|
+
|
12600
|
+
#ifdef GGML_PERF
|
12601
|
+
// These numbers are useful when trying to measure how well the threading scheduling works.
|
12602
|
+
//int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
|
12603
|
+
//float time = (ggml_perf_time_us() - t0);
|
12604
|
+
//printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
|
12605
|
+
#endif
|
12034
12606
|
}
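The new scheduling is a plain work-stealing counter: thread 0 stores nth into the shared chunk counter during INIT, every thread starts on the chunk equal to its own index, and each finished thread claims the next unprocessed chunk with atomic_fetch_add, so no other coordination is required. A reduced sketch of the same pattern with C11 atomics and pthreads (chunk processing replaced by a printf):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTH     4
#define NCHUNK 16

static atomic_int current_chunk;   // shared counter, like state->shared->current_chunk

static void * worker(void * arg) {
    const int ith = (int)(long) arg;

    int chunk = ith;               // the first chunk comes from the thread id
    while (chunk < NCHUNK) {
        printf("thread %d processes chunk %d\n", ith, chunk); // real code: mul_mat_one_chunk

        if (NTH >= NCHUNK) {
            break;                 // at most one chunk per thread, nothing left to claim
        }
        chunk = atomic_fetch_add(&current_chunk, 1);
    }
    return NULL;
}

int main(void) {
    atomic_store(&current_chunk, NTH);   // chunks 0 .. NTH-1 are implicitly taken already
    pthread_t th[NTH];
    for (long i = 0; i < NTH; ++i) {
        pthread_create(&th[i], NULL, worker, (void *) i);
    }
    for (int i = 0; i < NTH; ++i) {
        pthread_join(th[i], NULL);
    }
    return 0;   // build with: cc -pthread
}

Because atomic_fetch_add hands out each previous counter value exactly once, every chunk index is processed by exactly one thread, which is what makes the nth >= nchunk0 * nchunk1 shortcut in the real code safe.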
|
12035
12607
|
|
12036
12608
|
// ggml_compute_forward_mul_mat_id
|
@@ -13333,7 +13905,6 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13333
13905
|
|
13334
13906
|
const struct ggml_tensor * src0 = dst->src[0];
|
13335
13907
|
const struct ggml_tensor * src1 = dst->src[1];
|
13336
|
-
const struct ggml_tensor * src2 = dst->src[2];
|
13337
13908
|
|
13338
13909
|
assert(ggml_is_contiguous(dst));
|
13339
13910
|
assert(ggml_are_same_shape(src0, dst));
|
@@ -13359,8 +13930,8 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13359
13930
|
|
13360
13931
|
// TODO: is this supposed to be ceil instead of floor?
|
13361
13932
|
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
|
13362
|
-
const uint32_t n_head_kv = ne02;
|
13363
|
-
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
|
13933
|
+
const uint32_t n_head = ne02;
|
13934
|
+
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
13364
13935
|
|
13365
13936
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
13366
13937
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
@@ -13377,13 +13948,13 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13377
13948
|
|
13378
13949
|
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
13379
13950
|
|
13380
|
-
|
13381
|
-
ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
|
13382
|
-
float * pos_f32 = src2 ? (float *) src2->data : src0->data;
|
13383
|
-
|
13384
|
-
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
|
13951
|
+
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
13385
13952
|
|
13386
13953
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
13954
|
+
// ALiBi
|
13955
|
+
const uint32_t h = (i1/ne01)%ne02; // head
|
13956
|
+
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
|
13957
|
+
|
13387
13958
|
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
13388
13959
|
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
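The slope computed per row above is the standard ALiBi head slope: with m0 = 2^(-max_bias/n_head_log2) and m1 = 2^(-max_bias/(2*n_head_log2)), head h gets m0^(h+1) below n_head_log2 and m1^(2*(h-n_head_log2)+1) above it, so for a power-of-two head count and max_bias = 8 the slopes come out as 1/2, 1/4, ..., 1/2^n_head. A small standalone check of that claim, using those example values:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// same slope computation as in ggml_compute_forward_soft_max_f32, pulled out
static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
}

int main(void) {
    // example values: 8 heads, max_bias = 8 -> slopes 1/2, 1/4, ..., 1/256
    for (uint32_t h = 0; h < 8; ++h) {
        printf("head %u: slope = %g\n", h, alibi_slope(h, 8, 8.0f));
    }
    return 0;
}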
|
13389
13960
|
|
@@ -13396,27 +13967,11 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13396
13967
|
if (mp_f32) {
|
13397
13968
|
if (use_f16) {
|
13398
13969
|
for (int i = 0; i < nc; ++i) {
|
13399
|
-
wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
|
13400
|
-
}
|
13401
|
-
} else {
|
13402
|
-
for (int i = 0; i < nc; ++i) {
|
13403
|
-
wp[i] += mp_f32[i];
|
13404
|
-
}
|
13405
|
-
}
|
13406
|
-
}
|
13407
|
-
|
13408
|
-
// ALiBi bias
|
13409
|
-
if (max_bias > 0.0f) {
|
13410
|
-
const uint32_t h = (i1/ne01)%ne02; // head
|
13411
|
-
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
|
13412
|
-
|
13413
|
-
if (use_f16) {
|
13414
|
-
for (int i = 0; i < nc; ++i) {
|
13415
|
-
wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
|
13970
|
+
wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
|
13416
13971
|
}
|
13417
13972
|
} else {
|
13418
13973
|
for (int i = 0; i < nc; ++i) {
|
13419
|
-
wp[i] += slope*pos_f32[i];
|
13974
|
+
wp[i] += slope*mp_f32[i];
|
13420
13975
|
}
|
13421
13976
|
}
|
13422
13977
|
}
|
@@ -13431,22 +13986,7 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13431
13986
|
float max = -INFINITY;
|
13432
13987
|
ggml_vec_max_f32(nc, &max, wp);
|
13433
13988
|
|
13434
|
-
ggml_float sum = 0.0;
|
13435
|
-
|
13436
|
-
uint16_t scvt;
|
13437
|
-
for (int i = 0; i < nc; i++) {
|
13438
|
-
if (wp[i] == -INFINITY) {
|
13439
|
-
dp[i] = 0.0f;
|
13440
|
-
} else {
|
13441
|
-
// const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
|
13442
|
-
ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
|
13443
|
-
memcpy(&scvt, &s, sizeof(scvt));
|
13444
|
-
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
|
13445
|
-
sum += (ggml_float)val;
|
13446
|
-
dp[i] = val;
|
13447
|
-
}
|
13448
|
-
}
|
13449
|
-
|
13989
|
+
ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
|
13450
13990
|
assert(sum > 0.0);
|
13451
13991
|
|
13452
13992
|
sum = 1.0/sum;
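ggml_vec_soft_max_f32 is not shown in this excerpt; judging from the loop it replaces, it presumably writes exp(wp[i] - max) into dp (zero for -INFINITY entries) and returns the accumulated sum, dropping the old f16 exp lookup table. A sketch under that assumption (the names below are illustrative):

#include <math.h>
#include <stdio.h>

// assumed behaviour of ggml_vec_soft_max_f32: dp[i] = exp(wp[i] - max), returns the sum
static double vec_soft_max_f32(const int n, float * dp, const float * wp, float max) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
        const float val = (wp[i] == -INFINITY) ? 0.0f : expf(wp[i] - max);
        dp[i] = val;
        sum  += (double) val;
    }
    return sum;
}

int main(void) {
    float wp[4] = { 1.0f, 2.0f, 3.0f, -INFINITY };
    float dp[4];

    const double sum = vec_soft_max_f32(4, dp, wp, 3.0f /* row maximum */);
    for (int i = 0; i < 4; ++i) {
        printf("p[%d] = %f\n", i, dp[i] / sum);   // normalised softmax probabilities
    }
    return 0;
}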
|
@@ -13578,68 +14118,9 @@ static void ggml_compute_forward_soft_max_back(
|
|
13578
14118
|
}
|
13579
14119
|
}
|
13580
14120
|
|
13581
|
-
//
|
13582
|
-
|
13583
|
-
static void ggml_compute_forward_alibi_f32(
|
13584
|
-
const struct ggml_compute_params * params,
|
13585
|
-
struct ggml_tensor * dst) {
|
13586
|
-
|
13587
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
13588
|
-
|
13589
|
-
assert(params->ith == 0);
|
13590
|
-
|
13591
|
-
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
13592
|
-
return;
|
13593
|
-
}
|
13594
|
-
|
13595
|
-
//const int n_past = ((int32_t *) dst->op_params)[0];
|
13596
|
-
const int n_head = ((int32_t *) dst->op_params)[1];
|
13597
|
-
float max_bias;
|
13598
|
-
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
13599
|
-
|
13600
|
-
const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
13601
|
-
const int64_t ne1 = src0->ne[1]; // seq_len_without_past
|
13602
|
-
const int64_t ne2 = src0->ne[2]; // n_head -> this is k
|
13603
|
-
//const int64_t ne3 = src0->ne[3]; // 1 -> bsz
|
13604
|
-
|
13605
|
-
const int64_t n = ggml_nrows(src0);
|
13606
|
-
const int64_t ne2_ne3 = n/ne1; // ne2*ne3
|
13607
|
-
|
13608
|
-
const size_t nb0 = src0->nb[0];
|
13609
|
-
const size_t nb1 = src0->nb[1];
|
13610
|
-
const size_t nb2 = src0->nb[2];
|
13611
|
-
//const int nb3 = src0->nb[3];
|
13612
|
-
|
13613
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
13614
|
-
GGML_ASSERT(n_head == ne2);
|
13615
|
-
|
13616
|
-
// add alibi to src0 (KQ_scaled)
|
13617
|
-
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
13618
|
-
|
13619
|
-
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
13620
|
-
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
13621
|
-
|
13622
|
-
for (int64_t k = 0; k < ne2_ne3; k++) {
|
13623
|
-
// TODO: k*nb2 or k*nb3
|
13624
|
-
float m_k;
|
13625
|
-
|
13626
|
-
if (k < n_heads_log2_floor) {
|
13627
|
-
m_k = powf(m0, k + 1);
|
13628
|
-
} else {
|
13629
|
-
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
13630
|
-
}
|
13631
|
-
|
13632
|
-
for (int64_t i = 0; i < ne0; i++) {
|
13633
|
-
for (int64_t j = 0; j < ne1; j++) {
|
13634
|
-
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
|
13635
|
-
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
|
13636
|
-
pdst[0] = i * m_k + src[0];
|
13637
|
-
}
|
13638
|
-
}
|
13639
|
-
}
|
13640
|
-
}
|
14121
|
+
// ggml_compute_forward_clamp
|
13641
14122
|
|
13642
|
-
static void
|
14123
|
+
static void ggml_compute_forward_clamp_f32(
|
13643
14124
|
const struct ggml_compute_params * params,
|
13644
14125
|
struct ggml_tensor * dst) {
|
13645
14126
|
|
@@ -13651,71 +14132,48 @@ static void ggml_compute_forward_alibi_f16(
|
|
13651
14132
|
return;
|
13652
14133
|
}
|
13653
14134
|
|
13654
|
-
|
13655
|
-
|
13656
|
-
float
|
13657
|
-
memcpy(&
|
14135
|
+
float min;
|
14136
|
+
float max;
|
14137
|
+
memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
|
14138
|
+
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
13658
14139
|
|
13659
|
-
const int
|
13660
|
-
const int
|
13661
|
-
const int ne2 = src0->ne[2]; // n_head -> this is k
|
13662
|
-
//const int ne3 = src0->ne[3]; // 1 -> bsz
|
14140
|
+
const int ith = params->ith;
|
14141
|
+
const int nth = params->nth;
|
13663
14142
|
|
13664
14143
|
const int n = ggml_nrows(src0);
|
13665
|
-
const int
|
13666
|
-
|
13667
|
-
const int nb0 = src0->nb[0];
|
13668
|
-
const int nb1 = src0->nb[1];
|
13669
|
-
const int nb2 = src0->nb[2];
|
13670
|
-
//const int nb3 = src0->nb[3];
|
13671
|
-
|
13672
|
-
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
|
13673
|
-
//GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
|
13674
|
-
GGML_ASSERT(n_head == ne2);
|
13675
|
-
|
13676
|
-
// add alibi to src0 (KQ_scaled)
|
13677
|
-
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
14144
|
+
const int nc = src0->ne[0];
|
13678
14145
|
|
13679
|
-
const
|
13680
|
-
const
|
14146
|
+
const size_t nb00 = src0->nb[0];
|
14147
|
+
const size_t nb01 = src0->nb[1];
|
13681
14148
|
|
13682
|
-
|
13683
|
-
|
13684
|
-
float m_k;
|
14149
|
+
const size_t nb0 = dst->nb[0];
|
14150
|
+
const size_t nb1 = dst->nb[1];
|
13685
14151
|
|
13686
|
-
|
13687
|
-
|
13688
|
-
} else {
|
13689
|
-
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
13690
|
-
}
|
14152
|
+
GGML_ASSERT( nb0 == sizeof(float));
|
14153
|
+
GGML_ASSERT(nb00 == sizeof(float));
|
13691
14154
|
|
13692
|
-
|
13693
|
-
|
13694
|
-
|
13695
|
-
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
|
14155
|
+
for (int j = ith; j < n; j += nth) {
|
14156
|
+
float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
|
14157
|
+
float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
|
13696
14158
|
|
13697
|
-
|
13698
|
-
|
13699
|
-
}
|
14159
|
+
for (int i = 0; i < nc; i++) {
|
14160
|
+
dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
|
13700
14161
|
}
|
13701
14162
|
}
|
13702
14163
|
}
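Like most element-wise kernels in this file, the relocated clamp implementation splits work by row striding: thread ith processes rows ith, ith + nth, ith + 2*nth, ..., so no range bookkeeping or synchronisation is needed. A stripped-down sketch of that pattern on a flat buffer (the driver below serialises the per-thread calls for brevity; data and sizes are made up):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// clamp one thread's share of the rows: rows ith, ith+nth, ith+2*nth, ...
static void clamp_rows(float * dst, const float * src, int nrows, int ncols,
                       float min, float max, int ith, int nth) {
    for (int j = ith; j < nrows; j += nth) {
        for (int i = 0; i < ncols; ++i) {
            dst[j*ncols + i] = MAX(MIN(src[j*ncols + i], max), min);
        }
    }
}

int main(void) {
    const int nrows = 4, ncols = 3, nth = 2;
    const float src[12] = { -5, 0, 5, 10, -10, 1, 2, 3, 4, -1, 6, 7 };
    float dst[12];

    // the real code runs each ith on its own thread; serialised here for brevity
    for (int ith = 0; ith < nth; ++ith) {
        clamp_rows(dst, src, nrows, ncols, 0.0f, 4.0f, ith, nth);
    }
    for (int k = 0; k < 12; ++k) {
        printf("%g ", dst[k]);
    }
    printf("\n");
    return 0;
}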
|
13703
14164
|
|
13704
|
-
static void
|
14165
|
+
static void ggml_compute_forward_clamp(
|
13705
14166
|
const struct ggml_compute_params * params,
|
13706
14167
|
struct ggml_tensor * dst) {
|
13707
14168
|
|
13708
14169
|
const struct ggml_tensor * src0 = dst->src[0];
|
13709
14170
|
|
13710
14171
|
switch (src0->type) {
|
13711
|
-
case GGML_TYPE_F16:
|
13712
|
-
{
|
13713
|
-
ggml_compute_forward_alibi_f16(params, dst);
|
13714
|
-
} break;
|
13715
14172
|
case GGML_TYPE_F32:
|
13716
14173
|
{
|
13717
|
-
|
14174
|
+
ggml_compute_forward_clamp_f32(params, dst);
|
13718
14175
|
} break;
|
14176
|
+
case GGML_TYPE_F16:
|
13719
14177
|
case GGML_TYPE_BF16:
|
13720
14178
|
case GGML_TYPE_Q4_0:
|
13721
14179
|
case GGML_TYPE_Q4_1:
|
@@ -13750,102 +14208,12 @@ static void ggml_compute_forward_alibi(
|
|
13750
14208
|
}
|
13751
14209
|
}
|
13752
14210
|
|
13753
|
-
//
|
13754
|
-
|
13755
|
-
static void ggml_compute_forward_clamp_f32(
|
13756
|
-
const struct ggml_compute_params * params,
|
13757
|
-
struct ggml_tensor * dst) {
|
13758
|
-
|
13759
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
14211
|
+
// ggml_compute_forward_rope
|
13760
14212
|
|
13761
|
-
|
13762
|
-
|
13763
|
-
|
13764
|
-
|
13765
|
-
}
|
13766
|
-
|
13767
|
-
float min;
|
13768
|
-
float max;
|
13769
|
-
memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
|
13770
|
-
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
13771
|
-
|
13772
|
-
const int ith = params->ith;
|
13773
|
-
const int nth = params->nth;
|
13774
|
-
|
13775
|
-
const int n = ggml_nrows(src0);
|
13776
|
-
const int nc = src0->ne[0];
|
13777
|
-
|
13778
|
-
const size_t nb00 = src0->nb[0];
|
13779
|
-
const size_t nb01 = src0->nb[1];
|
13780
|
-
|
13781
|
-
const size_t nb0 = dst->nb[0];
|
13782
|
-
const size_t nb1 = dst->nb[1];
|
13783
|
-
|
13784
|
-
GGML_ASSERT( nb0 == sizeof(float));
|
13785
|
-
GGML_ASSERT(nb00 == sizeof(float));
|
13786
|
-
|
13787
|
-
for (int j = ith; j < n; j += nth) {
|
13788
|
-
float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
|
13789
|
-
float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
|
13790
|
-
|
13791
|
-
for (int i = 0; i < nc; i++) {
|
13792
|
-
dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
|
13793
|
-
}
|
13794
|
-
}
|
13795
|
-
}
|
13796
|
-
|
13797
|
-
static void ggml_compute_forward_clamp(
|
13798
|
-
const struct ggml_compute_params * params,
|
13799
|
-
struct ggml_tensor * dst) {
|
13800
|
-
|
13801
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
13802
|
-
|
13803
|
-
switch (src0->type) {
|
13804
|
-
case GGML_TYPE_F32:
|
13805
|
-
{
|
13806
|
-
ggml_compute_forward_clamp_f32(params, dst);
|
13807
|
-
} break;
|
13808
|
-
case GGML_TYPE_F16:
|
13809
|
-
case GGML_TYPE_BF16:
|
13810
|
-
case GGML_TYPE_Q4_0:
|
13811
|
-
case GGML_TYPE_Q4_1:
|
13812
|
-
case GGML_TYPE_Q5_0:
|
13813
|
-
case GGML_TYPE_Q5_1:
|
13814
|
-
case GGML_TYPE_Q8_0:
|
13815
|
-
case GGML_TYPE_Q8_1:
|
13816
|
-
case GGML_TYPE_Q2_K:
|
13817
|
-
case GGML_TYPE_Q3_K:
|
13818
|
-
case GGML_TYPE_Q4_K:
|
13819
|
-
case GGML_TYPE_Q5_K:
|
13820
|
-
case GGML_TYPE_Q6_K:
|
13821
|
-
case GGML_TYPE_IQ2_XXS:
|
13822
|
-
case GGML_TYPE_IQ2_XS:
|
13823
|
-
case GGML_TYPE_IQ3_XXS:
|
13824
|
-
case GGML_TYPE_IQ1_S:
|
13825
|
-
case GGML_TYPE_IQ1_M:
|
13826
|
-
case GGML_TYPE_IQ4_NL:
|
13827
|
-
case GGML_TYPE_IQ4_XS:
|
13828
|
-
case GGML_TYPE_IQ3_S:
|
13829
|
-
case GGML_TYPE_IQ2_S:
|
13830
|
-
case GGML_TYPE_Q8_K:
|
13831
|
-
case GGML_TYPE_I8:
|
13832
|
-
case GGML_TYPE_I16:
|
13833
|
-
case GGML_TYPE_I32:
|
13834
|
-
case GGML_TYPE_I64:
|
13835
|
-
case GGML_TYPE_F64:
|
13836
|
-
case GGML_TYPE_COUNT:
|
13837
|
-
{
|
13838
|
-
GGML_ASSERT(false);
|
13839
|
-
} break;
|
13840
|
-
}
|
13841
|
-
}
|
13842
|
-
|
13843
|
-
// ggml_compute_forward_rope
|
13844
|
-
|
13845
|
-
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
13846
|
-
const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
|
13847
|
-
return 1 - MIN(1, MAX(0, y));
|
13848
|
-
}
|
14213
|
+
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
14214
|
+
const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
|
14215
|
+
return 1 - MIN(1, MAX(0, y));
|
14216
|
+
}
|
13849
14217
|
|
13850
14218
|
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
13851
14219
|
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
@@ -13905,6 +14273,7 @@ static void ggml_compute_forward_rope_f32(
|
|
13905
14273
|
|
13906
14274
|
const struct ggml_tensor * src0 = dst->src[0];
|
13907
14275
|
const struct ggml_tensor * src1 = dst->src[1];
|
14276
|
+
const struct ggml_tensor * src2 = dst->src[2];
|
13908
14277
|
|
13909
14278
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
13910
14279
|
return;
|
@@ -13964,6 +14333,17 @@ static void ggml_compute_forward_rope_f32(
|
|
13964
14333
|
const bool is_neox = mode & 2;
|
13965
14334
|
const bool is_glm = mode & 4;
|
13966
14335
|
|
14336
|
+
const float * freq_factors = NULL;
|
14337
|
+
if (is_neox) {
|
14338
|
+
if (src2 != NULL) {
|
14339
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
14340
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
14341
|
+
freq_factors = (const float *) src2->data;
|
14342
|
+
}
|
14343
|
+
} else {
|
14344
|
+
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
14345
|
+
}
|
14346
|
+
|
13967
14347
|
// backward process uses inverse rotation by cos and sin.
|
13968
14348
|
// cos and sin build a rotation matrix, where the inverse is the transpose.
|
13969
14349
|
// this essentially just switches the sign of sin.
|
@@ -14040,10 +14420,11 @@ static void ggml_compute_forward_rope_f32(
|
|
14040
14420
|
|
14041
14421
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
14042
14422
|
float cur_rot = inv_ndims * ic - ib;
|
14423
|
+
float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
|
14043
14424
|
|
14044
14425
|
float cos_theta, sin_theta;
|
14045
14426
|
rope_yarn(
|
14046
|
-
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
14427
|
+
theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
14047
14428
|
&cos_theta, &sin_theta
|
14048
14429
|
);
|
14049
14430
|
sin_theta *= sin_sign;
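The new optional src2 tensor (freq_factors) simply divides the base rotation angle of each dimension pair by a per-pair factor before rope_yarn runs, so a factor of 1.0f reproduces the previous behaviour and larger factors lower that pair's frequency. A reduced sketch of just that adjustment; the yarn correction and the rotation itself are omitted, and the factor values are made up:

#include <stdio.h>

// mirrors the new per-pair adjustment: theta_base / freq_factors[ic/2], or 1.0f if absent
static float adjusted_theta(float theta_base, const float * freq_factors, int ic) {
    const float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
    return theta_base / freq_factor;
}

int main(void) {
    // hypothetical factors for 4 rotated pairs (8 dimensions), e.g. from a long-context fine-tune
    const float factors[4] = { 1.0f, 1.0f, 2.0f, 4.0f };
    const float theta_base = 100.0f;   // made-up base angle for one position

    for (int ic = 0; ic < 8; ic += 2) {
        printf("pair %d: theta %g -> %g (with factors), %g (without)\n",
               ic/2, theta_base,
               adjusted_theta(theta_base, factors, ic),
               adjusted_theta(theta_base, NULL, ic));
    }
    return 0;
}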
|
@@ -14076,6 +14457,7 @@ static void ggml_compute_forward_rope_f32(
|
|
14076
14457
|
}
|
14077
14458
|
}
|
14078
14459
|
|
14460
|
+
// TODO: deduplicate f16/f32 code
|
14079
14461
|
static void ggml_compute_forward_rope_f16(
|
14080
14462
|
const struct ggml_compute_params * params,
|
14081
14463
|
struct ggml_tensor * dst,
|
@@ -14083,6 +14465,7 @@ static void ggml_compute_forward_rope_f16(
|
|
14083
14465
|
|
14084
14466
|
const struct ggml_tensor * src0 = dst->src[0];
|
14085
14467
|
const struct ggml_tensor * src1 = dst->src[1];
|
14468
|
+
const struct ggml_tensor * src2 = dst->src[2];
|
14086
14469
|
|
14087
14470
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
14088
14471
|
return;
|
@@ -14135,6 +14518,17 @@ static void ggml_compute_forward_rope_f16(
|
|
14135
14518
|
const bool is_neox = mode & 2;
|
14136
14519
|
const bool is_glm = mode & 4;
|
14137
14520
|
|
14521
|
+
const float * freq_factors = NULL;
|
14522
|
+
if (is_neox) {
|
14523
|
+
if (src2 != NULL) {
|
14524
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
14525
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
14526
|
+
freq_factors = (const float *) src2->data;
|
14527
|
+
}
|
14528
|
+
} else {
|
14529
|
+
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
14530
|
+
}
|
14531
|
+
|
14138
14532
|
// backward process uses inverse rotation by cos and sin.
|
14139
14533
|
// cos and sin build a rotation matrix, where the inverse is the transpose.
|
14140
14534
|
// this essentially just switches the sign of sin.
|
@@ -14207,10 +14601,11 @@ static void ggml_compute_forward_rope_f16(
|
|
14207
14601
|
|
14208
14602
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
14209
14603
|
float cur_rot = inv_ndims * ic - ib;
|
14604
|
+
float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
|
14210
14605
|
|
14211
14606
|
float cos_theta, sin_theta;
|
14212
14607
|
rope_yarn(
|
14213
|
-
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
14608
|
+
theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
14214
14609
|
&cos_theta, &sin_theta
|
14215
14610
|
);
|
14216
14611
|
sin_theta *= sin_sign;
|
@@ -14972,25 +15367,28 @@ static void ggml_compute_forward_upscale_f32(
|
|
14972
15367
|
return;
|
14973
15368
|
}
|
14974
15369
|
|
14975
|
-
GGML_ASSERT(src0->
|
15370
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
14976
15371
|
|
14977
15372
|
const int ith = params->ith;
|
14978
15373
|
const int nth = params->nth;
|
14979
15374
|
|
14980
15375
|
GGML_TENSOR_UNARY_OP_LOCALS
|
14981
15376
|
|
14982
|
-
const
|
15377
|
+
const float sf0 = (float)ne0/src0->ne[0];
|
15378
|
+
const float sf1 = (float)ne1/src0->ne[1];
|
15379
|
+
const float sf2 = (float)ne2/src0->ne[2];
|
15380
|
+
const float sf3 = (float)ne3/src0->ne[3];
|
14983
15381
|
|
14984
15382
|
// TODO: optimize
|
14985
15383
|
|
14986
15384
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
14987
|
-
const int64_t i03 = i3;
|
15385
|
+
const int64_t i03 = i3 / sf3;
|
14988
15386
|
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
|
14989
|
-
const int64_t i02 = i2;
|
15387
|
+
const int64_t i02 = i2 / sf2;
|
14990
15388
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
14991
|
-
const int64_t i01 = i1 /
|
15389
|
+
const int64_t i01 = i1 / sf1;
|
14992
15390
|
for (int64_t i0 = 0; i0 < ne0; i0++) {
|
14993
|
-
const int64_t i00 = i0 /
|
15391
|
+
const int64_t i00 = i0 / sf0;
|
14994
15392
|
|
14995
15393
|
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
14996
15394
|
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
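Upscaling now derives an independent float scale factor per dimension from the output/input size ratio and maps every destination coordinate back to a source coordinate by truncating division, i.e. nearest-neighbour sampling; judging from the removed lines, the old code scaled only the first two dimensions by a single factor. A tiny sketch of the index mapping with made-up sizes:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical 4x3 input upscaled to an 8x6 output (factors 2.0 and 2.0)
    const int64_t src_ne0 = 4, src_ne1 = 3;
    const int64_t dst_ne0 = 8, dst_ne1 = 6;

    const float sf0 = (float) dst_ne0 / src_ne0;
    const float sf1 = (float) dst_ne1 / src_ne1;

    for (int64_t i1 = 0; i1 < dst_ne1; ++i1) {
        for (int64_t i0 = 0; i0 < dst_ne0; ++i0) {
            // truncating division, exactly as in the kernel: nearest-neighbour source index
            const int64_t i01 = i1 / sf1;
            const int64_t i00 = i0 / sf0;
            printf("dst(%lld,%lld) <- src(%lld,%lld)\n",
                   (long long) i0, (long long) i1, (long long) i00, (long long) i01);
        }
    }
    return 0;
}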
|
@@ -15020,6 +15418,7 @@ static void ggml_compute_forward_upscale(
|
|
15020
15418
|
}
|
15021
15419
|
}
|
15022
15420
|
|
15421
|
+
|
15023
15422
|
// ggml_compute_forward_pad
|
15024
15423
|
|
15025
15424
|
static void ggml_compute_forward_pad_f32(
|
@@ -15200,487 +15599,42 @@ static void ggml_compute_forward_argsort_f32(
|
|
15200
15599
|
const int ith = params->ith;
|
15201
15600
|
const int nth = params->nth;
|
15202
15601
|
|
15203
|
-
const int64_t nr = ggml_nrows(src0);
|
15204
|
-
|
15205
|
-
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
15206
|
-
|
15207
|
-
for (int64_t i = ith; i < nr; i += nth) {
|
15208
|
-
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
15209
|
-
const float * src_data = (float *)((char *) src0->data + i*nb01);
|
15210
|
-
|
15211
|
-
for (int64_t j = 0; j < ne0; j++) {
|
15212
|
-
dst_data[j] = j;
|
15213
|
-
}
|
15214
|
-
|
15215
|
-
// C doesn't have a functional sort, so we do a bubble sort instead
|
15216
|
-
for (int64_t j = 0; j < ne0; j++) {
|
15217
|
-
for (int64_t k = j + 1; k < ne0; k++) {
|
15218
|
-
if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
15219
|
-
(order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
15220
|
-
int32_t tmp = dst_data[j];
|
15221
|
-
dst_data[j] = dst_data[k];
|
15222
|
-
dst_data[k] = tmp;
|
15223
|
-
}
|
15224
|
-
}
|
15225
|
-
}
|
15226
|
-
}
|
15227
|
-
}
|
15228
|
-
|
15229
|
-
static void ggml_compute_forward_argsort(
|
15230
|
-
const struct ggml_compute_params * params,
|
15231
|
-
struct ggml_tensor * dst) {
|
15232
|
-
|
15233
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
15234
|
-
|
15235
|
-
switch (src0->type) {
|
15236
|
-
case GGML_TYPE_F32:
|
15237
|
-
{
|
15238
|
-
ggml_compute_forward_argsort_f32(params, dst);
|
15239
|
-
} break;
|
15240
|
-
default:
|
15241
|
-
{
|
15242
|
-
GGML_ASSERT(false);
|
15243
|
-
} break;
|
15244
|
-
}
|
15245
|
-
}
|
15246
|
-
|
15247
|
-
// ggml_compute_forward_flash_attn
|
15248
|
-
|
15249
|
-
static void ggml_compute_forward_flash_attn_f32(
|
15250
|
-
const struct ggml_compute_params * params,
|
15251
|
-
const bool masked,
|
15252
|
-
struct ggml_tensor * dst) {
|
15253
|
-
|
15254
|
-
const struct ggml_tensor * q = dst->src[0];
|
15255
|
-
const struct ggml_tensor * k = dst->src[1];
|
15256
|
-
const struct ggml_tensor * v = dst->src[2];
|
15257
|
-
|
15258
|
-
int64_t t0 = ggml_perf_time_us();
|
15259
|
-
UNUSED(t0);
|
15260
|
-
|
15261
|
-
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
15262
|
-
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
15263
|
-
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
15264
|
-
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
15265
|
-
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
15266
|
-
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
15267
|
-
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
15268
|
-
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
15269
|
-
|
15270
|
-
const int ith = params->ith;
|
15271
|
-
const int nth = params->nth;
|
15272
|
-
|
15273
|
-
const int64_t D = neq0;
|
15274
|
-
const int64_t N = neq1;
|
15275
|
-
const int64_t P = nek1 - N;
|
15276
|
-
const int64_t M = P + N;
|
15277
|
-
|
15278
|
-
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
15279
|
-
|
15280
|
-
GGML_ASSERT(ne0 == D);
|
15281
|
-
GGML_ASSERT(ne1 == N);
|
15282
|
-
GGML_ASSERT(P >= 0);
|
15283
|
-
|
15284
|
-
GGML_ASSERT(nbq0 == sizeof(float));
|
15285
|
-
GGML_ASSERT(nbk0 == sizeof(float));
|
15286
|
-
GGML_ASSERT(nbv0 == sizeof(float));
|
15287
|
-
|
15288
|
-
GGML_ASSERT(neq0 == D);
|
15289
|
-
GGML_ASSERT(nek0 == D);
|
15290
|
-
GGML_ASSERT(nev1 == D);
|
15291
|
-
|
15292
|
-
GGML_ASSERT(neq1 == N);
|
15293
|
-
GGML_ASSERT(nek1 == N + P);
|
15294
|
-
GGML_ASSERT(nev1 == D);
|
15295
|
-
|
15296
|
-
// dst cannot be transposed or permuted
|
15297
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
15298
|
-
GGML_ASSERT(nb0 <= nb1);
|
15299
|
-
GGML_ASSERT(nb1 <= nb2);
|
15300
|
-
GGML_ASSERT(nb2 <= nb3);
|
15301
|
-
|
15302
|
-
if (params->type == GGML_TASK_TYPE_INIT) {
|
15303
|
-
return;
|
15304
|
-
}
|
15305
|
-
|
15306
|
-
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
15307
|
-
return;
|
15308
|
-
}
|
15309
|
-
|
15310
|
-
// parallelize by q rows using ggml_vec_dot_f32
|
15311
|
-
|
15312
|
-
// total rows in q
|
15313
|
-
const int nr = neq1*neq2*neq3;
|
15314
|
-
|
15315
|
-
// rows per thread
|
15316
|
-
const int dr = (nr + nth - 1)/nth;
|
15317
|
-
|
15318
|
-
// row range for this thread
|
15319
|
-
const int ir0 = dr*ith;
|
15320
|
-
const int ir1 = MIN(ir0 + dr, nr);
|
15321
|
-
|
15322
|
-
const float scale = 1.0f/sqrtf(D);
|
15323
|
-
|
15324
|
-
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
|
15325
|
-
|
15326
|
-
for (int ir = ir0; ir < ir1; ++ir) {
|
15327
|
-
// q indices
|
15328
|
-
const int iq3 = ir/(neq2*neq1);
|
15329
|
-
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
15330
|
-
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
15331
|
-
|
15332
|
-
float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
|
15333
|
-
|
15334
|
-
for (int i = M; i < Mup; ++i) {
|
15335
|
-
S[i] = -INFINITY;
|
15336
|
-
}
|
15337
|
-
|
15338
|
-
const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
|
15339
|
-
for (int64_t ic = 0; ic < masked_begin; ++ic) {
|
15340
|
-
// k indices
|
15341
|
-
const int ik3 = iq3;
|
15342
|
-
const int ik2 = iq2 % nek2;
|
15343
|
-
const int ik1 = ic;
|
15344
|
-
|
15345
|
-
// S indices
|
15346
|
-
const int i1 = ik1;
|
15347
|
-
|
15348
|
-
ggml_vec_dot_f32(neq0,
|
15349
|
-
S + i1, 0,
|
15350
|
-
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
15351
|
-
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
15352
|
-
}
|
15353
|
-
|
15354
|
-
// scale
|
15355
|
-
ggml_vec_scale_f32(masked_begin, S, scale);
|
15356
|
-
|
15357
|
-
for (int64_t i = masked_begin; i < M; i++) {
|
15358
|
-
S[i] = -INFINITY;
|
15359
|
-
}
|
15360
|
-
|
15361
|
-
// softmax
|
15362
|
-
// exclude known -INF S[..] values from max and loop
|
15363
|
-
// dont forget to set their SW values to zero
|
15364
|
-
{
|
15365
|
-
float max = -INFINITY;
|
15366
|
-
ggml_vec_max_f32(masked_begin, &max, S);
|
15367
|
-
|
15368
|
-
ggml_float sum = 0.0;
|
15369
|
-
{
|
15370
|
-
#ifdef GGML_SOFT_MAX_ACCELERATE
|
15371
|
-
max = -max;
|
15372
|
-
vDSP_vsadd(S, 1, &max, S, 1, Mup);
|
15373
|
-
vvexpf(S, S, &Mup);
|
15374
|
-
ggml_vec_sum_f32(Mup, &sum, S);
|
15375
|
-
#else
|
15376
|
-
uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
|
15377
|
-
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
15378
|
-
|
15379
|
-
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
15380
|
-
if (i >= masked_begin) {
|
15381
|
-
break;
|
15382
|
-
}
|
15383
|
-
float * SS = S + i;
|
15384
|
-
|
15385
|
-
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
|
15386
|
-
if (i + j >= masked_begin) {
|
15387
|
-
break;
|
15388
|
-
} else if (SS[j] == -INFINITY) {
|
15389
|
-
SS[j] = 0.0f;
|
15390
|
-
} else {
|
15391
|
-
#ifndef GGML_FLASH_ATTN_EXP_FP16
|
15392
|
-
const float val = expf(SS[j] - max);
|
15393
|
-
#else
|
15394
|
-
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
15395
|
-
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
15396
|
-
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
|
15397
|
-
#endif
|
15398
|
-
sump[j] += (ggml_float)val;
|
15399
|
-
SS[j] = val;
|
15400
|
-
}
|
15401
|
-
}
|
15402
|
-
}
|
15403
|
-
|
15404
|
-
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
|
15405
|
-
sum += sump[i];
|
15406
|
-
}
|
15407
|
-
#endif
|
15408
|
-
}
|
15409
|
-
|
15410
|
-
assert(sum > 0.0);
|
15411
|
-
|
15412
|
-
sum = 1.0/sum;
|
15413
|
-
ggml_vec_scale_f32(masked_begin, S, sum);
|
15414
|
-
|
15415
|
-
#ifndef NDEBUG
|
15416
|
-
for (int i = 0; i < masked_begin; ++i) {
|
15417
|
-
assert(!isnan(S[i]));
|
15418
|
-
assert(!isinf(S[i]));
|
15419
|
-
}
|
15420
|
-
#endif
|
15421
|
-
}
|
15422
|
-
|
15423
|
-
for (int64_t ic = 0; ic < nev1; ++ic) {
|
15424
|
-
// dst indices
|
15425
|
-
const int i1 = iq1;
|
15426
|
-
const int i2 = iq2;
|
15427
|
-
const int i3 = iq3;
|
15428
|
-
|
15429
|
-
// v indices
|
15430
|
-
const int iv2 = iq2 % nev2;
|
15431
|
-
const int iv3 = iq3;
|
15432
|
-
|
15433
|
-
ggml_vec_dot_f32(masked_begin,
|
15434
|
-
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
15435
|
-
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
15436
|
-
S, 0, 1);
|
15437
|
-
}
|
15438
|
-
}
|
15439
|
-
}
|
15440
|
-
|
15441
|
-
static void ggml_compute_forward_flash_attn_f16(
|
15442
|
-
const struct ggml_compute_params * params,
|
15443
|
-
const bool masked,
|
15444
|
-
struct ggml_tensor * dst) {
|
15445
|
-
|
15446
|
-
const struct ggml_tensor * q = dst->src[0];
|
15447
|
-
const struct ggml_tensor * k = dst->src[1];
|
15448
|
-
const struct ggml_tensor * v = dst->src[2];
|
15449
|
-
|
15450
|
-
int64_t t0 = ggml_perf_time_us();
|
15451
|
-
UNUSED(t0);
|
15452
|
-
|
15453
|
-
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
15454
|
-
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
15455
|
-
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
15456
|
-
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
15457
|
-
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
15458
|
-
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
15459
|
-
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
15460
|
-
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
15461
|
-
|
15462
|
-
const int ith = params->ith;
|
15463
|
-
const int nth = params->nth;
|
15464
|
-
|
15465
|
-
const int64_t D = neq0;
|
15466
|
-
const int64_t N = neq1;
|
15467
|
-
const int64_t P = nek1 - N;
|
15468
|
-
const int64_t M = P + N;
|
15469
|
-
|
15470
|
-
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
15471
|
-
|
15472
|
-
GGML_ASSERT(ne0 == D);
|
15473
|
-
GGML_ASSERT(ne1 == N);
|
15474
|
-
GGML_ASSERT(P >= 0);
|
15475
|
-
|
15476
|
-
GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
|
15477
|
-
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
|
15478
|
-
GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
|
15479
|
-
|
15480
|
-
GGML_ASSERT(neq0 == D);
|
15481
|
-
GGML_ASSERT(nek0 == D);
|
15482
|
-
GGML_ASSERT(nev1 == D);
|
15483
|
-
|
15484
|
-
GGML_ASSERT(neq1 == N);
|
15485
|
-
GGML_ASSERT(nek1 == N + P);
|
15486
|
-
GGML_ASSERT(nev1 == D);
|
15487
|
-
|
15488
|
-
// dst cannot be transposed or permuted
|
15489
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
15490
|
-
GGML_ASSERT(nb0 <= nb1);
|
15491
|
-
GGML_ASSERT(nb1 <= nb2);
|
15492
|
-
GGML_ASSERT(nb2 <= nb3);
|
15493
|
-
|
15494
|
-
if (params->type == GGML_TASK_TYPE_INIT) {
|
15495
|
-
return;
|
15496
|
-
}
|
15497
|
-
|
15498
|
-
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
15499
|
-
return;
|
15500
|
-
}
|
15501
|
-
|
15502
|
-
// parallelize by q rows using ggml_vec_dot_f32
|
15503
|
-
|
15504
|
-
// total rows in q
|
15505
|
-
const int nr = neq1*neq2*neq3;
|
15506
|
-
|
15507
|
-
// rows per thread
|
15508
|
-
const int dr = (nr + nth - 1)/nth;
|
15509
|
-
|
15510
|
-
// row range for this thread
|
15511
|
-
const int ir0 = dr*ith;
|
15512
|
-
const int ir1 = MIN(ir0 + dr, nr);
|
15513
|
-
|
15514
|
-
const float scale = 1.0f/sqrtf(D);
|
15515
|
-
|
15516
|
-
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
|
15517
|
-
|
15518
|
-
for (int ir = ir0; ir < ir1; ++ir) {
|
15519
|
-
// q indices
|
15520
|
-
const int iq3 = ir/(neq2*neq1);
|
15521
|
-
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
15522
|
-
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
15523
|
-
|
15524
|
-
float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
|
15525
|
-
|
15526
|
-
for (int i = M; i < Mup; ++i) {
|
15527
|
-
S[i] = -INFINITY;
|
15528
|
-
}
|
15529
|
-
|
15530
|
-
if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
|
15531
|
-
for (int64_t ic = 0; ic < nek1; ++ic) {
|
15532
|
-
// k indices
|
15533
|
-
const int ik3 = iq3;
|
15534
|
-
const int ik2 = iq2 % nek2;
|
15535
|
-
const int ik1 = ic;
|
15536
|
-
|
15537
|
-
// S indices
|
15538
|
-
const int i1 = ik1;
|
15539
|
-
|
15540
|
-
ggml_vec_dot_f16(neq0,
|
15541
|
-
S + i1, 0,
|
15542
|
-
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
15543
|
-
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
15544
|
-
}
|
15545
|
-
} else {
|
15546
|
-
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
15547
|
-
// k indices
|
15548
|
-
const int ik3 = iq3;
|
15549
|
-
const int ik2 = iq2 % nek2;
|
15550
|
-
const int ik1 = ic;
|
15551
|
-
|
15552
|
-
// S indices
|
15553
|
-
const int i1 = ik1;
|
15554
|
-
|
15555
|
-
ggml_vec_dot_f16_unroll(neq0, nbk1,
|
15556
|
-
S + i1,
|
15557
|
-
((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
15558
|
-
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
15559
|
-
}
|
15560
|
-
}
|
15561
|
-
|
15562
|
-
// scale
|
15563
|
-
ggml_vec_scale_f32(nek1, S, scale);
|
15564
|
-
|
15565
|
-
if (masked) {
|
15566
|
-
for (int64_t i = P; i < M; i++) {
|
15567
|
-
if (i > P + iq1) {
|
15568
|
-
S[i] = -INFINITY;
|
15569
|
-
}
|
15570
|
-
}
|
15571
|
-
}
|
15572
|
-
|
15573
|
-
// softmax
|
15574
|
-
// todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
|
15575
|
-
// dont forget to set their S values to zero
|
15576
|
-
{
|
15577
|
-
float max = -INFINITY;
|
15578
|
-
ggml_vec_max_f32(M, &max, S);
|
15579
|
-
|
15580
|
-
ggml_float sum = 0.0;
|
15581
|
-
{
|
15582
|
-
#ifdef GGML_SOFT_MAX_ACCELERATE
|
15583
|
-
max = -max;
|
15584
|
-
vDSP_vsadd(S, 1, &max, S, 1, Mup);
|
15585
|
-
vvexpf(S, S, &Mup);
|
15586
|
-
ggml_vec_sum_f32(Mup, &sum, S);
|
15587
|
-
#else
|
15588
|
-
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
15589
|
-
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
15590
|
-
|
15591
|
-
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
15592
|
-
float * SS = S + i;
|
15593
|
-
|
15594
|
-
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
|
15595
|
-
if (SS[j] == -INFINITY) {
|
15596
|
-
SS[j] = 0.0f;
|
15597
|
-
} else {
|
15598
|
-
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
15599
|
-
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
15600
|
-
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
|
15601
|
-
sump[j] += (ggml_float)val;
|
15602
|
-
SS[j] = val;
|
15603
|
-
}
|
15604
|
-
}
|
15605
|
-
}
|
15606
|
-
|
15607
|
-
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
|
15608
|
-
sum += sump[i];
|
15609
|
-
}
|
15610
|
-
#endif
|
15611
|
-
}
|
15612
|
-
|
15613
|
-
assert(sum > 0.0);
|
15614
|
-
|
15615
|
-
sum = 1.0/sum;
|
15616
|
-
ggml_vec_scale_f32(M, S, sum);
|
15617
|
-
|
15618
|
-
#ifndef NDEBUG
|
15619
|
-
for (int i = 0; i < M; ++i) {
|
15620
|
-
assert(!isnan(S[i]));
|
15621
|
-
assert(!isinf(S[i]));
|
15622
|
-
}
|
15623
|
-
#endif
|
15624
|
-
}
|
15625
|
-
|
15626
|
-
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
|
15602
|
+
const int64_t nr = ggml_nrows(src0);
|
15603
|
+
|
15604
|
+
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
15605
|
+
|
15606
|
+
for (int64_t i = ith; i < nr; i += nth) {
|
15607
|
+
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
15608
|
+
const float * src_data = (float *)((char *) src0->data + i*nb01);
|
15627
15609
|
|
15628
|
-
for (int64_t
|
15629
|
-
|
15610
|
+
for (int64_t j = 0; j < ne0; j++) {
|
15611
|
+
dst_data[j] = j;
|
15630
15612
|
}
|
15631
15613
|
|
15632
|
-
//
|
15633
|
-
|
15634
|
-
for (int64_t
|
15635
|
-
|
15636
|
-
|
15637
|
-
|
15638
|
-
|
15639
|
-
|
15640
|
-
|
15641
|
-
const int iv2 = iq2 % nev2;
|
15642
|
-
const int iv3 = iq3;
|
15643
|
-
|
15644
|
-
ggml_vec_dot_f16(nev0,
|
15645
|
-
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
15646
|
-
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
15647
|
-
S16, 0, 1);
|
15648
|
-
}
|
15649
|
-
} else {
|
15650
|
-
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
15651
|
-
// dst indices
|
15652
|
-
const int i1 = iq1;
|
15653
|
-
const int i2 = iq2;
|
15654
|
-
const int i3 = iq3;
|
15655
|
-
|
15656
|
-
// v indices
|
15657
|
-
const int iv2 = iq2 % nev2;
|
15658
|
-
const int iv3 = iq3;
|
15659
|
-
|
15660
|
-
ggml_vec_dot_f16_unroll(nev0, nbv1,
|
15661
|
-
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
15662
|
-
((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
15663
|
-
S16);
|
15614
|
+
// C doesn't have a functional sort, so we do a bubble sort instead
|
15615
|
+
for (int64_t j = 0; j < ne0; j++) {
|
15616
|
+
for (int64_t k = j + 1; k < ne0; k++) {
|
15617
|
+
if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
15618
|
+
(order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
15619
|
+
int32_t tmp = dst_data[j];
|
15620
|
+
dst_data[j] = dst_data[k];
|
15621
|
+
dst_data[k] = tmp;
|
15622
|
+
}
|
15664
15623
|
}
|
15665
15624
|
}
|
15666
15625
|
}
|
15667
15626
|
}
|
15668
15627
|
|
15669
|
-
static void
|
15670
|
-
|
15671
|
-
|
15672
|
-
struct ggml_tensor * dst) {
|
15628
|
+
static void ggml_compute_forward_argsort(
|
15629
|
+
const struct ggml_compute_params * params,
|
15630
|
+
struct ggml_tensor * dst) {
|
15673
15631
|
|
15674
|
-
const struct ggml_tensor *
|
15632
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
15675
15633
|
|
15676
|
-
switch (
|
15677
|
-
case GGML_TYPE_F16:
|
15678
|
-
{
|
15679
|
-
ggml_compute_forward_flash_attn_f16(params, masked, dst);
|
15680
|
-
} break;
|
15634
|
+
switch (src0->type) {
|
15681
15635
|
case GGML_TYPE_F32:
|
15682
15636
|
{
|
15683
|
-
|
15637
|
+
ggml_compute_forward_argsort_f32(params, dst);
|
15684
15638
|
} break;
|
15685
15639
|
default:
|
15686
15640
|
{
|
@@ -15719,9 +15673,10 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne2 == N);
 
-
-    GGML_ASSERT(
-    GGML_ASSERT(
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
 
     GGML_ASSERT(neq0 == D);
     GGML_ASSERT(nek0 == D);
@@ -15763,8 +15718,22 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    float scale
-
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    const uint32_t n_head = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    enum ggml_type const k_vec_dot_type = type_traits[k->type].vec_dot_type;
+    ggml_from_float_t const q_to_vec_dot = type_traits[k_vec_dot_type].from_float;
+    ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot;
+    ggml_to_float_t const v_to_float = type_traits[v->type].to_float;
 
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,14 +15742,22 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-
-        float
+        const uint32_t h = iq2; // head index
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+        float S = 0.0f;      // sum
+        float M = -INFINITY; // maximum KQ value
 
-        float *
-
-        ggml_fp16_t *
+        float       * VKQ32 = (float *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator
+        float       * V32   = (VKQ32 + 1*D); // (temporary) FP32 V buffer
+        ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator
+        ggml_fp16_t * Q_q   = (ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16
 
-
+        if (v->type == GGML_TYPE_F16) {
+            memset(VKQ16, 0, D*sizeof(ggml_fp16_t));
+        } else {
+            memset(VKQ32, 0, D*sizeof(float));
+        }
 
         const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
 
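The max_bias/slope wiring added above implements the ALiBi per-head slopes: n_head_log2 is the largest power of two not exceeding n_head, and heads beyond it fall back to the m1 series. A small standalone sketch of the same formula (alibi_slope is an illustrative name, not a ggml symbol):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// per-head ALiBi slope, matching the formula used in the kernel above
static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    if (max_bias <= 0.0f) {
        return 1.0f; // ALiBi disabled
    }
    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
}

int main(void) {
    // e.g. 8 heads with max_bias = 8: slopes come out as 1/2, 1/4, ..., 1/256
    for (uint32_t h = 0; h < 8; h++) {
        printf("head %u: slope %.6f\n", h, alibi_slope(h, 8, 8.0f));
    }
    return 0;
}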
@@ -15792,61 +15769,79 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iv3 = iq3 / rv3;
         const int iv2 = iq2 / rv2;
 
+        const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+        q_to_vec_dot(pq, Q_q, D);
+
         // online softmax / attention
         // loop over n_kv and n_head_kv
         // ref: https://arxiv.org/pdf/2112.05682.pdf
         for (int64_t ic = 0; ic < nek1; ++ic) {
-            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
             }
 
-            float s;
+            float s; // KQ value
 
-
-
-            const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+            const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
+            kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1);
 
-
-                Q16[d] = GGML_FP32_TO_FP16(pq[d]);
-            }
-            }
+            s = s*scale + mv; // scale KQ value and apply mask
 
-
-                    &s, 0,
-                    (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                    Q16, 0, 1);
+            const float Mold = M;
 
-
+            float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
+            float vs = 1.0f; // post-softmax KQ value, expf(s - M)
 
-            const
+            const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
 
-
-
+            if (v->type== GGML_TYPE_F16) {
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
 
-
-
-
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f16(D, VKQ16, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
 
-                // V
-
+                // V += v*expf(s - M)
+                ggml_vec_mad_f16(D, VKQ16, (const ggml_fp16_t *) v_data, vs);
             } else {
-
-
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
+
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f32(D, VKQ32, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
 
-
+                v_to_float(v_data, V32, D);
 
-
-
+                // V += v*expf(s - M)
+                ggml_vec_mad_f32(D, VKQ32, V32, vs);
+            }
 
-            S = S*ms + vs;
+            S = S*ms + vs; // scale and increment sum with partial sum
         }
 
-
-
-
+        if (v->type == GGML_TYPE_F16) {
+            for (int64_t d = 0; d < D; ++d) {
+                VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
+            }
         }
 
+        // V /= S
+        const float S_inv = 1.0f/S;
+        ggml_vec_scale_f32(D, VKQ32, S_inv);
+
         // dst indices
         const int i1 = iq1;
         const int i2 = iq2;
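The rewritten kernel accumulates attention with the online-softmax recurrence referenced above: keep a running maximum M and running sum S, rescale the value accumulator whenever M grows, and normalize once at the end. A minimal FP32-only sketch of that recurrence for a single query row (attend_row and the toy sizes are illustrative, not ggml API):

#include <math.h>
#include <stdio.h>

#define D    4   // head size
#define N_KV 3   // number of key/value positions

// one query row attended over N_KV keys/values using the online softmax update:
// out = sum_i softmax(q.k_i * scale)_i * v_i, computed in a single streaming pass
static void attend_row(const float q[D], const float k[N_KV][D], const float v[N_KV][D],
                       float scale, float out[D]) {
    float S = 0.0f;      // running softmax denominator
    float M = -INFINITY; // running maximum of the scaled scores
    for (int d = 0; d < D; d++) out[d] = 0.0f;

    for (int i = 0; i < N_KV; i++) {
        float s = 0.0f;
        for (int d = 0; d < D; d++) s += q[d]*k[i][d];
        s *= scale;

        float ms = 1.0f; // rescale factor for the accumulator when the max grows
        float vs = 1.0f; // expf(s - M) for the current position
        if (s > M) {
            ms = expf(M - s); // expf(Mold - Mnew)
            M  = s;
            for (int d = 0; d < D; d++) out[d] *= ms;
        } else {
            vs = expf(s - M);
        }
        for (int d = 0; d < D; d++) out[d] += vs*v[i][d];
        S = S*ms + vs;
    }
    for (int d = 0; d < D; d++) out[d] /= S; // final normalization, as in "V /= S" above
}

int main(void) {
    const float q[D]       = {1, 0, 0, 0};
    const float k[N_KV][D] = {{1,0,0,0}, {0,1,0,0}, {2,0,0,0}};
    const float v[N_KV][D] = {{1,0,0,0}, {0,1,0,0}, {0,0,1,0}};
    float out[D];
    attend_row(q, k, v, 1.0f, out);
    printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
    return 0;
}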
@@ -15856,7 +15851,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
 
         // permute(0, 2, 1, 3)
-        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1,
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
     }
 }
 
@@ -15867,7 +15862,7 @@ static void ggml_compute_forward_flash_attn_ext(
         const struct ggml_tensor * v,
         const struct ggml_tensor * mask,
         struct ggml_tensor * dst) {
-    switch (dst->op_params[
+    switch (dst->op_params[2]) {
         case GGML_PREC_DEFAULT:
         case GGML_PREC_F32:
             {
@@ -15881,165 +15876,6 @@ static void ggml_compute_forward_flash_attn_ext(
             }
     }
 }
 
-// ggml_compute_forward_flash_ff
-
-static void ggml_compute_forward_flash_ff_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0]; // F16
-    const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
-    const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
-    const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
-    const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, nea, a, ne)
-    GGML_TENSOR_LOCALS(size_t, nba, a, nb)
-    GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne)
-    GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb)
-    GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne)
-    GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb)
-    GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne)
-    GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb)
-    GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne)
-    GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb)
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = nea0;
-    //const int64_t N = nea1;
-    const int64_t M = neb01;
-
-    GGML_ASSERT(ne0 == nea0);
-    GGML_ASSERT(ne1 == nea1);
-    GGML_ASSERT(ne2 == nea2);
-
-    GGML_ASSERT(nba0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb10 == sizeof(float));
-    GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbc10 == sizeof(float));
-
-    GGML_ASSERT(neb00 == D);
-    GGML_ASSERT(neb01 == M);
-    GGML_ASSERT(neb10 == M);
-    GGML_ASSERT(neb11 == 1);
-
-    GGML_ASSERT(nec00 == M);
-    GGML_ASSERT(nec01 == D);
-    GGML_ASSERT(nec10 == D);
-    GGML_ASSERT(nec11 == 1);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by a rows using ggml_vec_dot_f32
-
-    // total rows in a
-    const int nr = nea1*nea2*nea3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // a indices
-        const int ia3 = ir/(nea2*nea1);
-        const int ia2 = (ir - ia3*nea2*nea1)/nea1;
-        const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
-
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
-
-        for (int64_t ic = 0; ic < neb01; ++ic) {
-            // b0 indices
-            const int ib03 = ia3;
-            const int ib02 = ia2;
-            const int ib01 = ic;
-
-            // S indices
-            const int i1 = ib01;
-
-            ggml_vec_dot_f16(nea0,
-                    S + i1, 0,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
-                    (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
-        }
-
-        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
-        //ggml_vec_gelu_f32(neb01, S, S);
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        ggml_vec_gelu_f16(neb01, S16, S16);
-
-        {
-            // dst indices
-            const int i1 = ia1;
-            const int i2 = ia2;
-            const int i3 = ia3;
-
-            for (int64_t ic = 0; ic < nec01; ++ic) {
-
-                ggml_vec_dot_f16(neb01,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
-                        S16, 0, 1);
-            }
-
-            ggml_vec_add_f32(nec01,
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) c1->data);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_ff(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * b0 = dst->src[1];
-
-    switch (b0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_ff_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false); // TODO
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_flash_attn_back
 
 static void ggml_compute_forward_flash_attn_back_f32(
@@ -16221,38 +16057,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                 vvexpf(SM, SM, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SR = S + i;
-                    float * SW = SM + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SR[j] == -INFINITY) {
-                            SW[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SR[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SW[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
 #endif
             }
 
@@ -16834,6 +16639,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_relu(params, dst);
             } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
         case GGML_UNARY_OP_GELU:
             {
                 ggml_compute_forward_gelu(params, dst);
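GGML_UNARY_OP_SIGMOID dispatches to an element-wise kernel; the function it applies per element is the logistic sigmoid. A tiny standalone illustration of that function (not the ggml kernel itself):

#include <math.h>
#include <stdio.h>

// logistic sigmoid: 1 / (1 + e^-x)
static float sigmoidf(float x) {
    return 1.0f / (1.0f + expf(-x));
}

int main(void) {
    const float xs[3] = { -2.0f, 0.0f, 2.0f };
    for (int i = 0; i < 3; i++) {
        printf("sigmoid(%+.1f) = %.6f\n", xs[i], sigmoidf(xs[i]));
    }
    // sigmoid(0) == 0.5; the other two outputs are symmetric around it
    return 0;
}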
@@ -17274,35 +17083,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             assert(!isnan(s1[i]));
         }
 #endif
-        // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
 
-
-
-
-
-
-
-                const float s = s0[i] - max;
-                const float val = expf(s);
-#else
-                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                sum += (ggml_float)val;
-                st[i] = val;
-            }
-        }
+        // soft_max
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
+        assert(sum > 0.0);
+        sum = (1.0 - eps) / sum;
 
-        assert(sum > 0.0);
-        // sum = 1.0/sum;
-        }
         // avoid log(0) by rescaling from [0..1] to [eps..1]
-        sum = (1.0 - eps) / sum;
         ggml_vec_scale_f32(nc, st, sum);
         ggml_vec_add1_f32(nc, st, st, eps);
         ggml_vec_log_f32(nc, st, st);
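Both soft-max loops above now collapse into a single call to ggml_vec_soft_max_f32. Judging from the call sites in this diff, the helper writes expf(x[i] - max) into the output vector and returns the accumulated sum in higher precision; a sketch under that assumed contract (vec_soft_max_f32 and ggml_float_t are local stand-ins, not the ggml symbols):

#include <math.h>
#include <stdio.h>

typedef double ggml_float_t; // stand-in for ggml_float (higher-precision accumulator)

// assumed contract: y[i] = expf(x[i] - max), return value = sum over i of y[i]
static ggml_float_t vec_soft_max_f32(int n, float * y, const float * x, float max) {
    ggml_float_t sum = 0.0;
    for (int i = 0; i < n; i++) {
        const float val = expf(x[i] - max);
        y[i] = val;
        sum += (ggml_float_t) val;
    }
    return sum;
}

int main(void) {
    const float logits[3] = { 1.0f, 2.0f, 3.0f };
    float probs[3];
    // subtracting the max first keeps expf() in a safe range
    ggml_float_t sum = vec_soft_max_f32(3, probs, logits, 3.0f);
    for (int i = 0; i < 3; i++) {
        printf("%.6f\n", probs[i] / (float) sum); // normalized softmax
    }
    return 0;
}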
@@ -17392,32 +17181,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #endif
 
         // soft_max
-
-
-
-
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    ds0[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    ds0[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            sum = (1.0 - eps)/sum;
-        }
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
+        assert(sum > 0.0);
+        sum = (1.0 - eps) / sum;
 
         // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
         ggml_vec_scale_f32(nc, ds0, sum);
@@ -17454,7 +17222,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 
 /////////////////////////////////
 
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
     GGML_ASSERT(params);
 
     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17552,7 +17320,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor);
+                ggml_compute_forward_mul_mat(params, tensor, state);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -17630,10 +17398,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope_back(params, tensor);
             } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
         case GGML_OP_CLAMP:
             {
                 ggml_compute_forward_clamp(params, tensor);
@@ -17682,21 +17446,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_leaky_relu(params, tensor);
             } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                const int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                const bool masked = t != 0;
-                ggml_compute_forward_flash_attn(params, masked, tensor);
-            } break;
         case GGML_OP_FLASH_ATTN_EXT:
            {
                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                ggml_compute_forward_flash_ff(params, tensor);
-            } break;
         case GGML_OP_FLASH_ATTN_BACK:
            {
                int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -18066,6 +17819,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg
 static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
     struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
+    struct ggml_tensor * src2 = tensor->src[2];
 
     switch (tensor->op) {
         case GGML_OP_DUP:
@@ -18597,6 +18351,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_rope_back(ctx,
                                 tensor->grad,
                                 src1,
+                                src2,
                                 n_dims,
                                 mode,
                                 n_ctx,
@@ -18636,6 +18391,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_rope_impl(ctx,
                                 tensor->grad,
                                 src1,
+                                src2,
                                 n_dims,
                                 mode,
                                 n_ctx,
@@ -18652,10 +18408,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             zero_table);
                 }
             } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_CLAMP:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -18704,7 +18456,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_FLASH_ATTN:
         case GGML_OP_FLASH_ATTN_EXT:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -18721,7 +18472,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             masked);
                 }
 
-                struct ggml_tensor * src2 = tensor->src[2];
                 const int64_t elem_q = ggml_nelements(src0);
                 const int64_t elem_k = ggml_nelements(src1);
                 const int64_t elem_v = ggml_nelements(src2);
@@ -18759,10 +18509,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             zero_table);
                 }
             } break;
-        case GGML_OP_FLASH_FF:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
                 GGML_ASSERT(false); // not supported
@@ -18826,6 +18572,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                     zero_table);
                         }
                     } break;
+                case GGML_UNARY_OP_SIGMOID:
+                    {
+                        GGML_ASSERT(false); // TODO: not implemented
+                    } break;
                 case GGML_UNARY_OP_GELU:
                     {
                         GGML_ASSERT(false); // TODO: not implemented
@@ -19172,8 +18922,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19199,8 +18947,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19280,31 +19026,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
-
-    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-    enum ggml_status ec;
-};
-
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
     int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
     int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19355,6 +19076,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
                 case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                     {
@@ -19428,10 +19150,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
        case GGML_OP_CLAMP:
            {
                n_tasks = 1; //TODO
@@ -19477,15 +19195,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
            {
                n_tasks = n_threads;
            } break;
-        case GGML_OP_FLASH_ATTN:
        case GGML_OP_FLASH_ATTN_EXT:
            {
                n_tasks = n_threads;
            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                n_tasks = n_threads;
-            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
                n_tasks = n_threads;
@@ -19580,6 +19293,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
         * node_n = atomic_load(&state->shared->node_n);
         if (* node_n != last_node_n) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning. It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
@@ -19594,6 +19311,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
 
         * task_phase = atomic_load(&state->shared->node_task);
         if (* task_phase != last_task_phase) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning. It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
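The added _mm_pause() is the x86 PAUSE hint: it tells the core that the loop is a spin-wait, which saves power and is friendlier to the sibling hyper-thread. A stripped-down sketch of the same pattern (the flag and function names are illustrative, not ggml API):

#include <stdatomic.h>
#include <stdio.h>
#if defined(__SSE3__)
#include <immintrin.h>   // _mm_pause
#endif

// spin until *flag differs from last_seen, hinting the CPU on each iteration
static int spin_wait_for_change(const atomic_int * flag, int last_seen) {
    int cur;
    while ((cur = atomic_load(flag)) == last_seen) {
#if defined(__SSE3__)
        _mm_pause(); // processor hint: we are busy-waiting, as in the loops above
#endif
    }
    return cur;
}

int main(void) {
    atomic_int flag = 1;
    // another thread would normally store the new value; here it is already set,
    // so the call returns immediately with 1
    printf("%d\n", spin_wait_for_change(&flag, 0));
    return 0;
}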
@@ -19633,7 +19354,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);
         }
@@ -19653,17 +19374,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /* INIT */
                 if (GGML_OP_HAS_INIT[node->op]) {
                     params.type = GGML_TASK_TYPE_INIT;
-                    ggml_compute_forward(&params, node);
+                    ggml_compute_forward(&params, node, state);
                 }
 
                 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                 // they do something more efficient than spinning (?)
                 params.type = GGML_TASK_TYPE_COMPUTE;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
 
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.type = GGML_TASK_TYPE_FINALIZE;
-                    ggml_compute_forward(&params, node);
+                    ggml_compute_forward(&params, node, state);
                 }
 
                 ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19702,7 +19423,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
         }
 
@@ -19723,7 +19444,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
         }
 
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19874,39 +19595,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                     cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
                 } break;
-            case GGML_OP_FLASH_ATTN:
-                {
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    }
-                } break;
             case GGML_OP_FLASH_ATTN_EXT:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // D
 
-                    cur =
-                } break;
-            case GGML_OP_FLASH_FF:
-                {
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    }
+                    cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
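As a concrete instance of the new sizing rule: with head size ne00 = 128 and n_tasks = 8, the FLASH_ATTN_EXT work buffer is 3 * sizeof(float) * 128 * 8 = 12288 bytes, i.e. three float buffers of head size per thread, matching the VKQ accumulator, temporary V buffer, and converted-Q buffer used by the kernel earlier in this diff.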
@@ -19974,6 +19667,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
         /*.abort_callback =*/ NULL,
         /*.abort_callback_data =*/ NULL,
+        /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
@@ -21747,11 +21441,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#if QK_K == 64
-        case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#else
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#endif
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -23028,6 +22718,14 @@ int ggml_cpu_has_avx512_vnni(void) {
 #endif
 }
 
+int ggml_cpu_has_avx512_bf16(void) {
+#if defined(__AVX512BF16__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
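ggml_cpu_has_avx512_bf16() follows the same pattern as the existing ggml_cpu_has_* probes: the answer is fixed at compile time by the presence of the __AVX512BF16__ macro, so it reports what the binary was built with rather than what the host CPU supports. A self-contained sketch of that pattern (local names, not the ggml functions themselves):

#include <stdio.h>

// same shape as the probes in ggml.c: 1 if compiled with the feature, else 0
static int cpu_has_avx512_bf16(void) {
#if defined(__AVX512BF16__)
    return 1;
#else
    return 0;
#endif
}

static int cpu_has_fma(void) {
#if defined(__FMA__)
    return 1;
#else
    return 0;
#endif
}

int main(void) {
    printf("AVX512_BF16 = %d\n", cpu_has_avx512_bf16());
    printf("FMA         = %d\n", cpu_has_fma());
    return 0;
}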