llama_cpp 0.14.6 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +90 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -3
- data/vendor/tmp/llama.cpp/Makefile +52 -22
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +394 -44
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +996 -455
- data/vendor/tmp/llama.cpp/llama.h +46 -15
- data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -963,7 +951,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
 #define GGML_F16_VEC_FMA GGML_F16x8_FMA
 #define GGML_F16_VEC_ADD GGML_F16x8_ADD
 #define GGML_F16_VEC_MUL GGML_F16x8_MUL
@@ -989,7 +977,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
 #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
 #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
 #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
 #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
 #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
@@ -1058,7 +1046,7 @@ do { \
 
 // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
 // so F16C guard isn't required
-#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
 #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
 
 #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
@@ -1156,7 +1144,7 @@ do { \
 
 #if defined(__F16C__)
 // the _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #else
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
@@ -1674,6 +1662,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 #endif
 }
 
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
 // xs and vs are byte strides of x and v
 inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
 
@@ -1758,6 +1777,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
 }
 
+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
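In scalar terms (exactly what the leftover and scalar branches above spell out), the two new helpers are the F16 counterparts of the existing F32 routines:

$$\texttt{ggml\_vec\_mad\_f16}:\; y_i \leftarrow y_i + v\,x_i, \qquad \texttt{ggml\_vec\_scale\_f16}:\; y_i \leftarrow v\,y_i, \qquad i = 0,\dots,n-1,$$

with each FP16 element widened to FP32 for the arithmetic and converted back for storage; the GGML_SIMD branches do the same work in GGML_F16_STEP-wide chunks. Both helpers are consumed by the flash-attention kernel added further down in this diff.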
@@ -2012,6 +2060,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "LEAKY_RELU",
 
     "FLASH_ATTN",
+    "FLASH_ATTN_EXT",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "SSM_CONV",
@@ -2038,7 +2087,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2102,6 +2151,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "leaky_relu(x)",
 
     "flash_attn(x)",
+    "flash_attn_ext(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",
@@ -2128,7 +2178,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4571,6 +4621,8 @@ struct ggml_tensor * ggml_mul_mat(
 void ggml_mul_mat_set_prec(
         struct ggml_tensor * a,
         enum ggml_prec prec) {
+    GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
+
     const int32_t prec_i32 = (int32_t) prec;
 
     ggml_set_op_params_i32(a, 0, prec_i32);
@@ -5409,17 +5461,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
     GGML_ASSERT(ggml_is_contiguous(a));
 
     if (mask) {
+        GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(ggml_is_matrix(mask));
-        GGML_ASSERT(
+        GGML_ASSERT(mask->ne[0] == a->ne[0]);
+        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
     if (pos) {
         GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F32);
+        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
         GGML_ASSERT(pos->ne[0] == a->ne[0]);
     }
 
+    if (pos && mask) {
+        GGML_ASSERT(pos->type == mask->type);
+    }
+
     if (max_bias > 0.0f) {
         GGML_ASSERT(pos);
     }
@@ -6228,6 +6286,59 @@ struct ggml_tensor * ggml_flash_attn(
     return result;
 }
 
+// ggml_flash_attn_ext
+
+struct ggml_tensor * ggml_flash_attn_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor * q,
+        struct ggml_tensor * k,
+        struct ggml_tensor * v,
+        struct ggml_tensor * mask,
+        float scale) {
+    GGML_ASSERT(ggml_can_mul_mat(k, q));
+    // TODO: check if vT can be multiplied by (k*qT)
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
+                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
+        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
+    }
+
+    bool is_node = false;
+
+    if (q->grad || k->grad || v->grad) {
+        is_node = true;
+    }
+
+    // permute(0, 2, 1, 3)
+    int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op   = GGML_OP_FLASH_ATTN_EXT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = mask;
+
+    return result;
+}
+
+void ggml_flash_attn_ext_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec prec) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+}
+
 // ggml_flash_ff
 
 struct ggml_tensor * ggml_flash_ff(
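For orientation only, here is a hedged sketch of how a caller could build the new op in this vendored revision. The type and shape constraints come from the asserts in ggml_flash_attn_ext above and in the CPU kernel later in this diff (Q in F32, K and V in F16, head size in dim 0, mask rows padded to GGML_PAD(n_q, GGML_KQ_MASK_PAD), with GGML_KQ_MASK_PAD coming from the updated ggml.h in this release); the concrete sizes, the 16 MiB context and the variable names are illustrative, not taken from the gem.

#include "ggml.h"
#include <math.h>

int main(void) {
    // illustrative sizes: head size D, n_q query tokens, n_kv cached tokens, n_head heads
    const int64_t D = 64, n_q = 7, n_kv = 128, n_head = 8;

    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // Q is F32; K and V are F16, as the CPU kernel asserts (nbk0/nbv0 == sizeof(ggml_fp16_t))
    struct ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, n_q,  n_head, 1);
    struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, D, n_kv, n_head, 1);
    struct ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, D, n_kv, n_head, 1);

    // the mask needs at least GGML_PAD(n_q, GGML_KQ_MASK_PAD) rows of n_kv entries
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_kv, GGML_PAD(n_q, GGML_KQ_MASK_PAD));

    struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf((float) D));
    ggml_flash_attn_ext_set_prec(out, GGML_PREC_F32); // optional; stored in op_params[1]

    // graph building and evaluation omitted to keep the sketch short
    ggml_free(ctx);
    return 0;
}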
@@ -10825,7 +10936,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if GGML_USE_LLAMAFILE
-    if (
+    if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -10878,15 +10989,13 @@ UseGgmlGemm1:;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
 #if GGML_USE_LLAMAFILE
-    if (
+    if (src1->type != vec_dot_type) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
-                                     (const char *)wdata +
-                                     nb12/ggml_type_size(src1->type)*i12 +
-                                     nb13/ggml_type_size(src1->type)*i13),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
@@ -12269,7 +12378,7 @@ static void ggml_compute_forward_soft_max_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
 
     // TODO: is this supposed to be ceil instead of floor?
     // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
@@ -12292,19 +12401,31 @@ static void ggml_compute_forward_soft_max_f32(
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
     // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-
+    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
+    float * pos_f32 = src2 ? (float *) src2->data : src0->data;
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
 
         // broadcast the mask across rows
-
+        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
 
         ggml_vec_cpy_f32 (nc, wp, sp);
         ggml_vec_scale_f32(nc, wp, scale);
-        if (
-
+        if (mp_f32) {
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += mp_f32[i];
+                }
+            }
         }
 
         // ALiBi bias
@@ -12312,8 +12433,14 @@ static void ggml_compute_forward_soft_max_f32(
             const uint32_t h = (i1/ne01)%ne02; // head
             const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
 
-
-
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*pos_f32[i];
+                }
             }
         }
 
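Put as a formula (this restates the loops above, nothing more): with an optional mask m and positions p, both now either F16 or F32, each row of the input is transformed as

$$ w_i = \mathrm{scale}\cdot s_i + m_i + \mathrm{slope}\cdot p_i, \qquad \mathrm{slope} = \begin{cases} m_0^{\,h+1}, & h < n_{\mathrm{head\_log2}} \\ m_1^{\,2(h-n_{\mathrm{head\_log2}})+1}, & \text{otherwise,} \end{cases} $$

before the usual softmax over w; F16 mask and position values are widened with GGML_FP16_TO_FP32 on the fly, and use_f16 selects the conversion path once up front.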
@@ -14583,6 +14710,198 @@ static void ggml_compute_forward_flash_attn(
     }
 }
 
+// ggml_compute_forward_flash_attn_ext
+
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
+    GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
+    GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
+    GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+
+    GGML_ASSERT(nbq0 == sizeof(float));
+    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nev0 == D);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    if (params->type == GGML_TASK_TYPE_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
+    // loop over n_batch and n_head
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        float S = 0.0f;
+        float M = -INFINITY;
+
+        float * V32 = (float *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
+        ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32); // reuse memory
+        ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D);
+
+        memset(V16, 0, D*sizeof(ggml_fp16_t));
+
+        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        // online softmax / attention
+        // loop over n_kv and n_head_kv
+        // ref: https://arxiv.org/pdf/2112.05682.pdf
+        for (int64_t ic = 0; ic < nek1; ++ic) {
+            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            if (mv == -INFINITY) {
+                continue;
+            }
+
+            float s;
+
+            // convert Q to F16 in V32
+            {
+                const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+
+                for (int64_t d = 0; d < D; ++d) {
+                    Q16[d] = GGML_FP32_TO_FP16(pq[d]);
+                }
+            }
+
+            ggml_vec_dot_f16(D,
+                    &s, 0,
+                    (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                    Q16, 0, 1);
+
+            s = s*scale + mv;
+
+            const float Mold = M;
+
+            float ms = 1.0f;
+            float vs = 1.0f;
+
+            if (s > M) {
+                M = s;
+                ms = expf(Mold - M);
+
+                // V = V*expf(Mold - M)
+                ggml_vec_scale_f16(D, V16, ms);
+            } else {
+                vs = expf(s - M);
+            }
+
+            const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+
+            // V += v*expf(s - M)
+            ggml_vec_mad_f16(D, V16, v16, vs);
+
+            S = S*ms + vs;
+        }
+
+        // V /= S
+        for (int64_t d = 0; d < D; ++d) {
+            V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
+        }
+
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // original
+        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
+
+        // permute(0, 2, 1, 3)
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
+    }
+}
+
+static void ggml_compute_forward_flash_attn_ext(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    switch (dst->op_params[1]) {
+        case GGML_PREC_DEFAULT:
+        case GGML_PREC_F32:
+            {
+                // uses F32 accumulators
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_ff
 
 static void ggml_compute_forward_flash_ff_f16(
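The kernel above implements the streaming (online) softmax from the paper referenced in the code (arXiv:2112.05682): it keeps a running maximum M, a running denominator S and an unnormalized accumulator V so that no n_kv-sized score buffer is needed. Written out, for each unmasked column c with score $s_c = \mathrm{scale}\cdot(k_c\cdot q) + m_c$:

$$ M' = \max(M, s_c), \qquad V \leftarrow V\,e^{\,M-M'} + v_c\,e^{\,s_c-M'}, \qquad S \leftarrow S\,e^{\,M-M'} + e^{\,s_c-M'}, \qquad M \leftarrow M', $$

and after the last column the output row is V/S. In the code, ms and vs are exactly these two exponential factors (one of them is 1 on every iteration); ggml_vec_scale_f16 applies the rescale to V and ggml_vec_mad_f16 accumulates the new value vector.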
@@ -16390,6 +16709,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, masked, tensor);
             } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
+            } break;
         case GGML_OP_FLASH_FF:
             {
                 ggml_compute_forward_flash_ff(params, tensor);
@@ -17402,6 +17725,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
             {
                 struct ggml_tensor * flash_grad = NULL;
                 if (src0->grad || src1->grad || tensor->src[2]->grad) {
|
|
18174
18498
|
n_tasks = n_threads;
|
18175
18499
|
} break;
|
18176
18500
|
case GGML_OP_FLASH_ATTN:
|
18501
|
+
case GGML_OP_FLASH_ATTN_EXT:
|
18177
18502
|
{
|
18178
18503
|
n_tasks = n_threads;
|
18179
18504
|
} break;
|
@@ -18577,6 +18902,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                         cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                     }
                 } break;
+            case GGML_OP_FLASH_ATTN_EXT:
+                {
+                    const int64_t ne00 = node->src[0]->ne[0]; // D
+
+                    cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size
+                } break;
             case GGML_OP_FLASH_FF:
                 {
                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -20628,7 +20959,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx =
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -20673,7 +21004,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     bool ok = true;
 
-    struct gguf_context * ctx =
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     // read the header
     {
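The gguf allocation sites in this hunk and the following ones switch to a GGML_CALLOC macro whose definition is not part of this file's hunks (it lives in the other vendored ggml sources updated in this release). As a rough mental model only, a zero-initializing, failure-checked wrapper along these lines is what the call sites assume; the name example_calloc and the exact error handling below are illustrative, not the package's code.

#include <stdio.h>
#include <stdlib.h>

// Illustrative stand-in for a checked, zero-initializing allocator.
// The real GGML_CALLOC is defined elsewhere in the vendored ggml sources.
static void * example_calloc(size_t num, size_t size) {
    void * result = calloc(num, size);   // zero-initialized, so gguf structs start in a known state
    if (result == NULL) {
        fprintf(stderr, "example_calloc: allocation of %zu x %zu bytes failed\n", num, size);
        abort();
    }
    return result;
}

int main(void) {
    int * kv = example_calloc(4, sizeof(int));
    free(kv);
    return 0;
}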
@@ -20710,9 +21041,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the kv pairs
     {
-
+        const uint64_t n_kv = ctx->header.n_kv;
 
-
+        // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
+        ctx->header.n_kv = 0;
+        ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+
+        for (uint64_t i = 0; i < n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];
 
            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -20761,7 +21096,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             return NULL;
                         }
 
-                        kv->value.arr.data =
+                        kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
 
                        ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                    } break;
@@ -20775,7 +21110,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             return NULL;
                         }
 
-                        kv->value.arr.data =
+                        kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
 
                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                            ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@@ -20791,6 +21126,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            if (!ok) {
                break;
            }
+
+            ctx->header.n_kv++;
        }
 
        if (!ok) {
@@ -20803,7 +21140,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos =
+        ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
 
        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];
@@ -20824,8 +21161,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
 
+            // TODO: return an error instead of crashing with GGML_ASSERT
            gguf_tensor_info_sanitize(info);
 
+            // make sure there is no duplicated tensor names
+            for (uint64_t j = 0; j < i; ++j) {
+                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
+                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
+                    ok = false;
+                }
+            }
+
            if (!ok) {
                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                fclose(file);
@@ -20994,7 +21340,7 @@ void gguf_free(struct gguf_context * ctx) {
        GGML_FREE(ctx->infos);
    }
 
-
+    GGML_FREE(ctx);
 }
 
 const char * gguf_type_name(enum gguf_type type) {
@@ -21305,7 +21651,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = type;
     ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data =
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
     memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }
 
@@ -21315,7 +21661,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data =
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
     for (int i = 0; i < n; i++) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n = strlen(data[i]);
@@ -21342,7 +21688,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
            case GGUF_TYPE_ARRAY:
                {
                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data =
+                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                        }
@@ -21362,6 +21708,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
 void gguf_add_tensor(
         struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
+    if (gguf_find_tensor(ctx, tensor->name) != -1) {
+        GGML_ASSERT(false && "duplicated tensor name");
+    }
+
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
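To make the effect of the new guard concrete, here is a hedged sketch (the tensor name, sizes and the 16 MiB context are mine, not the gem's): registering the same tensor name twice in a gguf_context now trips the assert instead of silently producing a GGUF file with duplicate entries.

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_name(t, "weights");

    struct gguf_context * gctx = gguf_init_empty();
    gguf_add_tensor(gctx, t);
    gguf_add_tensor(gctx, t); // aborts here: gguf_find_tensor() finds "weights" and the new assert fires

    gguf_free(gctx);
    ggml_free(ctx);
    return 0;
}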
@@ -21430,7 +21780,7 @@ struct gguf_buf {
 
 static struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
-        /*buf.data =*/ size == 0 ? NULL :
+        /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
         /*buf.size =*/ size,
         /*buf.offset =*/ 0,
     };