llama_cpp 0.14.7 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -951,7 +951,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
     #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
     #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
     #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
     #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
     #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
     #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
@@ -977,7 +977,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
     #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
     #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
     #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
     #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
     #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
     #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
@@ -1046,7 +1046,7 @@ do { \
 
 // unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
 // so F16C guard isn't required
-#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
 #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
 
 #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
@@ -1144,7 +1144,7 @@ do { \
 
 #if defined(__F16C__)
 // the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #else
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
@@ -1662,6 +1662,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
 #endif
 }
 
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
 // xs and vs are byte strides of x and v
 inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
 
@@ -1746,6 +1777,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
 }
 
+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
 inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
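Taken together, the two new helpers above are F16 counterparts of ggml_vec_scale_f32 and ggml_vec_mad_f32; they exist so the Flash-Attention kernel further down can keep its V accumulator in half precision. A minimal usage sketch follows (the function and the scale factors are invented for illustration; note both helpers are file-static inside ggml.c, not public API):

    // After the two calls: acc[i] == fp16(ms*fp32(acc_old[i]) + vs*fp32(row[i])),
    // i.e. the rescale-then-accumulate step of an online-softmax update.
    static void demo_accumulate(ggml_fp16_t * acc, const ggml_fp16_t * row, const int n) {
        const float ms = 0.5f;  // stand-in for expf(Mold - M)
        const float vs = 0.25f; // stand-in for expf(s - M)

        ggml_vec_scale_f16(n, acc, ms);      // acc *= ms (element-wise, via F32 round-trip)
        ggml_vec_mad_f16  (n, acc, row, vs); // acc += row*vs
    }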
@@ -2000,6 +2060,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "LEAKY_RELU",
 
     "FLASH_ATTN",
+    "FLASH_ATTN_EXT",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "SSM_CONV",
@@ -2026,7 +2087,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2090,6 +2151,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "leaky_relu(x)",
 
     "flash_attn(x)",
+    "flash_attn_ext(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",
@@ -2116,7 +2178,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4559,6 +4621,8 @@ struct ggml_tensor * ggml_mul_mat(
 void ggml_mul_mat_set_prec(
         struct ggml_tensor * a,
         enum ggml_prec       prec) {
+    GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
+
     const int32_t prec_i32 = (int32_t) prec;
 
     ggml_set_op_params_i32(a, 0, prec_i32);
@@ -5397,17 +5461,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
     GGML_ASSERT(ggml_is_contiguous(a));
 
     if (mask) {
+        GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(ggml_is_matrix(mask));
-        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+        GGML_ASSERT(mask->ne[0] == a->ne[0]);
+        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
     if (pos) {
         GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F32);
+        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
         GGML_ASSERT(pos->ne[0] == a->ne[0]);
     }
 
+    if (pos && mask) {
+        GGML_ASSERT(pos->type == mask->type);
+    }
+
     if (max_bias > 0.0f) {
         GGML_ASSERT(pos);
     }
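With the loosened asserts, ggml_soft_max_ext now accepts F16 as well as F32 mask and pos tensors, requires their types to match when both are present, and checks the mask dimensions explicitly (ne[0] equal to the input's, ne[1] at least as large). A hedged sketch of a caller, assuming the ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias) signature of this vendored revision (the helper name is invented):

    #include <math.h>
    #include "ggml.h"

    // Soft-max of attention scores with an F16 KQ mask; no ALiBi, so pos = NULL
    // and max_bias = 0. Per the asserts above, the mask must be contiguous, 2-D,
    // with mask->ne[0] == kq->ne[0] and mask->ne[1] >= kq->ne[1].
    static struct ggml_tensor * softmax_with_f16_mask(struct ggml_context * ctx,
                                                      struct ggml_tensor  * kq,      // F32 scores
                                                      struct ggml_tensor  * kq_mask, // GGML_TYPE_F16
                                                      int                   n_embd_head) {
        return ggml_soft_max_ext(ctx, kq, kq_mask, /*pos =*/ NULL,
                                 1.0f/sqrtf((float) n_embd_head), /*max_bias =*/ 0.0f);
    }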
@@ -6216,6 +6286,59 @@ struct ggml_tensor * ggml_flash_attn(
     return result;
 }
 
+// ggml_flash_attn_ext
+
+struct ggml_tensor * ggml_flash_attn_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * mask,
+        float                 scale) {
+    GGML_ASSERT(ggml_can_mul_mat(k, q));
+    // TODO: check if vT can be multiplied by (k*qT)
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
+                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
+        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
+    }
+
+    bool is_node = false;
+
+    if (q->grad || k->grad || v->grad) {
+        is_node = true;
+    }
+
+    // permute(0, 2, 1, 3)
+    int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op   = GGML_OP_FLASH_ATTN_EXT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = mask;
+
+    return result;
+}
+
+void ggml_flash_attn_ext_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec       prec) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+}
+
 // ggml_flash_ff
 
 struct ggml_tensor * ggml_flash_ff(
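For orientation, here is a hedged sketch of how the new op is wired into a graph (the helper and tensor names are invented; the type/shape contract is the one asserted by the CPU kernel added further down: q in F32, k and v in F16, mask in F16 with ne[1] padded to GGML_KQ_MASK_PAD):

    #include <math.h>
    #include "ggml.h"

    // q:       F32 [D, n_tokens, n_head,    n_batch]
    // k, v:    F16 [D, n_kv,     n_head_kv, n_batch]
    // kq_mask: F16, with ne[1] >= GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)
    static struct ggml_tensor * build_flash_attn_ext(struct ggml_context * ctx,
                                                     struct ggml_tensor  * q,
                                                     struct ggml_tensor  * k,
                                                     struct ggml_tensor  * v,
                                                     struct ggml_tensor  * kq_mask) {
        const float scale = 1.0f/sqrtf((float) q->ne[0]); // 1/sqrt(D)

        struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, scale);

        // op_params[0] holds the scale, op_params[1] the requested precision
        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

        return cur; // F32 [D, n_head, n_tokens, n_batch], i.e. already permute(0, 2, 1, 3)
    }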
@@ -12255,7 +12378,7 @@ static void ggml_compute_forward_soft_max_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
 
     // TODO: is this supposed to be ceil instead of floor?
     //   https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
@@ -12278,19 +12401,31 @@ static void ggml_compute_forward_soft_max_f32(
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
     // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    float * pos = src2 ? (float *) src2->data : src0->data;
+    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
+    float       * pos_f32 = src2 ? (float       *) src2->data : src0->data;
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
 
         // broadcast the mask across rows
-        float * mp = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
 
         ggml_vec_cpy_f32  (nc, wp, sp);
         ggml_vec_scale_f32(nc, wp, scale);
-        if (mp) {
-            ggml_vec_acc_f32(nc, wp, mp);
+        if (mp_f32) {
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += mp_f32[i];
+                }
+            }
         }
 
         // ALiBi bias
@@ -12298,8 +12433,14 @@ static void ggml_compute_forward_soft_max_f32(
             const uint32_t h = (i1/ne01)%ne02; // head
             const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
 
-            for (int i = 0; i < nc; i++) {
-                wp[i] = wp[i] + slope*pos[i];
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*pos_f32[i];
+                }
             }
         }
 
@@ -14569,6 +14710,198 @@ static void ggml_compute_forward_flash_attn(
     }
 }
 
+// ggml_compute_forward_flash_attn_ext
+
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+
+    GGML_ASSERT(nbq0 == sizeof(float));
+    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nev0 == D);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    if (params->type == GGML_TASK_TYPE_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
+    // loop over n_batch and n_head
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        float S = 0.0f;
+        float M = -INFINITY;
+
+        float       * V32 = (float       *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
+        ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32); // reuse memory
+        ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D);
+
+        memset(V16, 0, D*sizeof(ggml_fp16_t));
+
+        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        // online softmax / attention
+        // loop over n_kv and n_head_kv
+        // ref: https://arxiv.org/pdf/2112.05682.pdf
+        for (int64_t ic = 0; ic < nek1; ++ic) {
+            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            if (mv == -INFINITY) {
+                continue;
+            }
+
+            float s;
+
+            // convert Q to F16 in V32
+            {
+                const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+
+                for (int64_t d = 0; d < D; ++d) {
+                    Q16[d] = GGML_FP32_TO_FP16(pq[d]);
+                }
+            }
+
+            ggml_vec_dot_f16(D,
+                    &s, 0,
+                    (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                    Q16, 0, 1);
+
+            s = s*scale + mv;
+
+            const float Mold = M;
+
+            float ms = 1.0f;
+            float vs = 1.0f;
+
+            if (s > M) {
+                M = s;
+                ms = expf(Mold - M);
+
+                // V = V*expf(Mold - M)
+                ggml_vec_scale_f16(D, V16, ms);
+            } else {
+                vs = expf(s - M);
+            }
+
+            const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+
+            // V += v*expf(s - M)
+            ggml_vec_mad_f16(D, V16, v16, vs);
+
+            S = S*ms + vs;
+        }
+
+        // V /= S
+        for (int64_t d = 0; d < D; ++d) {
+            V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
+        }
+
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // original
+        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
+
+        // permute(0, 2, 1, 3)
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
+    }
+}
+
+static void ggml_compute_forward_flash_attn_ext(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    switch (dst->op_params[1]) {
+        case GGML_PREC_DEFAULT:
+        case GGML_PREC_F32:
+            {
+                // uses F32 accumulators
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_ff
 
 static void ggml_compute_forward_flash_ff_f16(
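The scalar kernel above is a direct implementation of the single-pass ("online") softmax from the reference cited in the code (arXiv:2112.05682): it never materializes a full row of scores, only a running maximum M, normalizer S, and weighted accumulator V per query row. One loop iteration, written out in the kernel's own variable names (a reading aid, not part of the diff):

    % masked score for KV column ic:  s = scale * (k_ic \cdot q) + mv
    M' = \max(M, s), \qquad m_s = e^{M - M'}, \qquad v_s = e^{s - M'}
    V \leftarrow m_s\,V + v_s\,v_{ic} \qquad \text{(ggml_vec_scale_f16 + ggml_vec_mad_f16)}
    S \leftarrow m_s\,S + v_s

The if (s > M) branch merely specializes the two cases where one of m_s, v_s is exactly 1; columns whose mask value is -INFINITY are skipped outright, and the single division V/S happens once per row after the loop.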
@@ -16376,6 +16709,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
                 const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, masked, tensor);
             } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
+            } break;
         case GGML_OP_FLASH_FF:
             {
                 ggml_compute_forward_flash_ff(params, tensor);
@@ -17388,6 +17725,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
             {
                 struct ggml_tensor * flash_grad = NULL;
                 if (src0->grad || src1->grad || tensor->src[2]->grad) {
@@ -18160,6 +18498,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
                 n_tasks = n_threads;
             } break;
         case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
             {
                 n_tasks = n_threads;
             } break;
@@ -18563,6 +18902,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                 }
             } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                const int64_t ne00 = node->src[0]->ne[0]; // D
+
+                cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size
+            } break;
         case GGML_OP_FLASH_FF:
             {
                 if (node->src[1]->type == GGML_TYPE_F32) {
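The new workspace case sizes scratch at two head-size float spans per task, which matches the V32/Q16/V16 layout the kernel carves out of params->wdata: Q16 aliases the first half of V32, and V16 starts D floats in. A small arithmetic sketch (the helper name is invented):

    #include <stddef.h>
    #include <stdint.h>

    // Per-thread scratch for GGML_OP_FLASH_ATTN_EXT, with D = head size (ne00):
    //   V32: D floats       -> 4*D bytes (spans the whole region when read as F32)
    //   Q16: D ggml_fp16_t  -> 2*D bytes, aliasing the start of V32
    //   V16: D ggml_fp16_t  -> 2*D bytes, starting at V32 + D (byte offset 4*D)
    // Hence 2*sizeof(float)*D bytes per task; the extra CACHE_LINE_SIZE_F32 stride
    // the kernel uses when indexing wdata is covered by the per-thread cache-line
    // padding that ggml_graph_plan adds to the total work size.
    static size_t fattn_ext_wsize(const int64_t D, const int n_tasks) {
        return 2*sizeof(float)*(size_t) D*(size_t) n_tasks;
    }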
@@ -20614,7 +20959,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -20659,7 +21004,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
 
     bool ok = true;
 
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     // read the header
     {
@@ -20696,9 +21041,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
 
     // read the kv pairs
     {
-        ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+        const uint64_t n_kv = ctx->header.n_kv;
 
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
+        // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
+        ctx->header.n_kv = 0;
+        ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+
+        for (uint64_t i = 0; i < n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -20747,7 +21096,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
                             return NULL;
                         }
 
-                        kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
+                        kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
 
                         ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                     } break;
@@ -20761,7 +21110,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
                             return NULL;
                         }
 
-                        kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
+                        kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
 
                         for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@@ -20777,6 +21126,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
             if (!ok) {
                 break;
             }
+
+            ctx->header.n_kv++;
         }
 
         if (!ok) {
@@ -20789,7 +21140,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
 
     // read the tensor infos
     {
-        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -20810,8 +21161,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
             ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),   &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
 
+            // TODO: return an error instead of crashing with GGML_ASSERT
             gguf_tensor_info_sanitize(info);
 
+            // make sure there is no duplicated tensor names
+            for (uint64_t j = 0; j < i; ++j) {
+                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
+                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
+                    ok = false;
+                }
+            }
+
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                 fclose(file);
@@ -20980,7 +21340,7 @@ void gguf_free(struct gguf_context * ctx) {
         GGML_FREE(ctx->infos);
     }
 
-    GGML_ALIGNED_FREE(ctx);
+    GGML_FREE(ctx);
 }
 
 const char * gguf_type_name(enum gguf_type type) {
@@ -21291,7 +21651,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = type;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
     memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }
 
@@ -21301,7 +21661,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
     for (int i = 0; i < n; i++) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n    = strlen(data[i]);
@@ -21328,7 +21688,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
@@ -21348,6 +21708,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
 void gguf_add_tensor(
              struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
+    if (gguf_find_tensor(ctx, tensor->name) != -1) {
+        GGML_ASSERT(false && "duplicated tensor name");
+    }
+
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -21416,7 +21780,7 @@ struct gguf_buf {
 
 static struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
-        /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
+        /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
         /*buf.size =*/ size,
         /*buf.offset =*/ 0,
     };