llama_cpp 0.14.6 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -963,7 +951,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
 #define GGML_F16_VEC_FMA GGML_F16x8_FMA
 #define GGML_F16_VEC_ADD GGML_F16x8_ADD
 #define GGML_F16_VEC_MUL GGML_F16x8_MUL
@@ -989,7 +977,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
 #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
 #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
 #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
 #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
 #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
@@ -1058,7 +1046,7 @@ do { \
 
 // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
 // so F16C guard isn't required
-#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
 #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
 
 #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
@@ -1156,7 +1144,7 @@ do { \
 
 #if defined(__F16C__)
 // the _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #else
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
@@ -1674,6 +1662,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 #endif
 }
 
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
 // xs and vs are byte strides of x and v
 inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
 
@@ -1758,6 +1777,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
 }
 
+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
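The two helpers added above are fp16 counterparts of ggml_vec_mad_f32 and ggml_vec_scale_f32; the flash-attention kernel introduced later in this diff uses them to rescale and accumulate its fp16 value accumulator. A minimal scalar illustration of their semantics (internal to ggml.c, since the helpers and the GGML_FP32_TO_FP16/GGML_FP16_TO_FP32 conversion macros are not public API; the array contents are made up):

    // ggml_vec_scale_f16(n, y, v):    y[i] <- y[i]*v          (fp16 storage, float arithmetic)
    // ggml_vec_mad_f16  (n, y, x, v): y[i] <- y[i] + x[i]*v
    ggml_fp16_t acc[4], val[4];
    for (int i = 0; i < 4; ++i) {
        acc[i] = GGML_FP32_TO_FP16(1.0f);
        val[i] = GGML_FP32_TO_FP16((float) i);
    }
    ggml_vec_scale_f16(4, acc, 0.5f);     // acc = {0.5, 0.5, 0.5, 0.5}
    ggml_vec_mad_f16 (4, acc, val, 2.0f); // acc = {0.5, 2.5, 4.5, 6.5}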
@@ -2012,6 +2060,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "LEAKY_RELU",
 
     "FLASH_ATTN",
+    "FLASH_ATTN_EXT",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "SSM_CONV",
@@ -2038,7 +2087,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2102,6 +2151,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "leaky_relu(x)",
 
     "flash_attn(x)",
+    "flash_attn_ext(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "ssm_conv(x)",
@@ -2128,7 +2178,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4571,6 +4621,8 @@ struct ggml_tensor * ggml_mul_mat(
 void ggml_mul_mat_set_prec(
         struct ggml_tensor * a,
         enum ggml_prec prec) {
+    GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
+
     const int32_t prec_i32 = (int32_t) prec;
 
     ggml_set_op_params_i32(a, 0, prec_i32);
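The new assertion makes the precision setter fail loudly if it is applied to anything other than a mul_mat node. A caller-side sketch (ctx, w and x are illustrative placeholders, not taken from this diff):

    struct ggml_tensor * cur = ggml_mul_mat(ctx, w, x);
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);  // OK: cur->op == GGML_OP_MUL_MAT
    //ggml_mul_mat_set_prec(x, GGML_PREC_F32);  // would now trip the added GGML_ASSERT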
@@ -5409,17 +5461,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
     GGML_ASSERT(ggml_is_contiguous(a));
 
     if (mask) {
+        GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(ggml_is_matrix(mask));
-        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+        GGML_ASSERT(mask->ne[0] == a->ne[0]);
+        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
     if (pos) {
         GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F32);
+        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
         GGML_ASSERT(pos->ne[0] == a->ne[0]);
     }
 
+    if (pos && mask) {
+        GGML_ASSERT(pos->type == mask->type);
+    }
+
     if (max_bias > 0.0f) {
         GGML_ASSERT(pos);
     }
@@ -6228,6 +6286,59 @@ struct ggml_tensor * ggml_flash_attn(
     return result;
 }
 
+// ggml_flash_attn_ext
+
+struct ggml_tensor * ggml_flash_attn_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor * q,
+        struct ggml_tensor * k,
+        struct ggml_tensor * v,
+        struct ggml_tensor * mask,
+        float scale) {
+    GGML_ASSERT(ggml_can_mul_mat(k, q));
+    // TODO: check if vT can be multiplied by (k*qT)
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
+                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
+        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
+    }
+
+    bool is_node = false;
+
+    if (q->grad || k->grad || v->grad) {
+        is_node = true;
+    }
+
+    // permute(0, 2, 1, 3)
+    int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op   = GGML_OP_FLASH_ATTN_EXT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = mask;
+
+    return result;
+}
+
+void ggml_flash_attn_ext_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec prec) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+}
+
 // ggml_flash_ff
 
 struct ggml_tensor * ggml_flash_ff(
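For orientation, a caller-side sketch of the new op (the shapes, variable names and explicit tensor creation below are illustrative assumptions; the CPU kernel added later in this diff additionally expects k, v and mask in F16 and q in F32):

    // head size D, n_q query tokens, n_kv cached tokens, n_head heads (all hypothetical)
    struct ggml_tensor * q    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, n_q,  n_head, 1);
    struct ggml_tensor * k    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, D, n_kv, n_head, 1);
    struct ggml_tensor * v    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, D, n_kv, n_head, 1);
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_kv, GGML_PAD(n_q, GGML_KQ_MASK_PAD));

    struct ggml_tensor * kqv = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf((float) D));
    ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32); // optional: request F32 accumulators

    // the result comes back as { D, n_head, n_q, 1 } per the permute(0, 2, 1, 3) above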
@@ -10825,7 +10936,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if GGML_USE_LLAMAFILE
-    if (nb10 == ggml_type_size(src1->type)) {
+    if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -10878,15 +10989,13 @@ UseGgmlGemm1:;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
 #if GGML_USE_LLAMAFILE
-    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+    if (src1->type != vec_dot_type) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
-                                     (const char *)wdata + ggml_row_size(vec_dot_type,
-                                         nb12/ggml_type_size(src1->type)*i12 +
-                                         nb13/ggml_type_size(src1->type)*i13),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
@@ -12269,7 +12378,7 @@ static void ggml_compute_forward_soft_max_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
 
     // TODO: is this supposed to be ceil instead of floor?
     //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
@@ -12292,19 +12401,31 @@ static void ggml_compute_forward_soft_max_f32(
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
     // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    float * pos = src2 ? (float *) src2->data : src0->data;
+    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
+    float * pos_f32 = src2 ? (float *) src2->data : src0->data;
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
 
         // broadcast the mask across rows
-        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
 
         ggml_vec_cpy_f32 (nc, wp, sp);
         ggml_vec_scale_f32(nc, wp, scale);
-        if (mp) {
-            ggml_vec_acc_f32(nc, wp, mp);
+        if (mp_f32) {
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += mp_f32[i];
+                }
+            }
         }
 
         // ALiBi bias
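The next hunk only changes how the ALiBi positions are read (pos may now be stored in F16); the per-head slope it applies is unchanged. For reference, the slope schedule visible in that hunk, factored out as a helper (m0, m1 and n_head_log2 are computed earlier in the function and are treated as given here; requires <math.h> for powf):

    static float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1) {
        // same expression as in the loop below: geometric decay per head index h,
        // switching to a second, slower-decaying series once h reaches n_head_log2
        return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
    }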
@@ -12312,8 +12433,14 @@ static void ggml_compute_forward_soft_max_f32(
             const uint32_t h = (i1/ne01)%ne02; // head
             const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
 
-            for (int i = 0; i < nc; i++) {
-                wp[i] = wp[i] + slope*pos[i];
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*pos_f32[i];
+                }
             }
         }
 
@@ -14583,6 +14710,198 @@ static void ggml_compute_forward_flash_attn(
     }
 }
 
+// ggml_compute_forward_flash_attn_ext
+
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
+    GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
+    GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
+    GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+
+    GGML_ASSERT(nbq0 == sizeof(float));
+    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nev0 == D);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    if (params->type == GGML_TASK_TYPE_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
+    // loop over n_batch and n_head
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        float S = 0.0f;
+        float M = -INFINITY;
+
+        float * V32 = (float *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
+        ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32); // reuse memory
+        ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D);
+
+        memset(V16, 0, D*sizeof(ggml_fp16_t));
+
+        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        // online softmax / attention
+        // loop over n_kv and n_head_kv
+        // ref: https://arxiv.org/pdf/2112.05682.pdf
+        for (int64_t ic = 0; ic < nek1; ++ic) {
+            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            if (mv == -INFINITY) {
+                continue;
+            }
+
+            float s;
+
+            // convert Q to F16 in V32
+            {
+                const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+
+                for (int64_t d = 0; d < D; ++d) {
+                    Q16[d] = GGML_FP32_TO_FP16(pq[d]);
+                }
+            }
+
+            ggml_vec_dot_f16(D,
+                    &s, 0,
+                    (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                    Q16, 0, 1);
+
+            s = s*scale + mv;
+
+            const float Mold = M;
+
+            float ms = 1.0f;
+            float vs = 1.0f;
+
+            if (s > M) {
+                M = s;
+                ms = expf(Mold - M);
+
+                // V = V*expf(Mold - M)
+                ggml_vec_scale_f16(D, V16, ms);
+            } else {
+                vs = expf(s - M);
+            }
+
+            const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+
+            // V += v*expf(s - M)
+            ggml_vec_mad_f16(D, V16, v16, vs);
+
+            S = S*ms + vs;
+        }
+
+        // V /= S
+        for (int64_t d = 0; d < D; ++d) {
+            V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
+        }
+
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // original
+        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
+
+        // permute(0, 2, 1, 3)
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
+    }
+}
+
+static void ggml_compute_forward_flash_attn_ext(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    switch (dst->op_params[1]) {
+        case GGML_PREC_DEFAULT:
+        case GGML_PREC_F32:
+            {
+                // uses F32 accumulators
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_ff
 
 static void ggml_compute_forward_flash_ff_f16(
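The kernel above is a single-pass ("online") softmax over the KV cache, following the reference linked in the code: it keeps a running maximum M, a running denominator S and an unnormalized value accumulator, and rescales the accumulator whenever a larger logit shows up. A plain-float sketch of the same recurrence for one query row (simplified from the fp16 kernel; the flat row-major k/v layout and the helper name are assumptions made for illustration):

    #include <math.h>
    #include <string.h>

    // one query row of head size d against n_kv key/value rows, numerically stable in one pass
    static void attn_row_online_softmax(const float * q, const float * k, const float * v,
                                        const float * mask, int n_kv, int d,
                                        float scale, float * out) {
        float M = -INFINITY;             // running max of the scaled logits
        float S = 0.0f;                  // running sum of exp(logit - M)
        memset(out, 0, d*sizeof(float)); // unnormalized value accumulator

        for (int ic = 0; ic < n_kv; ++ic) {
            const float mv = mask ? mask[ic] : 0.0f;
            if (mv == -INFINITY) {
                continue;                // fully masked position, same shortcut as the kernel
            }

            float s = 0.0f;
            for (int i = 0; i < d; ++i) {
                s += q[i]*k[ic*d + i];   // dot(q, k_ic)
            }
            s = s*scale + mv;

            const float Mold = M;
            float ms = 1.0f;             // rescale factor for what was accumulated so far
            float vs = 1.0f;             // weight of the new value row

            if (s > M) {
                M  = s;
                ms = expf(Mold - M);
            } else {
                vs = expf(s - M);
            }

            for (int i = 0; i < d; ++i) {
                out[i] = out[i]*ms + v[ic*d + i]*vs;
            }
            S = S*ms + vs;
        }

        for (int i = 0; i < d; ++i) {
            out[i] /= S;                 // final normalization
        }
    }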
@@ -16390,6 +16709,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, masked, tensor);
             } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
+            } break;
         case GGML_OP_FLASH_FF:
             {
                 ggml_compute_forward_flash_ff(params, tensor);
@@ -17402,6 +17725,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
             {
                 struct ggml_tensor * flash_grad = NULL;
                 if (src0->grad || src1->grad || tensor->src[2]->grad) {
@@ -18174,6 +18498,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
                 n_tasks = n_threads;
             } break;
         case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
             {
                 n_tasks = n_threads;
             } break;
@@ -18577,6 +18902,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                         cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                     }
                 } break;
+            case GGML_OP_FLASH_ATTN_EXT:
+                {
+                    const int64_t ne00 = node->src[0]->ne[0]; // D
+
+                    cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size
+                } break;
             case GGML_OP_FLASH_FF:
                 {
                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -20628,7 +20959,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -20673,7 +21004,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     bool ok = true;
 
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     // read the header
     {
@@ -20710,9 +21041,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the kv pairs
     {
-        ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+        const uint64_t n_kv = ctx->header.n_kv;
 
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
+        // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
+        ctx->header.n_kv = 0;
+        ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+
+        for (uint64_t i = 0; i < n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -20761,7 +21096,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                 return NULL;
                             }
 
-                            kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
+                            kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
 
                             ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                         } break;
@@ -20775,7 +21110,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                 return NULL;
                             }
 
-                            kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
+                            kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
 
                             for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                 ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@@ -20791,6 +21126,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             if (!ok) {
                 break;
             }
+
+            ctx->header.n_kv++;
         }
 
         if (!ok) {
@@ -20803,7 +21140,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -20824,8 +21161,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
 
+            // TODO: return an error instead of crashing with GGML_ASSERT
             gguf_tensor_info_sanitize(info);
 
+            // make sure there is no duplicated tensor names
+            for (uint64_t j = 0; j < i; ++j) {
+                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
+                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
+                    ok = false;
+                }
+            }
+
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                 fclose(file);
@@ -20994,7 +21340,7 @@ void gguf_free(struct gguf_context * ctx) {
         GGML_FREE(ctx->infos);
     }
 
-    GGML_ALIGNED_FREE(ctx);
+    GGML_FREE(ctx);
 }
 
 const char * gguf_type_name(enum gguf_type type) {
@@ -21305,7 +21651,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = type;
    ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
     memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }
 
@@ -21315,7 +21661,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
     for (int i = 0; i < n; i++) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n = strlen(data[i]);
@@ -21342,7 +21688,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
            case GGUF_TYPE_ARRAY:
                {
                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                        }
@@ -21362,6 +21708,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
 void gguf_add_tensor(
         struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
+    if (gguf_find_tensor(ctx, tensor->name) != -1) {
+        GGML_ASSERT(false && "duplicated tensor name");
+    }
+
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -21430,7 +21780,7 @@ struct gguf_buf {
 
 static struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
-        /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
+        /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
        /*buf.size =*/ size,
        /*buf.offset =*/ 0,
    };
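Finally, a small writer-side sketch of the duplicate-name guard added to gguf_add_tensor a few hunks above (the ggml context ctx and the tensor name are illustrative placeholders, not taken from this diff):

    struct gguf_context * gctx = gguf_init_empty();

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_name(t, "example.weight");

    gguf_add_tensor(gctx, t); // ok, first occurrence of the name
    gguf_add_tensor(gctx, t); // now aborts via GGML_ASSERT(false && "duplicated tensor name")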