llama_cpp 0.9.3 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        struct ggml_tensor * mask,
+        float scale,
         bool inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;
 
     return result;
 }
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * mask,
+        float scale) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, false);
 }
 
 // ggml_soft_max_back
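The new ggml_soft_max_ext entry point computes soft_max(a*scale + mask) as a single graph node. A minimal usage sketch, assuming an attention graph with score tensor kq, optional additive mask kq_mask, and scale kq_scale (these names are illustrative, not from this diff):

    // One fused node instead of separate scale, add-mask, and soft_max nodes.
    // kq_mask may be NULL; if present it must be contiguous and
    // row-broadcastable onto kq (see the asserts in ggml_soft_max_impl above).
    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);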
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-        src0->type == GGML_TYPE_F32 &&
+        //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
@@ -9611,10 +9633,12 @@ static void ggml_compute_forward_out_prod_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+    GGML_ASSERT(ne2 == ne02);
     GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2 == ne12);
     GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(ne03 == ne13);
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9625,18 +9649,25 @@ static void ggml_compute_forward_out_prod_f32(
     // GGML_ASSERT(nb1 <= nb2);
     // GGML_ASSERT(nb2 <= nb3);
 
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+    // TODO: #if defined(GGML_USE_CLBLAST)
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    bool use_blas = ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+#endif
 
     if (params->type == GGML_TASK_INIT) {
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
+        if (use_blas) {
+            return;
+        }
+#endif
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -9645,6 +9676,50 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    if (use_blas) {
+        if (params->ith != 0) { // All threads other than the first do no work.
+            return;
+        }
+        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+        // src0: (k,n)
+        // src1: (k,m)
+        // dst:  (m,n)
+        //
+        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+        // Also expressed as (major,minor)
+        // a: (m,k): so src1 transposed
+        // b: (k,n): so src0
+        // c: (m,n)
+        //
+        // However, if ggml_is_transposed(src1) is true, then
+        // src1->data already contains a transposed version, so sgemm mustn't
+        // transpose it further.
+
+        int n = src0->ne[0];
+        int k = src0->ne[1];
+        int m = src1->ne[0];
+
+        int transposeA, lda;
+
+        if (!ggml_is_transposed(src1)) {
+            transposeA = CblasTrans;
+            lda = m;
+        } else {
+            transposeA = CblasNoTrans;
+            lda = k;
+        }
+
+        float * a = (float *) ((char *) src1->data);
+        float * b = (float *) ((char *) src0->data);
+        float * c = (float *) ((char *) dst->data);
+
+        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+
+        return;
+    }
+#endif
+
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
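To make the (major,minor) bookkeeping in this hunk concrete, here is a small standalone sketch of the same sgemm mapping, with made-up sizes and data; it mirrors the !ggml_is_transposed(src1) branch above (CblasTrans, lda = m):

    // dst(m,n) = src1(k,m)^T * src0(k,n), all row-major and contiguous.
    #include <cblas.h>
    #include <stdio.h>

    int main(void) {
        enum { k = 2, n = 3, m = 2 };
        float src0[k*n] = { 1, 2, 3,  4, 5, 6 }; // (k,n)
        float src1[k*m] = { 1, 0,  0, 1 };       // (k,m): identity, so dst == src0
        float dst [m*n] = { 0 };
        cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                    m, n, k, 1.0f, src1, /*lda=*/m, src0, /*ldb=*/n, 0.0f, dst, /*ldc=*/n);
        for (int i = 0; i < m*n; ++i) printf("%g ", dst[i]); // prints 1 2 3 4 5 6
        printf("\n");
        return 0;
    }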
@@ -10498,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
 static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
     // TODO: handle transposed/permuted matrices
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
 
@@ -10522,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+        ggml_vec_cpy_f32  (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp) {
+            ggml_vec_acc_f32(nc, wp, mp);
+        }
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(sp[i]));
+            assert(!isnan(wp[i]));
         }
 #endif
 
         float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, sp);
+        ggml_vec_max_f32(nc, &max, wp);
 
         ggml_float sum = 0.0;
 
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            if (sp[i] == -INFINITY) {
+            if (wp[i] == -INFINITY) {
                 dp[i] = 0.0f;
             } else {
-                // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
+                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                 sum += (ggml_float)val;
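For reference, each row processed above is equivalent to the following naive scalar version (a sketch for clarity only; the real kernel uses ggml's fp16 exp lookup table instead of expf):

    #include <math.h>

    // dp = soft_max(sp*scale + mp) over one row of nc floats; mp may be NULL.
    static void soft_max_row_ref(int nc, const float * sp, const float * mp,
                                 float scale, float * dp) {
        float max = -INFINITY;
        for (int i = 0; i < nc; i++) {
            const float v = sp[i]*scale + (mp ? mp[i] : 0.0f);
            if (v > max) max = v;
        }
        double sum = 0.0;
        for (int i = 0; i < nc; i++) {
            const float v = sp[i]*scale + (mp ? mp[i] : 0.0f);
            dp[i] = (v == -INFINITY) ? 0.0f : expf(v - max);
            sum += dp[i];
        }
        for (int i = 0; i < nc; i++) {
            dp[i] /= (float) sum;
        }
    }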
@@ -10569,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
 static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -13810,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -15636,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
         default:
             {
-                printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
                 GGML_ASSERT(false);
             } break;
     }
@@ -15845,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                 }
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+                cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 GGML_ASSERT(node->src[0]->ne[3] == 1);
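A worked example of the new scratch sizing, with an assumed shape: for node->src[0] of shape [4096, 512] on 8 threads,

    // n_tasks = MIN(MIN(4, 8), 512)      = 4
    // cur     = sizeof(float) * 4096 * 4 = 65536 bytes
    // i.e. one f32 scratch row (the wp buffer in
    // ggml_compute_forward_soft_max_f32) per participating thread.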
@@ -18399,24 +18498,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
 }
 
 const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].key.data;
 }
 
 enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].type;
 }
 
 enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.type;
 }
 
 const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.data;
 }
 
 const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     struct gguf_kv * kv = &ctx->kv[key_id];
     struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
@@ -18424,70 +18528,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
 }
 
 int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.n;
 }
 
 uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
     return ctx->kv[key_id].value.uint8;
 }
 
 int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
     return ctx->kv[key_id].value.int8;
 }
 
 uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
     return ctx->kv[key_id].value.uint16;
 }
 
 int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
     return ctx->kv[key_id].value.int16;
 }
 
 uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
     return ctx->kv[key_id].value.uint32;
 }
 
 int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
     return ctx->kv[key_id].value.int32;
 }
 
 float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
     return ctx->kv[key_id].value.float32;
 }
 
 uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
     return ctx->kv[key_id].value.uint64;
 }
 
 int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
     return ctx->kv[key_id].value.int64;
 }
 
 double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
     return ctx->kv[key_id].value.float64;
 }
 
 bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
     return ctx->kv[key_id].value.bool_;
 }
 
 const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
     return ctx->kv[key_id].value.str.data;
 }
 
+const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
+    return &ctx->kv[key_id].value;
+}
+
 int gguf_get_n_tensors(const struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
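The new gguf_get_val_data returns a raw pointer into the KV union for any scalar value, leaving type dispatch to the caller. A hypothetical usage sketch (read_u32_kv is illustrative, not part of the API):

    #include <stdint.h>
    #include <string.h>

    // Read a UINT32 key via the generic accessor instead of gguf_get_val_u32.
    static uint32_t read_u32_kv(const struct gguf_context * ctx, const char * key) {
        const int id = gguf_find_key(ctx, key);
        GGML_ASSERT(id >= 0); // key must exist
        GGML_ASSERT(gguf_get_kv_type(ctx, id) == GGUF_TYPE_UINT32);
        uint32_t v;
        memcpy(&v, gguf_get_val_data(ctx, id), sizeof(v));
        return v;
    }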
@@ -244,11 +244,10 @@
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
             fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             ggml_print_backtrace(); \
-            exit(1); \
+            abort(); \
         } \
     } while (0)
 
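A note on the abort() swap above (standard C semantics, not something this diff states):

    // abort() raises SIGABRT: atexit handlers do not run, and a debugger or
    // core dump captures the failing frame, complementing the
    // ggml_print_backtrace() call. exit(1) would terminate "cleanly" and
    // lose that context; flushing stdout first keeps buffered output visible.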
@@ -1283,6 +1282,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // fused soft_max(a*scale + mask)
+    // mask is optional
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2045,6 +2052,7 @@ extern "C" {
     GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
     GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
     GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);