llama_cpp 0.9.3 → 0.9.5
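
This release updates the bundled ggml sources (the hunks below appear to be from ggml.c and ggml.h). Highlights: a new fused ggml_soft_max_ext op computing soft_max(a*scale + mask); a BLAS (Accelerate/OpenBLAS) fast path for ggml_compute_forward_out_prod_f32; bounds-checking asserts on all gguf key/value getters plus a new gguf_get_val_data accessor; and GGML_ASSERT now aborts instead of calling exit(1).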

@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale,
         bool                  inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;
 
     return result;
 }
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, false);
 }
 
 // ggml_soft_max_back
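
The new ggml_soft_max_ext builds the fused op at graph-construction time: the scale lands in op_params and the optional mask becomes src[1]. A minimal usage sketch for the attention-score case (the names kq, kq_mask, and n_embd_head are illustrative, not from this diff):

    // fused equivalent of: scale kq by 1/sqrt(head_dim), add the causal/padding
    // mask (rows of 0.0f and -INFINITY), then take a row-wise soft_max
    struct ggml_tensor * probs =
        ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));

Because the impl requires ggml_can_repeat_rows(mask, a), a single mask row can be broadcast across several rows of a, which is what the (i1%ne11) indexing in the forward kernel further down implements.
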
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-        src0->type == GGML_TYPE_F32 &&
+      //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
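Note: with the src0->type == GGML_TYPE_F32 check commented out, ggml_compute_forward_mul_mat_use_blas now accepts non-F32 src0 as well (src1 must still be F32 and every dimension at least 32); presumably the surrounding mul_mat code converts F16/quantized src0 rows to F32 before handing them to BLAS, but that conversion is outside this diff.
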
@@ -9611,10 +9633,12 @@ static void ggml_compute_forward_out_prod_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_ASSERT(ne0  == ne00);
+    GGML_ASSERT(ne1  == ne10);
+    GGML_ASSERT(ne2  == ne02);
     GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
     GGML_ASSERT(ne3  == ne13);
+    GGML_ASSERT(ne03 == ne13);
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9625,18 +9649,25 @@ static void ggml_compute_forward_out_prod_f32(
     // GGML_ASSERT(nb1 <= nb2);
     // GGML_ASSERT(nb2 <= nb3);
 
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+    // TODO: #if defined(GGML_USE_CLBLAST)
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    bool use_blas = ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+#endif
 
     if (params->type == GGML_TASK_INIT) {
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
+        if (use_blas) {
+            return;
+        }
+#endif
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -9645,6 +9676,50 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    if (use_blas) {
+        if (params->ith != 0) { // All threads other than the first do no work.
+            return;
+        }
+        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+        // src0: (k,n)
+        // src1: (k,m)
+        // dst:  (m,n)
+        //
+        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+        // Also expressed as (major,minor)
+        // a: (m,k): so src1 transposed
+        // b: (k,n): so src0
+        // c: (m,n)
+        //
+        // However, if ggml_is_transposed(src1) is true, then
+        // src1->data already contains a transposed version, so sgemm mustn't
+        // transpose it further.
+
+        int n = src0->ne[0];
+        int k = src0->ne[1];
+        int m = src1->ne[0];
+
+        int transposeA, lda;
+
+        if (!ggml_is_transposed(src1)) {
+            transposeA = CblasTrans;
+            lda = m;
+        } else {
+            transposeA = CblasNoTrans;
+            lda = k;
+        }
+
+        float * a = (float *) ((char *) src1->data);
+        float * b = (float *) ((char *) src0->data);
+        float * c = (float *) ((char *) dst->data);
+
+        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+
+        return;
+    }
+#endif
+
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
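
As a sanity check on the index mapping in the comment above, this is the naive loop the cblas_sgemm call replaces in the non-transposed-src1 case (a sketch, not part of the patch; m, n, k, a, b, c as defined in the hunk):

    // reference for what the sgemm call computes when src1 is not transposed:
    // c (m x n, row-major) = a^T * b, with a stored k x m and b stored k x n
    static void out_prod_ref(int m, int n, int k,
                             const float * a, const float * b, float * c) {
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
                float sum = 0.0f;
                for (int l = 0; l < k; l++) {
                    sum += a[l*m + i] * b[l*n + j];
                }
                c[i*n + j] = sum; // beta == 0.0 overwrites dst, which is why the
                                  // BLAS path skips the GGML_TASK_INIT zeroing
            }
        }
    }
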
@@ -10498,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
 static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
     // TODO: handle transposed/permuted matrices
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
@@ -10522,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *dp = (float *)((char *)  dst->data + i1*dst->nb[1]);
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *)  dst->data + i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+        ggml_vec_cpy_f32  (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp) {
+            ggml_vec_acc_f32(nc, wp, mp);
+        }
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(sp[i]));
+            assert(!isnan(wp[i]));
         }
 #endif
 
         float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, sp);
+        ggml_vec_max_f32(nc, &max, wp);
 
         ggml_float sum = 0.0;
 
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            if (sp[i] == -INFINITY) {
+            if (wp[i] == -INFINITY) {
                 dp[i] = 0.0f;
             } else {
-                // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
+                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                 sum += (ggml_float)val;
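
Per row, the kernel above is equivalent to this scalar reference (a sketch: the real code vectorizes the copy/scale/add and computes exp through the ggml_table_exp_f16 lookup, and the final 1/sum normalization follows right after the truncated hunk):

    #include <math.h> // expf, INFINITY

    // reference for one row of nc elements: dp = soft_max(scale*sp + mp)
    static void soft_max_row_ref(int nc, float * dp, const float * sp,
                                 const float * mp, float scale, float * wp) {
        float max = -INFINITY;
        for (int i = 0; i < nc; i++) {
            wp[i] = scale*sp[i] + (mp ? mp[i] : 0.0f);
            if (wp[i] > max) max = wp[i];
        }
        double sum = 0.0;
        for (int i = 0; i < nc; i++) {
            dp[i] = (wp[i] == -INFINITY) ? 0.0f : expf(wp[i] - max); // masked -> exact 0
            sum += dp[i];
        }
        for (int i = 0; i < nc; i++) {
            dp[i] /= (float) sum;
        }
    }
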
@@ -10569,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
 static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -13810,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -15636,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
         default:
             {
-                printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
                 GGML_ASSERT(false);
             } break;
     }
@@ -15845,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                 }
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+                cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 GGML_ASSERT(node->src[0]->ne[3] == 1);
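
This sizing pairs with the wp scratch pointer in ggml_compute_forward_soft_max_f32: each of the (at most 4) soft_max threads gets a working row of ne[0] floats, e.g. 4 B × 4096 × 4 = 64 KiB of cplan work data for a 4096-wide tensor. The CACHE_LINE_SIZE_F32 padding that wp skips between rows is assumed to be covered by the cache-line slack ggml_graph_plan adds on top of any non-zero work size.
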
@@ -18399,24 +18498,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
 }
 
 const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].key.data;
 }
 
 enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].type;
 }
 
 enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.type;
 }
 
 const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.data;
 }
 
 const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     struct gguf_kv * kv = &ctx->kv[key_id];
     struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
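
These getters now fail loudly on an out-of-range id instead of silently indexing past ctx->kv. Since gguf_find_key (visible in the hunk header) returns a negative id for a missing key, callers should check before dereferencing; a minimal sketch (the key name is just an example):

    // a negative id from gguf_find_key now trips the new GGML_ASSERT
    // if passed straight into a getter, so guard the lookup
    const int key_id = gguf_find_key(ctx, "general.architecture");
    if (key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_STRING) {
        printf("arch: %s\n", gguf_get_val_str(ctx, key_id));
    }
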
@@ -18424,70 +18528,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
 }
 
 int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.n;
 }
 
 uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
     return ctx->kv[key_id].value.uint8;
 }
 
 int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
     return ctx->kv[key_id].value.int8;
 }
 
 uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
     return ctx->kv[key_id].value.uint16;
 }
 
 int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
     return ctx->kv[key_id].value.int16;
 }
 
 uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
     return ctx->kv[key_id].value.uint32;
 }
 
 int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
     return ctx->kv[key_id].value.int32;
 }
 
 float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
     return ctx->kv[key_id].value.float32;
 }
 
 uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
     return ctx->kv[key_id].value.uint64;
 }
 
 int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
     return ctx->kv[key_id].value.int64;
 }
 
 double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
     return ctx->kv[key_id].value.float64;
 }
 
 bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
     return ctx->kv[key_id].value.bool_;
 }
 
 const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
     return ctx->kv[key_id].value.str.data;
 }
 
+const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
+    return &ctx->kv[key_id].value;
+}
+
 int gguf_get_n_tensors(const struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
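
The new gguf_get_val_data returns a pointer into the kv value union for scalar types only; arrays and strings are excluded because their union members hold a length plus a pointer to separately allocated data rather than the value bytes themselves. A usage sketch (assumes key_id was already found and type-checked):

    // generic scalar read: gguf_get_kv_type() dictates how many bytes are valid
    if (gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_UINT32) {
        uint32_t v;
        memcpy(&v, gguf_get_val_data(ctx, key_id), sizeof(v));
    }
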
@@ -244,11 +244,10 @@
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
             fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             ggml_print_backtrace(); \
-            exit(1); \
+            abort(); \
         } \
     } while (0)
 
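Two behavioral changes in the macro: stdout is now flushed before the assert message is written (so the message cannot interleave with buffered program output; stderr is typically unbuffered, making the dropped fflush(stderr) harmless), and abort() replaces exit(1), raising SIGABRT so a failed assert produces a core dump or stops in an attached debugger instead of exiting cleanly.
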
@@ -1283,6 +1282,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // fused soft_max(a*scale + mask)
+    // mask is optional
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2045,6 +2052,7 @@ extern "C" {
     GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
     GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
     GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);