llama_cpp 0.9.4 → 0.10.0

This diff shows the changes between the two publicly released package versions, as they appear in their public registry, and is provided for informational purposes only.
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
233
233
  #define UNUSED GGML_UNUSED
234
234
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
235
235
 
236
- //
237
- // tensor access macros
238
- //
239
-
240
- #define GGML_TENSOR_UNARY_OP_LOCALS \
241
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
242
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
243
- GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
244
- GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
245
-
246
- #define GGML_TENSOR_BINARY_OP_LOCALS \
247
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
248
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
249
- GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
250
- GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
251
- GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
252
- GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
253
-
254
236
  #if defined(GGML_USE_ACCELERATE)
255
237
  #include <Accelerate/Accelerate.h>
256
238
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1613
1595
  "GROUP_NORM",
1614
1596
 
1615
1597
  "MUL_MAT",
1598
+ "MUL_MAT_ID",
1616
1599
  "OUT_PROD",
1617
1600
 
1618
1601
  "SCALE",
@@ -1640,6 +1623,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1640
1623
  "POOL_1D",
1641
1624
  "POOL_2D",
1642
1625
  "UPSCALE",
1626
+ "ARGSORT",
1643
1627
 
1644
1628
  "FLASH_ATTN",
1645
1629
  "FLASH_FF",
@@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1666
1650
  "CROSS_ENTROPY_LOSS_BACK",
1667
1651
  };
1668
1652
 
1669
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1653
+ static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
1670
1654
 
1671
1655
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1672
1656
  "none",
@@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1695
1679
  "group_norm(x)",
1696
1680
 
1697
1681
  "X*Y",
1682
+ "X[i]*Y",
1698
1683
  "X*Y",
1699
1684
 
1700
1685
  "x*v",
@@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1722
1707
  "pool_1d(x)",
1723
1708
  "pool_2d(x)",
1724
1709
  "upscale(x)",
1710
+ "argsort(x)",
1725
1711
 
1726
1712
  "flash_attn(x)",
1727
1713
  "flash_ff(x)",
@@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1748
1734
  "cross_entropy_loss_back(x,y)",
1749
1735
  };
1750
1736
 
1751
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1737
+ static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
1752
1738
 
1753
1739
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1754
1740
 
1741
+
1742
+ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1743
+ "ABS",
1744
+ "SGN",
1745
+ "NEG",
1746
+ "STEP",
1747
+ "TANH",
1748
+ "ELU",
1749
+ "RELU",
1750
+ "GELU",
1751
+ "GELU_QUICK",
1752
+ "SILU",
1753
+ "LEAKY",
1754
+ };
1755
+
1756
+ static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
1757
+
1758
+
1755
1759
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
1756
1760
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
1757
1761
 
@@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {
1771
1775
 
1772
1776
  p[GGML_OP_ACC ] = true;
1773
1777
  p[GGML_OP_MUL_MAT ] = true;
1778
+ p[GGML_OP_MUL_MAT_ID ] = true;
1774
1779
  p[GGML_OP_OUT_PROD ] = true;
1775
1780
  p[GGML_OP_SET ] = true;
1776
1781
  p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
2023
2028
  return GGML_OP_SYMBOL[op];
2024
2029
  }
2025
2030
 
2031
+ const char * ggml_unary_op_name(enum ggml_unary_op op) {
2032
+ return GGML_UNARY_OP_NAME[op];
2033
+ }
2034
+
2035
+ const char * ggml_op_desc(const struct ggml_tensor * t) {
2036
+ if (t->op == GGML_OP_UNARY) {
2037
+ enum ggml_unary_op uop = ggml_get_unary_op(t);
2038
+ return ggml_unary_op_name(uop);
2039
+ }
2040
+ else {
2041
+ return ggml_op_name(t->op);
2042
+ }
2043
+ }
2044
+
2026
2045
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
2027
2046
  return ggml_type_size(tensor->type);
2028
2047
  }
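
The new ggml_op_desc() helper resolves GGML_OP_UNARY nodes to the name of their specific unary op via ggml_unary_op_name(), instead of the generic "UNARY". A minimal sketch of how a caller might use it when dumping a graph; print_graph_ops is a hypothetical helper and the ggml_cgraph fields are the ones exposed by ggml.h:

    #include <stdio.h>
    #include "ggml.h"

    // print a one-line description of every node, e.g. "RELU" instead of "UNARY"
    static void print_graph_ops(const struct ggml_cgraph * gf) {
        for (int i = 0; i < gf->n_nodes; ++i) {
            const struct ggml_tensor * node = gf->nodes[i];
            printf("node %3d: %-12s %s\n", i, ggml_op_desc(node), node->name);
        }
    }
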
@@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
3154
3173
  struct ggml_tensor * a,
3155
3174
  struct ggml_tensor * b,
3156
3175
  bool inplace) {
3157
- // TODO: support less-strict constraint
3158
- // GGML_ASSERT(ggml_can_repeat(b, a));
3159
- GGML_ASSERT(ggml_can_repeat_rows(b, a));
3176
+ GGML_ASSERT(ggml_can_repeat(b, a));
3160
3177
 
3161
3178
  bool is_node = false;
3162
3179
 
@@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
3371
3388
  struct ggml_tensor * a,
3372
3389
  struct ggml_tensor * b,
3373
3390
  bool inplace) {
3374
- // TODO: support less-strict constraint
3375
- // GGML_ASSERT(ggml_can_repeat(b, a));
3376
- GGML_ASSERT(ggml_can_repeat_rows(b, a));
3391
+ GGML_ASSERT(ggml_can_repeat(b, a));
3377
3392
 
3378
3393
  bool is_node = false;
3379
3394
 
@@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
3418
3433
  struct ggml_tensor * a,
3419
3434
  struct ggml_tensor * b,
3420
3435
  bool inplace) {
3421
- GGML_ASSERT(ggml_are_same_shape(a, b));
3436
+ GGML_ASSERT(ggml_can_repeat(b, a));
3422
3437
 
3423
3438
  bool is_node = false;
3424
3439
 
@@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
4056
4071
  return result;
4057
4072
  }
4058
4073
 
4074
+ // ggml_mul_mat_id
4075
+
4076
+ struct ggml_tensor * ggml_mul_mat_id(
4077
+ struct ggml_context * ctx,
4078
+ struct ggml_tensor * as[],
4079
+ struct ggml_tensor * ids,
4080
+ int id,
4081
+ struct ggml_tensor * b) {
4082
+
4083
+ int64_t n_as = ids->ne[0];
4084
+
4085
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
4086
+ GGML_ASSERT(ggml_is_vector(ids));
4087
+ GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
4088
+ GGML_ASSERT(id >= 0 && id < n_as);
4089
+
4090
+ bool is_node = false;
4091
+
4092
+ if (as[0]->grad || b->grad) {
4093
+ is_node = true;
4094
+ }
4095
+
4096
+ const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4097
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
4098
+
4099
+ ggml_set_op_params_i32(result, 0, id);
4100
+
4101
+ result->op = GGML_OP_MUL_MAT_ID;
4102
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4103
+ result->src[0] = ids;
4104
+ result->src[1] = b;
4105
+
4106
+ for (int64_t i = 0; i < n_as; i++) {
4107
+ struct ggml_tensor * a = as[i];
4108
+ GGML_ASSERT(ggml_are_same_shape(as[0], a));
4109
+ GGML_ASSERT(ggml_can_mul_mat(a, b));
4110
+ GGML_ASSERT(!ggml_is_transposed(a));
4111
+ result->src[i + 2] = a;
4112
+ }
4113
+
4114
+ return result;
4115
+ }
4116
+
4059
4117
  // ggml_out_prod
4060
4118
 
4061
4119
  struct ggml_tensor * ggml_out_prod(
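
ggml_mul_mat_id() selects one of several weight matrices at graph-evaluation time: the I32 vector ids holds candidate matrix indices, the op parameter id picks which entry of ids to read, and the chosen matrix as[ids[id]] is multiplied with b exactly like ggml_mul_mat(). A minimal sketch under stated assumptions; N_EXPERT, ffn_up and cur are illustrative names, and ids->data must be filled in before the graph is computed:

    #include "ggml.h"

    #define N_EXPERT 4 // must not exceed GGML_MAX_SRC - 2

    // cur:    activations, [n_embd, n_tokens]
    // ffn_up: N_EXPERT expert matrices, each [n_embd, n_ff]
    static struct ggml_tensor * build_expert_matmul(
            struct ggml_context * ctx,
            struct ggml_tensor  * ffn_up[N_EXPERT],
            struct ggml_tensor  * cur) {
        // the expert indices in ids->data are read on the CPU during graph compute,
        // so set ((int32_t *) ids->data)[0..N_EXPERT-1] before evaluating the graph
        struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N_EXPERT);

        const int id = 0; // use the expert index stored at ids[0]

        // behaves like ggml_mul_mat(ctx, ffn_up[ids[id]], cur) -> [n_ff, n_tokens]
        return ggml_mul_mat_id(ctx, ffn_up, ids, id, cur);
    }
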
@@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
4209
4267
  struct ggml_tensor * b,
4210
4268
  size_t nb1,
4211
4269
  size_t offset) {
4212
- return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
4270
+ return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
4213
4271
  }
4214
4272
 
4215
4273
  // ggml_cpy
@@ -4826,7 +4884,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
4826
4884
  static struct ggml_tensor * ggml_soft_max_impl(
4827
4885
  struct ggml_context * ctx,
4828
4886
  struct ggml_tensor * a,
4887
+ struct ggml_tensor * mask,
4888
+ float scale,
4829
4889
  bool inplace) {
4890
+ GGML_ASSERT(ggml_is_contiguous(a));
4891
+ if (mask) {
4892
+ GGML_ASSERT(ggml_is_contiguous(mask));
4893
+ GGML_ASSERT(mask->ne[2] == 1);
4894
+ GGML_ASSERT(mask->ne[3] == 1);
4895
+ GGML_ASSERT(ggml_can_repeat_rows(mask, a));
4896
+ }
4897
+
4830
4898
  bool is_node = false;
4831
4899
 
4832
4900
  if (a->grad) {
@@ -4835,9 +4903,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
4835
4903
 
4836
4904
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4837
4905
 
4906
+ float params[] = { scale };
4907
+ ggml_set_op_params(result, params, sizeof(params));
4908
+
4838
4909
  result->op = GGML_OP_SOFT_MAX;
4839
4910
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4840
4911
  result->src[0] = a;
4912
+ result->src[1] = mask;
4841
4913
 
4842
4914
  return result;
4843
4915
  }
@@ -4845,13 +4917,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
4845
4917
  struct ggml_tensor * ggml_soft_max(
4846
4918
  struct ggml_context * ctx,
4847
4919
  struct ggml_tensor * a) {
4848
- return ggml_soft_max_impl(ctx, a, false);
4920
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
4849
4921
  }
4850
4922
 
4851
4923
  struct ggml_tensor * ggml_soft_max_inplace(
4852
4924
  struct ggml_context * ctx,
4853
4925
  struct ggml_tensor * a) {
4854
- return ggml_soft_max_impl(ctx, a, true);
4926
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
4927
+ }
4928
+
4929
+ struct ggml_tensor * ggml_soft_max_ext(
4930
+ struct ggml_context * ctx,
4931
+ struct ggml_tensor * a,
4932
+ struct ggml_tensor * mask,
4933
+ float scale) {
4934
+ return ggml_soft_max_impl(ctx, a, mask, scale, false);
4855
4935
  }
4856
4936
 
4857
4937
  // ggml_soft_max_back
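
ggml_soft_max_ext() fuses the common attention pattern of scaling the KQ scores, adding a mask and taking the softmax into a single op; the mask is broadcast across rows, and passing mask = NULL with scale = 1.0f reproduces plain ggml_soft_max(). A hedged sketch, with kq, kq_mask and kq_scale as illustrative names:

    // fused form of softmax(kq * kq_scale + kq_mask), computed row by row;
    // kq_mask must be F32, contiguous, and broadcastable across the rows of kq
    struct ggml_tensor * kq_soft = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
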
@@ -5446,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
5446
5526
  return ggml_upscale_impl(ctx, a, scale_factor);
5447
5527
  }
5448
5528
 
5529
+ // ggml_argsort
5530
+
5531
+ struct ggml_tensor * ggml_argsort(
5532
+ struct ggml_context * ctx,
5533
+ struct ggml_tensor * a,
5534
+ enum ggml_sort_order order) {
5535
+ bool is_node = false;
5536
+
5537
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
5538
+
5539
+ ggml_set_op_params_i32(result, 0, (int32_t) order);
5540
+
5541
+ result->op = GGML_OP_ARGSORT;
5542
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5543
+ result->src[0] = a;
5544
+
5545
+ return result;
5546
+ }
5547
+
5548
+ // ggml_top_k
5549
+
5550
+ struct ggml_tensor * ggml_top_k(
5551
+ struct ggml_context * ctx,
5552
+ struct ggml_tensor * a,
5553
+ int k) {
5554
+ GGML_ASSERT(a->ne[0] >= k);
5555
+
5556
+ struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
5557
+
5558
+ result = ggml_view_4d(ctx, result,
5559
+ k, result->ne[1], result->ne[2], result->ne[3],
5560
+ result->nb[1], result->nb[2], result->nb[3],
5561
+ 0);
5562
+
5563
+ return result;
5564
+ }
5565
+
5449
5566
  // ggml_flash_attn
5450
5567
 
5451
5568
  struct ggml_tensor * ggml_flash_attn(
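
ggml_argsort() produces, for each row, the I32 permutation that sorts that row, and ggml_top_k() is a k-wide view over a descending argsort, so it returns indices rather than values. A small sketch; logits is an illustrative name:

    // indices of the 10 largest entries of each row of logits, shape [10, n_rows, ...]
    struct ggml_tensor * top_idx = ggml_top_k(ctx, logits, 10);

    // full ascending permutation of each row
    struct ggml_tensor * order = ggml_argsort(ctx, logits, GGML_SORT_ASC);
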
@@ -6805,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
6805
6922
  const struct ggml_tensor * src0,
6806
6923
  const struct ggml_tensor * src1,
6807
6924
  struct ggml_tensor * dst) {
6808
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
6925
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
6809
6926
 
6810
6927
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6811
6928
  return;
@@ -6838,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
6838
6955
  const int64_t i13 = i03 % ne13;
6839
6956
  const int64_t i12 = i02 % ne12;
6840
6957
  const int64_t i11 = i01 % ne11;
6958
+ const int64_t nr0 = ne00 / ne10;
6841
6959
 
6842
6960
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
6843
6961
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
6844
6962
  float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
6845
6963
 
6964
+ for (int64_t r = 0; r < nr0; ++r) {
6846
6965
  #ifdef GGML_USE_ACCELERATE
6847
- vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
6966
+ vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
6848
6967
  #else
6849
- ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
6968
+ ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
6850
6969
  #endif
6970
+ }
6851
6971
  }
6852
6972
  } else {
6853
6973
  // src1 is not contiguous
@@ -6864,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
6864
6984
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
6865
6985
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
6866
6986
 
6867
- for (int i0 = 0; i0 < ne0; i0++) {
6868
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
6987
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
6988
+ const int64_t i10 = i0 % ne10;
6989
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
6869
6990
 
6870
6991
  dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
6871
6992
  }
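
Since ADD (and, below, MUL and DIV) now assert ggml_can_repeat(src1, src0) instead of ggml_can_repeat_rows(), src1 may also broadcast inside a row as long as ne00 is a multiple of ne10, which is what the nr0 loop above implements. A hedged example of a shape that is newly accepted; ctx, n_embd and n_tokens are assumed to be defined:

    // a: [n_embd, n_tokens], b: [1, n_tokens] -> b broadcasts along dim 0
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1,      n_tokens);
    struct ggml_tensor * c = ggml_mul(ctx, a, b); // row i of a scaled by the single value b[i]
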
@@ -7585,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
7585
7706
  const struct ggml_tensor * src0,
7586
7707
  const struct ggml_tensor * src1,
7587
7708
  struct ggml_tensor * dst) {
7588
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
7709
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7589
7710
 
7590
7711
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7591
7712
  return;
@@ -7608,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(
7608
7729
 
7609
7730
  GGML_ASSERT( nb0 == sizeof(float));
7610
7731
  GGML_ASSERT(nb00 == sizeof(float));
7611
- GGML_ASSERT(ne00 == ne10);
7612
7732
 
7613
7733
  if (nb10 == sizeof(float)) {
7614
7734
  for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7620,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
7620
7740
  const int64_t i13 = i03 % ne13;
7621
7741
  const int64_t i12 = i02 % ne12;
7622
7742
  const int64_t i11 = i01 % ne11;
7743
+ const int64_t nr0 = ne00 / ne10;
7623
7744
 
7624
7745
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7625
7746
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7626
7747
  float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
7627
7748
 
7749
+ for (int64_t r = 0 ; r < nr0; ++r) {
7628
7750
  #ifdef GGML_USE_ACCELERATE
7629
- UNUSED(ggml_vec_mul_f32);
7751
+ UNUSED(ggml_vec_mul_f32);
7630
7752
 
7631
- vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
7753
+ vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
7632
7754
  #else
7633
- ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
7755
+ ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
7634
7756
  #endif
7635
- // }
7636
- // }
7757
+ }
7637
7758
  }
7638
7759
  } else {
7639
7760
  // src1 is not contiguous
@@ -7651,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
7651
7772
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7652
7773
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7653
7774
 
7654
- for (int64_t i0 = 0; i0 < ne00; i0++) {
7655
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
7775
+ for (int64_t i0 = 0; i0 < ne00; ++i0) {
7776
+ const int64_t i10 = i0 % ne10;
7777
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
7656
7778
 
7657
7779
  dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
7658
7780
  }
@@ -7686,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
7686
7808
  const struct ggml_tensor * src0,
7687
7809
  const struct ggml_tensor * src1,
7688
7810
  struct ggml_tensor * dst) {
7689
- assert(params->ith == 0);
7690
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7811
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7691
7812
 
7692
7813
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7693
7814
  return;
7694
7815
  }
7695
7816
 
7696
- const int nr = ggml_nrows(src0);
7817
+ const int ith = params->ith;
7818
+ const int nth = params->nth;
7819
+
7820
+ const int64_t nr = ggml_nrows(src0);
7697
7821
 
7698
7822
  GGML_TENSOR_BINARY_OP_LOCALS
7699
7823
 
@@ -7701,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
7701
7825
  GGML_ASSERT(nb00 == sizeof(float));
7702
7826
 
7703
7827
  if (nb10 == sizeof(float)) {
7704
- for (int ir = 0; ir < nr; ++ir) {
7705
- // src0, src1 and dst are same shape => same indices
7706
- const int i3 = ir/(ne2*ne1);
7707
- const int i2 = (ir - i3*ne2*ne1)/ne1;
7708
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
7828
+ for (int64_t ir = ith; ir < nr; ir += nth) {
7829
+ // src0 and dst are same shape => same indices
7830
+ const int64_t i03 = ir/(ne02*ne01);
7831
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
7832
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
7833
+
7834
+ const int64_t i13 = i03 % ne13;
7835
+ const int64_t i12 = i02 % ne12;
7836
+ const int64_t i11 = i01 % ne11;
7837
+ const int64_t nr0 = ne00 / ne10;
7709
7838
 
7839
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7840
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7841
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
7842
+
7843
+ for (int64_t r = 0; r < nr0; ++r) {
7710
7844
  #ifdef GGML_USE_ACCELERATE
7711
- UNUSED(ggml_vec_div_f32);
7845
+ UNUSED(ggml_vec_div_f32);
7712
7846
 
7713
- vDSP_vdiv(
7714
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
7715
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
7716
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
7717
- ne0);
7847
+ vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
7718
7848
  #else
7719
- ggml_vec_div_f32(ne0,
7720
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
7721
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
7722
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
7849
+ ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
7723
7850
  #endif
7724
- // }
7725
- // }
7851
+ }
7726
7852
  }
7727
7853
  } else {
7728
7854
  // src1 is not contiguous
7729
- for (int ir = 0; ir < nr; ++ir) {
7730
- // src0, src1 and dst are same shape => same indices
7731
- const int i3 = ir/(ne2*ne1);
7732
- const int i2 = (ir - i3*ne2*ne1)/ne1;
7733
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
7855
+ for (int64_t ir = ith; ir < nr; ir += nth) {
7856
+ // src0 and dst are same shape => same indices
7857
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
7858
+ const int64_t i03 = ir/(ne02*ne01);
7859
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
7860
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
7734
7861
 
7735
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
7736
- float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
7737
- for (int i0 = 0; i0 < ne0; i0++) {
7738
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
7862
+ const int64_t i13 = i03 % ne13;
7863
+ const int64_t i12 = i02 % ne12;
7864
+ const int64_t i11 = i01 % ne11;
7865
+
7866
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7867
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7868
+
7869
+ for (int64_t i0 = 0; i0 < ne00; ++i0) {
7870
+ const int64_t i10 = i0 % ne10;
7871
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
7739
7872
 
7740
7873
  dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
7741
7874
  }
@@ -8181,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
8181
8314
  return;
8182
8315
  }
8183
8316
 
8184
- GGML_TENSOR_UNARY_OP_LOCALS;
8317
+ GGML_TENSOR_UNARY_OP_LOCALS
8185
8318
 
8186
8319
  // guaranteed to be an integer due to the check in ggml_can_repeat
8187
8320
  const int nr0 = (int)(ne0/ne00);
@@ -8326,6 +8459,7 @@ static void ggml_compute_forward_concat_f32(
8326
8459
  GGML_ASSERT(src0->nb[0] == sizeof(float));
8327
8460
 
8328
8461
  const int ith = params->ith;
8462
+ const int nth = params->nth;
8329
8463
 
8330
8464
  GGML_TENSOR_BINARY_OP_LOCALS
8331
8465
 
@@ -8335,7 +8469,7 @@ static void ggml_compute_forward_concat_f32(
8335
8469
  GGML_ASSERT(nb10 == sizeof(float));
8336
8470
 
8337
8471
  for (int i3 = 0; i3 < ne3; i3++) {
8338
- for (int i2 = ith; i2 < ne2; i2++) {
8472
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
8339
8473
  if (i2 < ne02) { // src0
8340
8474
  for (int i1 = 0; i1 < ne1; i1++) {
8341
8475
  for (int i0 = 0; i0 < ne0; i0++) {
@@ -9373,7 +9507,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9373
9507
  // TODO: find the optimal values for these
9374
9508
  if (ggml_is_contiguous(src0) &&
9375
9509
  ggml_is_contiguous(src1) &&
9376
- src0->type == GGML_TYPE_F32 &&
9510
+ //src0->type == GGML_TYPE_F32 &&
9377
9511
  src1->type == GGML_TYPE_F32 &&
9378
9512
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
9379
9513
 
@@ -9495,6 +9629,8 @@ static void ggml_compute_forward_mul_mat(
9495
9629
  char * wdata = params->wdata;
9496
9630
  const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9497
9631
 
9632
+ assert(params->wsize >= ne11*ne12*ne13*row_size);
9633
+
9498
9634
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
9499
9635
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
9500
9636
  for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -9596,6 +9732,26 @@ static void ggml_compute_forward_mul_mat(
9596
9732
  }
9597
9733
  }
9598
9734
 
9735
+ // ggml_compute_forward_mul_mat_id
9736
+
9737
+ static void ggml_compute_forward_mul_mat_id(
9738
+ const struct ggml_compute_params * params,
9739
+ struct ggml_tensor * dst) {
9740
+
9741
+ const struct ggml_tensor * ids = dst->src[0];
9742
+ const struct ggml_tensor * src1 = dst->src[1];
9743
+
9744
+ const int id = ggml_get_op_params_i32(dst, 0);
9745
+
9746
+ const int a_id = ((int32_t *)ids->data)[id];
9747
+
9748
+ GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
9749
+
9750
+ const struct ggml_tensor * src0 = dst->src[a_id + 2];
9751
+
9752
+ ggml_compute_forward_mul_mat(params, src0, src1, dst);
9753
+ }
9754
+
9599
9755
  // ggml_compute_forward_out_prod
9600
9756
 
9601
9757
  static void ggml_compute_forward_out_prod_f32(
@@ -10551,20 +10707,25 @@ static void ggml_compute_forward_diag_mask_zero(
10551
10707
  static void ggml_compute_forward_soft_max_f32(
10552
10708
  const struct ggml_compute_params * params,
10553
10709
  const struct ggml_tensor * src0,
10554
- struct ggml_tensor * dst) {
10555
- GGML_ASSERT(ggml_is_contiguous(src0));
10556
- GGML_ASSERT(ggml_is_contiguous(dst));
10557
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
10710
+ const struct ggml_tensor * src1,
10711
+ struct ggml_tensor * dst) {
10712
+ assert(ggml_is_contiguous(dst));
10713
+ assert(ggml_are_same_shape(src0, dst));
10558
10714
 
10559
10715
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10560
10716
  return;
10561
10717
  }
10562
10718
 
10719
+ float scale = 1.0f;
10720
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
10721
+
10563
10722
  // TODO: handle transposed/permuted matrices
10564
10723
 
10565
10724
  const int ith = params->ith;
10566
10725
  const int nth = params->nth;
10567
10726
 
10727
+ const int64_t ne11 = src1 ? src1->ne[1] : 1;
10728
+
10568
10729
  const int nc = src0->ne[0];
10569
10730
  const int nr = ggml_nrows(src0);
10570
10731
 
@@ -10575,29 +10736,40 @@ static void ggml_compute_forward_soft_max_f32(
10575
10736
  const int ir0 = dr*ith;
10576
10737
  const int ir1 = MIN(ir0 + dr, nr);
10577
10738
 
10739
+ float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
10740
+
10578
10741
  for (int i1 = ir0; i1 < ir1; i1++) {
10579
- float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
10580
- float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
10742
+ float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
10743
+ float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
10744
+
10745
+ // broadcast the mask across rows
10746
+ float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
10747
+
10748
+ ggml_vec_cpy_f32 (nc, wp, sp);
10749
+ ggml_vec_scale_f32(nc, wp, scale);
10750
+ if (mp) {
10751
+ ggml_vec_acc_f32(nc, wp, mp);
10752
+ }
10581
10753
 
10582
10754
  #ifndef NDEBUG
10583
10755
  for (int i = 0; i < nc; ++i) {
10584
10756
  //printf("p[%d] = %f\n", i, p[i]);
10585
- assert(!isnan(sp[i]));
10757
+ assert(!isnan(wp[i]));
10586
10758
  }
10587
10759
  #endif
10588
10760
 
10589
10761
  float max = -INFINITY;
10590
- ggml_vec_max_f32(nc, &max, sp);
10762
+ ggml_vec_max_f32(nc, &max, wp);
10591
10763
 
10592
10764
  ggml_float sum = 0.0;
10593
10765
 
10594
10766
  uint16_t scvt;
10595
10767
  for (int i = 0; i < nc; i++) {
10596
- if (sp[i] == -INFINITY) {
10768
+ if (wp[i] == -INFINITY) {
10597
10769
  dp[i] = 0.0f;
10598
10770
  } else {
10599
- // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
10600
- ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
10771
+ // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
10772
+ ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
10601
10773
  memcpy(&scvt, &s, sizeof(scvt));
10602
10774
  const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
10603
10775
  sum += (ggml_float)val;
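
In formula form, with s the scale op-param and m the (optional) mask row added to the scaled row x, each output row computed above is

    y_i = \frac{\exp(s x_i + m_i - \max_j (s x_j + m_j))}{\sum_k \exp(s x_k + m_k - \max_j (s x_j + m_j))}

with entries equal to -inf mapped directly to 0.
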
@@ -10622,11 +10794,12 @@ static void ggml_compute_forward_soft_max_f32(
10622
10794
  static void ggml_compute_forward_soft_max(
10623
10795
  const struct ggml_compute_params * params,
10624
10796
  const struct ggml_tensor * src0,
10625
- struct ggml_tensor * dst) {
10797
+ const struct ggml_tensor * src1,
10798
+ struct ggml_tensor * dst) {
10626
10799
  switch (src0->type) {
10627
10800
  case GGML_TYPE_F32:
10628
10801
  {
10629
- ggml_compute_forward_soft_max_f32(params, src0, dst);
10802
+ ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
10630
10803
  } break;
10631
10804
  default:
10632
10805
  {
@@ -11982,6 +12155,67 @@ static void ggml_compute_forward_upscale(
11982
12155
  }
11983
12156
  }
11984
12157
 
12158
+ // ggml_compute_forward_argsort
12159
+
12160
+ static void ggml_compute_forward_argsort_f32(
12161
+ const struct ggml_compute_params * params,
12162
+ const struct ggml_tensor * src0,
12163
+ struct ggml_tensor * dst) {
12164
+
12165
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12166
+ return;
12167
+ }
12168
+
12169
+ GGML_TENSOR_UNARY_OP_LOCALS
12170
+
12171
+ GGML_ASSERT(nb0 == sizeof(float));
12172
+
12173
+ const int ith = params->ith;
12174
+ const int nth = params->nth;
12175
+
12176
+ const int64_t nr = ggml_nrows(src0);
12177
+
12178
+ enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
12179
+
12180
+ for (int64_t i = ith; i < nr; i += nth) {
12181
+ int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
12182
+ const float * src_data = (float *)((char *) src0->data + i*nb01);
12183
+
12184
+ for (int64_t j = 0; j < ne0; j++) {
12185
+ dst_data[j] = j;
12186
+ }
12187
+
12188
+ // C doesn't have a functional sort, so we do a bubble sort instead
12189
+ for (int64_t j = 0; j < ne0; j++) {
12190
+ for (int64_t k = j + 1; k < ne0; k++) {
12191
+ if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
12192
+ (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
12193
+ int32_t tmp = dst_data[j];
12194
+ dst_data[j] = dst_data[k];
12195
+ dst_data[k] = tmp;
12196
+ }
12197
+ }
12198
+ }
12199
+ }
12200
+ }
12201
+
12202
+ static void ggml_compute_forward_argsort(
12203
+ const struct ggml_compute_params * params,
12204
+ const struct ggml_tensor * src0,
12205
+ struct ggml_tensor * dst) {
12206
+
12207
+ switch (src0->type) {
12208
+ case GGML_TYPE_F32:
12209
+ {
12210
+ ggml_compute_forward_argsort_f32(params, src0, dst);
12211
+ } break;
12212
+ default:
12213
+ {
12214
+ GGML_ASSERT(false);
12215
+ } break;
12216
+ }
12217
+ }
12218
+
11985
12219
  // ggml_compute_forward_flash_attn
11986
12220
 
11987
12221
  static void ggml_compute_forward_flash_attn_f32(
@@ -13805,6 +14039,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
13805
14039
  {
13806
14040
  ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
13807
14041
  } break;
14042
+ case GGML_OP_MUL_MAT_ID:
14043
+ {
14044
+ ggml_compute_forward_mul_mat_id(params, tensor);
14045
+ } break;
13808
14046
  case GGML_OP_OUT_PROD:
13809
14047
  {
13810
14048
  ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
@@ -13863,7 +14101,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
13863
14101
  } break;
13864
14102
  case GGML_OP_SOFT_MAX:
13865
14103
  {
13866
- ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
14104
+ ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
13867
14105
  } break;
13868
14106
  case GGML_OP_SOFT_MAX_BACK:
13869
14107
  {
@@ -13909,6 +14147,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
13909
14147
  {
13910
14148
  ggml_compute_forward_upscale(params, tensor->src[0], tensor);
13911
14149
  } break;
14150
+ case GGML_OP_ARGSORT:
14151
+ {
14152
+ ggml_compute_forward_argsort(params, tensor->src[0], tensor);
14153
+ } break;
13912
14154
  case GGML_OP_FLASH_ATTN:
13913
14155
  {
13914
14156
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14559,6 +14801,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14559
14801
  zero_table);
14560
14802
  }
14561
14803
  } break;
14804
+ case GGML_OP_MUL_MAT_ID:
14805
+ {
14806
+ GGML_ASSERT(false); // TODO: not implemented
14807
+ } break;
14562
14808
  case GGML_OP_OUT_PROD:
14563
14809
  {
14564
14810
  GGML_ASSERT(false); // TODO: not implemented
@@ -14897,6 +15143,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14897
15143
  {
14898
15144
  GGML_ASSERT(false); // TODO: not implemented
14899
15145
  } break;
15146
+ case GGML_OP_ARGSORT:
15147
+ {
15148
+ GGML_ASSERT(false); // TODO: not implemented
15149
+ } break;
14900
15150
  case GGML_OP_FLASH_ATTN:
14901
15151
  {
14902
15152
  struct ggml_tensor * flash_grad = NULL;
@@ -15257,12 +15507,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15257
15507
  return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15258
15508
  }
15259
15509
 
15260
- struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15261
- const size_t obj_size = sizeof(struct ggml_cgraph);
15262
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15263
- struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15264
-
15265
- *cgraph = (struct ggml_cgraph) {
15510
+ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
15511
+ struct ggml_cgraph cgraph = {
15266
15512
  /*.size =*/ 0,
15267
15513
  /*.n_nodes =*/ i1 - i0,
15268
15514
  /*.n_leafs =*/ 0,
@@ -15497,7 +15743,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15497
15743
  n_tasks = n_threads;
15498
15744
  } break;
15499
15745
  case GGML_OP_SUB:
15500
- case GGML_OP_DIV:
15501
15746
  case GGML_OP_SQR:
15502
15747
  case GGML_OP_SQRT:
15503
15748
  case GGML_OP_LOG:
@@ -15530,10 +15775,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15530
15775
  {
15531
15776
  n_tasks = n_threads;
15532
15777
  } break;
15778
+ default:
15779
+ GGML_ASSERT(false);
15533
15780
  }
15534
15781
  break;
15535
15782
  case GGML_OP_SILU_BACK:
15536
15783
  case GGML_OP_MUL:
15784
+ case GGML_OP_DIV:
15537
15785
  case GGML_OP_NORM:
15538
15786
  case GGML_OP_RMS_NORM:
15539
15787
  case GGML_OP_RMS_NORM_BACK:
@@ -15571,6 +15819,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15571
15819
  }
15572
15820
  #endif
15573
15821
  } break;
15822
+ case GGML_OP_MUL_MAT_ID:
15823
+ {
15824
+ // FIXME: blas
15825
+ n_tasks = n_threads;
15826
+ } break;
15574
15827
  case GGML_OP_OUT_PROD:
15575
15828
  {
15576
15829
  n_tasks = n_threads;
@@ -15590,7 +15843,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15590
15843
  } break;
15591
15844
  case GGML_OP_DIAG_MASK_ZERO:
15592
15845
  case GGML_OP_DIAG_MASK_INF:
15593
- case GGML_OP_SOFT_MAX:
15594
15846
  case GGML_OP_SOFT_MAX_BACK:
15595
15847
  case GGML_OP_ROPE:
15596
15848
  case GGML_OP_ROPE_BACK:
@@ -15606,6 +15858,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15606
15858
  {
15607
15859
  n_tasks = 1; //TODO
15608
15860
  } break;
15861
+ case GGML_OP_SOFT_MAX:
15862
+ {
15863
+ n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
15864
+ } break;
15609
15865
  case GGML_OP_CONV_TRANSPOSE_1D:
15610
15866
  {
15611
15867
  n_tasks = n_threads;
@@ -15627,6 +15883,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15627
15883
  {
15628
15884
  n_tasks = n_threads;
15629
15885
  } break;
15886
+ case GGML_OP_ARGSORT:
15887
+ {
15888
+ n_tasks = n_threads;
15889
+ } break;
15630
15890
  case GGML_OP_FLASH_ATTN:
15631
15891
  {
15632
15892
  n_tasks = n_threads;
@@ -15695,7 +15955,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15695
15955
  } break;
15696
15956
  default:
15697
15957
  {
15698
- printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
15958
+ fprintf(stderr, "%s: op not implemented: ", __func__);
15959
+ if (node->op < GGML_OP_COUNT) {
15960
+ fprintf(stderr, "%s\n", ggml_op_name(node->op));
15961
+ } else {
15962
+ fprintf(stderr, "%d\n", node->op);
15963
+ }
15699
15964
  GGML_ASSERT(false);
15700
15965
  } break;
15701
15966
  }
@@ -15836,18 +16101,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15836
16101
 
15837
16102
  // thread scheduling for the different operations + work buffer size estimation
15838
16103
  for (int i = 0; i < cgraph->n_nodes; i++) {
15839
- int n_tasks = 1;
15840
-
15841
16104
  struct ggml_tensor * node = cgraph->nodes[i];
15842
16105
 
16106
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16107
+
15843
16108
  size_t cur = 0;
15844
16109
 
15845
16110
  switch (node->op) {
15846
16111
  case GGML_OP_CPY:
15847
16112
  case GGML_OP_DUP:
15848
16113
  {
15849
- n_tasks = n_threads;
15850
-
15851
16114
  if (ggml_is_quantized(node->type)) {
15852
16115
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
15853
16116
  }
@@ -15855,16 +16118,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15855
16118
  case GGML_OP_ADD:
15856
16119
  case GGML_OP_ADD1:
15857
16120
  {
15858
- n_tasks = n_threads;
15859
-
15860
16121
  if (ggml_is_quantized(node->src[0]->type)) {
15861
16122
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
15862
16123
  }
15863
16124
  } break;
15864
16125
  case GGML_OP_ACC:
15865
16126
  {
15866
- n_tasks = n_threads;
15867
-
15868
16127
  if (ggml_is_quantized(node->src[0]->type)) {
15869
16128
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
15870
16129
  }
@@ -15890,14 +16149,33 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15890
16149
  cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
15891
16150
  }
15892
16151
  } break;
16152
+ case GGML_OP_MUL_MAT_ID:
16153
+ {
16154
+ const struct ggml_tensor * a = node->src[2];
16155
+ const struct ggml_tensor * b = node->src[1];
16156
+ const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
16157
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16158
+ if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
16159
+ if (a->type != GGML_TYPE_F32) {
16160
+ // here we need memory just for single 2D matrix from src0
16161
+ cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
16162
+ }
16163
+ } else
16164
+ #endif
16165
+ if (b->type != vec_dot_type) {
16166
+ cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
16167
+ }
16168
+ } break;
15893
16169
  case GGML_OP_OUT_PROD:
15894
16170
  {
15895
- n_tasks = n_threads;
15896
-
15897
16171
  if (ggml_is_quantized(node->src[0]->type)) {
15898
16172
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
15899
16173
  }
15900
16174
  } break;
16175
+ case GGML_OP_SOFT_MAX:
16176
+ {
16177
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16178
+ } break;
15901
16179
  case GGML_OP_CONV_TRANSPOSE_1D:
15902
16180
  {
15903
16181
  GGML_ASSERT(node->src[0]->ne[3] == 1);
@@ -15923,10 +16201,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15923
16201
  GGML_ASSERT(false);
15924
16202
  }
15925
16203
  } break;
15926
- case GGML_OP_IM2COL:
15927
- {
15928
- n_tasks = n_threads;
15929
- } break;
15930
16204
  case GGML_OP_CONV_TRANSPOSE_2D:
15931
16205
  {
15932
16206
  const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -15943,8 +16217,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15943
16217
  } break;
15944
16218
  case GGML_OP_FLASH_ATTN:
15945
16219
  {
15946
- n_tasks = n_threads;
15947
-
15948
16220
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
15949
16221
 
15950
16222
  if (node->src[1]->type == GGML_TYPE_F32) {
@@ -15957,8 +16229,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15957
16229
  } break;
15958
16230
  case GGML_OP_FLASH_FF:
15959
16231
  {
15960
- n_tasks = n_threads;
15961
-
15962
16232
  if (node->src[1]->type == GGML_TYPE_F32) {
15963
16233
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
15964
16234
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -15969,8 +16239,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15969
16239
  } break;
15970
16240
  case GGML_OP_FLASH_ATTN_BACK:
15971
16241
  {
15972
- n_tasks = n_threads;
15973
-
15974
16242
  const int64_t D = node->src[0]->ne[0];
15975
16243
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
15976
16244
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -15985,8 +16253,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15985
16253
 
15986
16254
  case GGML_OP_CROSS_ENTROPY_LOSS:
15987
16255
  {
15988
- n_tasks = n_threads;
15989
-
15990
16256
  cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
15991
16257
  } break;
15992
16258
  case GGML_OP_COUNT:
@@ -17773,8 +18039,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
17773
18039
  memcpy(&qh, &y[i].qh, sizeof(qh));
17774
18040
 
17775
18041
  for (int j = 0; j < QK5_0; j += 2) {
17776
- const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
17777
- const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
18042
+ const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
18043
+ const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
17778
18044
 
17779
18045
  // cast to 16 bins
17780
18046
  const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17803,8 +18069,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
17803
18069
  memcpy(&qh, &y[i].qh, sizeof(qh));
17804
18070
 
17805
18071
  for (int j = 0; j < QK5_1; j += 2) {
17806
- const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
17807
- const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
18072
+ const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
18073
+ const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
17808
18074
 
17809
18075
  // cast to 16 bins
17810
18076
  const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
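
Both quantize fixes pair each packed nibble with its own high bit: the loop reads qs[j/2], whose low and high nibbles hold elements j/2 and j/2 + 16, so their fifth bits sit at qh positions j/2 and j/2 + 16; the previous j and j + 16 indices picked up the wrong bits when filling the 16-bin histogram.
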
@@ -17994,6 +18260,7 @@ struct gguf_kv {
17994
18260
 
17995
18261
  struct gguf_header {
17996
18262
  char magic[4];
18263
+
17997
18264
  uint32_t version;
17998
18265
  uint64_t n_tensors; // GGUFv2
17999
18266
  uint64_t n_kv; // GGUFv2
@@ -18083,7 +18350,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18083
18350
 
18084
18351
  for (uint32_t i = 0; i < sizeof(magic); i++) {
18085
18352
  if (magic[i] != GGUF_MAGIC[i]) {
18086
- fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
18353
+ fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
18087
18354
  fclose(file);
18088
18355
  return NULL;
18089
18356
  }
@@ -18098,7 +18365,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18098
18365
  {
18099
18366
  strncpy(ctx->header.magic, magic, 4);
18100
18367
 
18101
-
18102
18368
  ctx->kv = NULL;
18103
18369
  ctx->infos = NULL;
18104
18370
  ctx->data = NULL;