llama_cpp 0.10.0 → 0.10.1

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
1
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
2
2
  #define _USE_MATH_DEFINES // For M_PI on MSVC
3
3
 
4
4
  #include "ggml-impl.h"
@@ -33,7 +33,7 @@
33
33
  // we should just be careful :)
34
34
  #pragma warning(disable: 4244 4267)
35
35
 
36
- // disable POSIX deprecation warnigns
36
+ // disable POSIX deprecation warnings
37
37
  // these functions are never going away, anyway
38
38
  #pragma warning(disable: 4996)
39
39
  #endif
@@ -1395,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1395
1395
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1396
1396
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1397
1397
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1398
- inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
1398
+ inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
1399
1399
 
1400
1400
  static const float GELU_COEF_A = 0.044715f;
1401
1401
  static const float GELU_QUICK_COEF = -1.702f;
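The reworked kernel above folds the slope into one expression instead of hard-coding 0.1f. For reference, a minimal standalone sketch of the same elementwise formula, y = max(x, 0) + ns * min(x, 0); the helper name below is illustrative and not part of ggml:

    #include <stdio.h>

    // mirrors the loop body of the new ggml_vec_leaky_relu_f32
    static inline float leaky_relu_scalar(float x, float ns) {
        return ((x > 0.0f) ? x : 0.0f) + ns * ((x < 0.0f) ? x : 0.0f);
    }

    int main(void) {
        // prints 2.000000 -0.200000
        printf("%f %f\n", leaky_relu_scalar(2.0f, 0.1f), leaky_relu_scalar(-2.0f, 0.1f));
        return 0;
    }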
@@ -1623,7 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1623
1623
  "POOL_1D",
1624
1624
  "POOL_2D",
1625
1625
  "UPSCALE",
1626
+ "PAD",
1626
1627
  "ARGSORT",
1628
+ "LEAKY_RELU",
1627
1629
 
1628
1630
  "FLASH_ATTN",
1629
1631
  "FLASH_FF",
@@ -1650,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1650
1652
  "CROSS_ENTROPY_LOSS_BACK",
1651
1653
  };
1652
1654
 
1653
- static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
1655
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1654
1656
 
1655
1657
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1656
1658
  "none",
@@ -1707,7 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1707
1709
  "pool_1d(x)",
1708
1710
  "pool_2d(x)",
1709
1711
  "upscale(x)",
1712
+ "pad(x)",
1710
1713
  "argsort(x)",
1714
+ "leaky_relu(x)",
1711
1715
 
1712
1716
  "flash_attn(x)",
1713
1717
  "flash_ff(x)",
@@ -1734,7 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1734
1738
  "cross_entropy_loss_back(x,y)",
1735
1739
  };
1736
1740
 
1737
- static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
1741
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1738
1742
 
1739
1743
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1740
1744
 
@@ -1750,17 +1754,16 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1750
1754
  "GELU",
1751
1755
  "GELU_QUICK",
1752
1756
  "SILU",
1753
- "LEAKY",
1754
1757
  };
1755
1758
 
1756
- static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
1759
+ static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
1757
1760
 
1758
1761
 
1759
1762
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
1760
1763
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
1761
1764
 
1762
1765
  // WARN:
1763
- // Mis-confguration can lead to problem that's hard to reason about:
1766
+ // Mis-configuration can lead to problem that's hard to reason about:
1764
1767
  // * At best it crash or talks nosense.
1765
1768
  // * At worst it talks slightly difference but hard to perceive.
1766
1769
  //
@@ -1994,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
1994
1997
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
1995
1998
  }
1996
1999
 
1997
- size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
1998
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1999
-
2000
- return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
2001
- }
2002
-
2003
2000
  int ggml_blck_size(enum ggml_type type) {
2004
2001
  return type_traits[type].blck_size;
2005
2002
  }
@@ -2008,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
2008
2005
  return type_traits[type].type_size;
2009
2006
  }
2010
2007
 
2011
- float ggml_type_sizef(enum ggml_type type) {
2012
- return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
2008
+ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2009
+ assert(ne % ggml_blck_size(type) == 0);
2010
+ return ggml_type_size(type)*ne/ggml_blck_size(type);
2011
+ }
2012
+
2013
+ double ggml_type_sizef(enum ggml_type type) {
2014
+ return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
2013
2015
  }
2014
2016
 
2015
2017
  const char * ggml_type_name(enum ggml_type type) {
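ggml_row_size replaces the ad-hoc ne*ggml_type_size(t)/ggml_blck_size(t) arithmetic that several later hunks also clean up, and ggml_type_sizef now returns a double. A minimal usage sketch; GGML_TYPE_Q4_0 is only an example type, any type works as long as ne is a multiple of its block size:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        // bytes required for one row of 4096 elements of the chosen type
        const size_t row_bytes = ggml_row_size(GGML_TYPE_Q4_0, 4096);
        // average bytes per element, now returned as a double
        const double bpe = ggml_type_sizef(GGML_TYPE_Q4_0);
        printf("row: %zu bytes (%.3f bytes/element)\n", row_bytes, bpe);
        return 0;
    }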
@@ -2046,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
2046
2048
  return ggml_type_size(tensor->type);
2047
2049
  }
2048
2050
 
2049
- static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2051
+ bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2050
2052
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2051
2053
 
2052
2054
  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2053
2055
  }
2054
2056
 
2055
- static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
2057
+ bool ggml_is_vector(const struct ggml_tensor * tensor) {
2056
2058
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2057
2059
 
2058
2060
  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2059
2061
  }
2060
2062
 
2061
- static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2063
+ bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2062
2064
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2063
2065
 
2064
2066
  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
2065
2067
  }
2066
2068
 
2069
+ bool ggml_is_3d(const struct ggml_tensor * tensor) {
2070
+ return tensor->ne[3] == 1;
2071
+ }
2072
+
2073
+ int ggml_n_dims(const struct ggml_tensor * tensor) {
2074
+ for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
2075
+ if (tensor->ne[i] > 1) {
2076
+ return i + 1;
2077
+ }
2078
+ }
2079
+ return 1;
2080
+ }
2081
+
2067
2082
  static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
2068
2083
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2069
2084
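With the per-tensor n_dims field removed (see the ggml_new_tensor_impl hunk below), dimensionality is now derived on demand with the new helpers, and ggml_is_scalar/ggml_is_vector/ggml_is_matrix are no longer static. A build-only sketch, assuming a small scratch context:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 2);
        // trailing dimensions of size 1 are ignored, so this prints: n_dims=3 is_3d=1 is_matrix=0
        printf("n_dims=%d is_3d=%d is_matrix=%d\n", ggml_n_dims(t), ggml_is_3d(t), ggml_is_matrix(t));

        ggml_free(ctx);
        return 0;
    }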
 
@@ -2470,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2470
2485
  view_src = view_src->view_src;
2471
2486
  }
2472
2487
 
2473
- size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
2488
+ size_t data_size = ggml_row_size(type, ne[0]);
2474
2489
  for (int i = 1; i < n_dims; i++) {
2475
2490
  data_size *= ne[i];
2476
2491
  }
@@ -2513,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2513
2528
  /*.type =*/ type,
2514
2529
  /*.backend =*/ GGML_BACKEND_CPU,
2515
2530
  /*.buffer =*/ NULL,
2516
- /*.n_dims =*/ n_dims,
2517
2531
  /*.ne =*/ { 1, 1, 1, 1 },
2518
2532
  /*.nb =*/ { 0, 0, 0, 0 },
2519
2533
  /*.op =*/ GGML_OP_NONE,
@@ -2620,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
2620
2634
  }
2621
2635
 
2622
2636
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
2623
- return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
2637
+ return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
2624
2638
  }
2625
2639
 
2626
2640
  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3069,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
3069
3083
  struct ggml_tensor * ggml_view_tensor(
3070
3084
  struct ggml_context * ctx,
3071
3085
  struct ggml_tensor * src) {
3072
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
3086
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
3073
3087
  ggml_format_name(result, "%s (view)", src->name);
3074
3088
 
3075
3089
  for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3227,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
3227
3241
  is_node = true;
3228
3242
  }
3229
3243
 
3230
- struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
3244
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
3231
3245
 
3232
3246
  result->op = GGML_OP_ADD;
3233
- result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
3247
+ result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
3234
3248
  result->src[0] = a;
3235
3249
  result->src[1] = b;
3236
3250
 
@@ -3599,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows(
3599
3613
  is_node = true;
3600
3614
  }
3601
3615
 
3602
- int64_t ne[4] = {1,1,1,1};
3603
- for (int i=1; i<a->n_dims; ++i) {
3616
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
3617
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3604
3618
  ne[i] = a->ne[i];
3605
3619
  }
3606
3620
 
3607
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
3621
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
3608
3622
 
3609
3623
  result->op = GGML_OP_SUM_ROWS;
3610
3624
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3625,8 +3639,8 @@ struct ggml_tensor * ggml_mean(
3625
3639
  is_node = true;
3626
3640
  }
3627
3641
 
3628
- int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3629
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
3642
+ int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3643
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3630
3644
 
3631
3645
  result->op = GGML_OP_MEAN;
3632
3646
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3648,8 +3662,7 @@ struct ggml_tensor * ggml_argmax(
3648
3662
  is_node = true;
3649
3663
  }
3650
3664
 
3651
- int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
3652
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
3665
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
3653
3666
 
3654
3667
  result->op = GGML_OP_ARGMAX;
3655
3668
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3672,7 +3685,7 @@ struct ggml_tensor * ggml_repeat(
3672
3685
  is_node = true;
3673
3686
  }
3674
3687
 
3675
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3688
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3676
3689
 
3677
3690
  result->op = GGML_OP_REPEAT;
3678
3691
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3699,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back(
3699
3712
  return a;
3700
3713
  }
3701
3714
 
3702
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3715
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3703
3716
 
3704
3717
  result->op = GGML_OP_REPEAT_BACK;
3705
3718
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3830,12 +3843,25 @@ struct ggml_tensor * ggml_relu_inplace(
3830
3843
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3831
3844
  }
3832
3845
 
3833
- // ggml_leaky
3846
+ // ggml_leaky_relu
3834
3847
 
3835
- struct ggml_tensor * ggml_leaky(
3848
+ struct ggml_tensor * ggml_leaky_relu(
3836
3849
  struct ggml_context * ctx,
3837
- struct ggml_tensor * a) {
3838
- return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3850
+ struct ggml_tensor * a, float negative_slope, bool inplace) {
3851
+ bool is_node = false;
3852
+
3853
+ if (!inplace && (a->grad)) {
3854
+ is_node = true;
3855
+ }
3856
+
3857
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3858
+ ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
3859
+
3860
+ result->op = GGML_OP_LEAKY_RELU;
3861
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
3862
+ result->src[0] = a;
3863
+
3864
+ return result;
3839
3865
  }
3840
3866
 
3841
3867
  // ggml_gelu
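The old GGML_UNARY_OP_LEAKY path is gone; the slope is now an explicit parameter and the op gets its own GGML_OP_LEAKY_RELU entry (see the op tables near the top of this diff). A build-only sketch of the new call, using the 0.1f slope the old kernel hard-coded; the graph node is constructed but not computed:

    #include "ggml.h"
    #include <stdbool.h>

    int main(void) {
        struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
        // negative_slope = 0.1f, inplace = false
        struct ggml_tensor * y = ggml_leaky_relu(ctx, x, 0.1f, false);
        (void) y;

        ggml_free(ctx);
        return 0;
    }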
@@ -4022,8 +4048,9 @@ static struct ggml_tensor * ggml_group_norm_impl(
4022
4048
 
4023
4049
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4024
4050
 
4025
- result->op = GGML_OP_GROUP_NORM;
4026
4051
  result->op_params[0] = n_groups;
4052
+
4053
+ result->op = GGML_OP_GROUP_NORM;
4027
4054
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4028
4055
  result->src[0] = a;
4029
4056
  result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4061,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat(
4061
4088
  }
4062
4089
 
4063
4090
  const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4064
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4091
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4065
4092
 
4066
4093
  result->op = GGML_OP_MUL_MAT;
4067
4094
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4075,17 +4102,18 @@ struct ggml_tensor * ggml_mul_mat(
4075
4102
 
4076
4103
  struct ggml_tensor * ggml_mul_mat_id(
4077
4104
  struct ggml_context * ctx,
4078
- struct ggml_tensor * as[],
4105
+ struct ggml_tensor * const as[],
4106
+ int n_as,
4079
4107
  struct ggml_tensor * ids,
4080
4108
  int id,
4081
4109
  struct ggml_tensor * b) {
4082
4110
 
4083
- int64_t n_as = ids->ne[0];
4084
-
4085
4111
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
4086
- GGML_ASSERT(ggml_is_vector(ids));
4112
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
4113
+ GGML_ASSERT(ids->ne[1] == b->ne[1]);
4114
+ GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
4087
4115
  GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
4088
- GGML_ASSERT(id >= 0 && id < n_as);
4116
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]);
4089
4117
 
4090
4118
  bool is_node = false;
4091
4119
 
@@ -4094,16 +4122,17 @@ struct ggml_tensor * ggml_mul_mat_id(
4094
4122
  }
4095
4123
 
4096
4124
  const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4097
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
4125
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4098
4126
 
4099
4127
  ggml_set_op_params_i32(result, 0, id);
4128
+ ggml_set_op_params_i32(result, 1, n_as);
4100
4129
 
4101
4130
  result->op = GGML_OP_MUL_MAT_ID;
4102
4131
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4103
4132
  result->src[0] = ids;
4104
4133
  result->src[1] = b;
4105
4134
 
4106
- for (int64_t i = 0; i < n_as; i++) {
4135
+ for (int i = 0; i < n_as; i++) {
4107
4136
  struct ggml_tensor * a = as[i];
4108
4137
  GGML_ASSERT(ggml_are_same_shape(as[0], a));
4109
4138
  GGML_ASSERT(ggml_can_mul_mat(a, b));
@@ -4131,7 +4160,7 @@ struct ggml_tensor * ggml_out_prod(
4131
4160
 
4132
4161
  // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
4133
4162
  const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
4134
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4163
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4135
4164
 
4136
4165
  result->op = GGML_OP_OUT_PROD;
4137
4166
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4416,7 +4445,7 @@ struct ggml_tensor * ggml_reshape(
4416
4445
  //GGML_ASSERT(false);
4417
4446
  }
4418
4447
 
4419
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
4448
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
4420
4449
  ggml_format_name(result, "%s (reshaped)", a->name);
4421
4450
 
4422
4451
  result->op = GGML_OP_RESHAPE;
@@ -4731,7 +4760,9 @@ struct ggml_tensor * ggml_get_rows(
4731
4760
  struct ggml_context * ctx,
4732
4761
  struct ggml_tensor * a,
4733
4762
  struct ggml_tensor * b) {
4734
- GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
4763
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
4764
+ GGML_ASSERT(b->ne[3] == 1);
4765
+ GGML_ASSERT(b->type == GGML_TYPE_I32);
4735
4766
 
4736
4767
  bool is_node = false;
4737
4768
 
@@ -4741,7 +4772,7 @@ struct ggml_tensor * ggml_get_rows(
4741
4772
 
4742
4773
  // TODO: implement non F32 return
4743
4774
  //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
4744
- struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
4775
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
4745
4776
 
4746
4777
  result->op = GGML_OP_GET_ROWS;
4747
4778
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
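ggml_get_rows no longer insists on a matrix table and a 1-D index vector: b can carry batched I32 indices, a->ne[2] has to match b->ne[1], and the result is allocated as a 4-D F32 tensor. A build-only sketch:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        // table: 8 rows of 16 elements, with 2 "batches" along ne[2]
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 8, 2);
        // 4 row indices per batch; b->ne[1] matches a->ne[2]
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 4, 2);

        struct ggml_tensor * rows = ggml_get_rows(ctx, a, b); // F32, shape 16 x 4 x 2
        (void) rows;

        ggml_free(ctx);
        return 0;
    }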
@@ -4792,7 +4823,7 @@ struct ggml_tensor * ggml_diag(
4792
4823
  }
4793
4824
 
4794
4825
  const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
4795
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
4826
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
4796
4827
 
4797
4828
  result->op = GGML_OP_DIAG;
4798
4829
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5439,7 +5470,7 @@ struct ggml_tensor * ggml_pool_1d(
5439
5470
  is_node = true;
5440
5471
  }
5441
5472
 
5442
- const int64_t ne[3] = {
5473
+ const int64_t ne[2] = {
5443
5474
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5444
5475
  a->ne[1],
5445
5476
  };
@@ -5519,6 +5550,30 @@ static struct ggml_tensor * ggml_upscale_impl(
5519
5550
  return result;
5520
5551
  }
5521
5552
 
5553
+ struct ggml_tensor * ggml_pad(
5554
+ struct ggml_context * ctx,
5555
+ struct ggml_tensor * a,
5556
+ int p0, int p1, int p2, int p3) {
5557
+ bool is_node = false;
5558
+
5559
+ if (a->grad) {
5560
+ GGML_ASSERT(false); // TODO: implement backward
5561
+ is_node = true;
5562
+ }
5563
+
5564
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
5565
+ a->ne[0] + p0,
5566
+ a->ne[1] + p1,
5567
+ a->ne[2] + p2,
5568
+ a->ne[3] + p3);
5569
+
5570
+ result->op = GGML_OP_PAD;
5571
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5572
+ result->src[0] = a;
5573
+
5574
+ return result;
5575
+ }
5576
+
5522
5577
  struct ggml_tensor * ggml_upscale(
5523
5578
  struct ggml_context * ctx,
5524
5579
  struct ggml_tensor * a,
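The new GGML_OP_PAD op appends p0..p3 zeros at the end of each dimension; the CPU kernel later in this diff handles F32 only and the backward pass is still a TODO. A build-only sketch:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 30, 30);
        // pad two trailing zero columns and two trailing zero rows -> 32 x 32
        struct ggml_tensor * p = ggml_pad(ctx, a, 2, 2, 0, 0);
        (void) p;

        ggml_free(ctx);
        return 0;
    }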
@@ -5534,7 +5589,7 @@ struct ggml_tensor * ggml_argsort(
5534
5589
  enum ggml_sort_order order) {
5535
5590
  bool is_node = false;
5536
5591
 
5537
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
5592
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
5538
5593
 
5539
5594
  ggml_set_op_params_i32(result, 0, (int32_t) order);
5540
5595
 
@@ -5581,7 +5636,7 @@ struct ggml_tensor * ggml_flash_attn(
5581
5636
  }
5582
5637
 
5583
5638
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
5584
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
5639
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
5585
5640
 
5586
5641
  int32_t t = masked ? 1 : 0;
5587
5642
  ggml_set_op_params(result, &t, sizeof(t));
@@ -5614,7 +5669,7 @@ struct ggml_tensor * ggml_flash_ff(
5614
5669
  }
5615
5670
 
5616
5671
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5617
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
5672
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
5618
5673
 
5619
5674
  result->op = GGML_OP_FLASH_FF;
5620
5675
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5730,7 +5785,6 @@ struct ggml_tensor * ggml_win_part(
5730
5785
  const int np = npx*npy;
5731
5786
 
5732
5787
  const int64_t ne[4] = { a->ne[0], w, w, np, };
5733
-
5734
5788
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5735
5789
 
5736
5790
  int32_t params[] = { npx, npy, w };
@@ -7520,7 +7574,7 @@ static void ggml_compute_forward_acc_f32(
7520
7574
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
7521
7575
 
7522
7576
  // view src0 and dst with these strides and data offset inbytes during acc
7523
- // nb0 is implicitely element_size because src0 and dst are contiguous
7577
+ // nb0 is implicitly element_size because src0 and dst are contiguous
7524
7578
  size_t nb1 = ((int32_t *) dst->op_params)[0];
7525
7579
  size_t nb2 = ((int32_t *) dst->op_params)[1];
7526
7580
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7716,6 +7770,8 @@ static void ggml_compute_forward_mul_f32(
7716
7770
 
7717
7771
  #ifdef GGML_USE_CLBLAST
7718
7772
  if (src1->backend == GGML_BACKEND_GPU) {
7773
+ // TODO: OpenCL kernel support full broadcast
7774
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7719
7775
  if (ith == 0) {
7720
7776
  ggml_cl_mul(src0, src1, dst);
7721
7777
  }
@@ -8981,10 +9037,9 @@ static void ggml_compute_forward_silu(
8981
9037
  } break;
8982
9038
  }
8983
9039
  }
9040
+ // ggml_compute_forward_leaky_relu
8984
9041
 
8985
- // ggml_compute_forward_leaky
8986
-
8987
- static void ggml_compute_forward_leaky_f32(
9042
+ static void ggml_compute_forward_leaky_relu_f32(
8988
9043
  const struct ggml_compute_params * params,
8989
9044
  const struct ggml_tensor * src0,
8990
9045
  struct ggml_tensor * dst) {
@@ -8998,24 +9053,27 @@ static void ggml_compute_forward_leaky_f32(
8998
9053
  const int n = ggml_nrows(src0);
8999
9054
  const int nc = src0->ne[0];
9000
9055
 
9056
+ float negative_slope;
9057
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
9058
+
9001
9059
  assert(dst->nb[0] == sizeof(float));
9002
9060
  assert(src0->nb[0] == sizeof(float));
9003
9061
 
9004
9062
  for (int i = 0; i < n; i++) {
9005
- ggml_vec_leaky_f32(nc,
9063
+ ggml_vec_leaky_relu_f32(nc,
9006
9064
  (float *) ((char *) dst->data + i*( dst->nb[1])),
9007
- (float *) ((char *) src0->data + i*(src0->nb[1])));
9065
+ (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
9008
9066
  }
9009
9067
  }
9010
9068
 
9011
- static void ggml_compute_forward_leaky(
9069
+ static void ggml_compute_forward_leaky_relu(
9012
9070
  const struct ggml_compute_params * params,
9013
9071
  const struct ggml_tensor * src0,
9014
9072
  struct ggml_tensor * dst) {
9015
9073
  switch (src0->type) {
9016
9074
  case GGML_TYPE_F32:
9017
9075
  {
9018
- ggml_compute_forward_leaky_f32(params, src0, dst);
9076
+ ggml_compute_forward_leaky_relu_f32(params, src0, dst);
9019
9077
  } break;
9020
9078
  default:
9021
9079
  {
@@ -9504,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9504
9562
  const int64_t ne0 = dst->ne[0];
9505
9563
  const int64_t ne1 = dst->ne[1];
9506
9564
 
9565
+ // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
9566
+ // all the experts for each batch element and the processing would become incredibly slow
9507
9567
  // TODO: find the optimal values for these
9508
- if (ggml_is_contiguous(src0) &&
9568
+ if (dst->op != GGML_OP_MUL_MAT_ID &&
9569
+ ggml_is_contiguous(src0) &&
9509
9570
  ggml_is_contiguous(src1) &&
9510
9571
  //src0->type == GGML_TYPE_F32 &&
9511
9572
  src1->type == GGML_TYPE_F32 &&
@@ -9519,11 +9580,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9519
9580
  }
9520
9581
  #endif
9521
9582
 
9583
+ // off1 = offset in i11 and i1
9584
+ // cne1 = ne11 and ne1
9585
+ // in a normal matrix multiplication, off1 = 0 and cne1 = ne1
9586
+ // during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
9522
9587
  static void ggml_compute_forward_mul_mat(
9523
9588
  const struct ggml_compute_params * params,
9524
9589
  const struct ggml_tensor * src0,
9525
9590
  const struct ggml_tensor * src1,
9526
- struct ggml_tensor * dst) {
9591
+ struct ggml_tensor * dst,
9592
+ int64_t off1, int64_t cne1) {
9527
9593
  int64_t t0 = ggml_perf_time_us();
9528
9594
  UNUSED(t0);
9529
9595
 
@@ -9591,10 +9657,9 @@ static void ggml_compute_forward_mul_mat(
9591
9657
  const int64_t i03 = i13/r3;
9592
9658
  const int64_t i02 = i12/r2;
9593
9659
 
9594
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9595
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
9596
-
9597
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9660
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9661
+ const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
9662
+ float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
9598
9663
 
9599
9664
  if (type != GGML_TYPE_F32) {
9600
9665
  float * const wdata = params->wdata;
@@ -9611,10 +9676,10 @@ static void ggml_compute_forward_mul_mat(
9611
9676
  }
9612
9677
 
9613
9678
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9614
- ne11, ne01, ne10,
9615
- 1.0f, y, ne10,
9616
- x, ne00,
9617
- 0.0f, d, ne01);
9679
+ cne1, ne01, ne10,
9680
+ 1.0f, y, ne10,
9681
+ x, ne00,
9682
+ 0.0f, d, ne01);
9618
9683
  }
9619
9684
  }
9620
9685
 
@@ -9627,9 +9692,10 @@ static void ggml_compute_forward_mul_mat(
9627
9692
  if (params->type == GGML_TASK_INIT) {
9628
9693
  if (src1->type != vec_dot_type) {
9629
9694
  char * wdata = params->wdata;
9630
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9695
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9631
9696
 
9632
9697
  assert(params->wsize >= ne11*ne12*ne13*row_size);
9698
+ assert(src1->type == GGML_TYPE_F32);
9633
9699
 
9634
9700
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
9635
9701
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9649,10 +9715,10 @@ static void ggml_compute_forward_mul_mat(
9649
9715
  }
9650
9716
 
9651
9717
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9652
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9718
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9653
9719
 
9654
9720
  const int64_t nr0 = ne01; // src0 rows
9655
- const int64_t nr1 = ne11*ne12*ne13; // src1 rows
9721
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
9656
9722
 
9657
9723
  //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9658
9724
 
@@ -9694,9 +9760,9 @@ static void ggml_compute_forward_mul_mat(
9694
9760
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9695
9761
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9696
9762
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9697
- const int64_t i13 = (ir1/(ne12*ne11));
9698
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
9699
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
9763
+ const int64_t i13 = (ir1/(ne12*cne1));
9764
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
9765
+ const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
9700
9766
 
9701
9767
  // broadcast src0 into src1
9702
9768
  const int64_t i03 = i13/r3;
@@ -9736,20 +9802,28 @@ static void ggml_compute_forward_mul_mat(
9736
9802
 
9737
9803
  static void ggml_compute_forward_mul_mat_id(
9738
9804
  const struct ggml_compute_params * params,
9805
+ const struct ggml_tensor * src0,
9806
+ const struct ggml_tensor * src1,
9739
9807
  struct ggml_tensor * dst) {
9740
9808
 
9741
- const struct ggml_tensor * ids = dst->src[0];
9742
- const struct ggml_tensor * src1 = dst->src[1];
9743
-
9744
- const int id = ggml_get_op_params_i32(dst, 0);
9809
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9810
+ // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
9811
+ ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
9812
+ return;
9813
+ }
9745
9814
 
9746
- const int a_id = ((int32_t *)ids->data)[id];
9815
+ const struct ggml_tensor * ids = src0;
9816
+ const int id = ggml_get_op_params_i32(dst, 0);
9817
+ const int n_as = ggml_get_op_params_i32(dst, 1);
9747
9818
 
9748
- GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
9819
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
9820
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
9749
9821
 
9750
- const struct ggml_tensor * src0 = dst->src[a_id + 2];
9822
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
9751
9823
 
9752
- ggml_compute_forward_mul_mat(params, src0, src1, dst);
9824
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
9825
+ ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
9826
+ }
9753
9827
  }
9754
9828
 
9755
9829
  // ggml_compute_forward_out_prod
@@ -10161,7 +10235,7 @@ static void ggml_compute_forward_set_f32(
10161
10235
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
10162
10236
 
10163
10237
  // view src0 and dst with these strides and data offset inbytes during set
10164
- // nb0 is implicitely element_size because src0 and dst are contiguous
10238
+ // nb0 is implicitly element_size because src0 and dst are contiguous
10165
10239
  size_t nb1 = ((int32_t *) dst->op_params)[0];
10166
10240
  size_t nb2 = ((int32_t *) dst->op_params)[1];
10167
10241
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -10325,21 +10399,30 @@ static void ggml_compute_forward_get_rows_q(
10325
10399
  return;
10326
10400
  }
10327
10401
 
10328
- const int nc = src0->ne[0];
10329
- const int nr = ggml_nelements(src1);
10402
+ GGML_TENSOR_BINARY_OP_LOCALS
10403
+
10404
+ const int64_t nc = ne00;
10405
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10406
+
10330
10407
  const enum ggml_type type = src0->type;
10331
10408
  ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
10332
10409
 
10333
- assert( dst->ne[0] == nc);
10334
- assert( dst->ne[1] == nr);
10335
- assert(src0->nb[0] == ggml_type_size(type));
10410
+ assert(ne0 == nc);
10411
+ assert(ne02 == ne11);
10412
+ assert(nb00 == ggml_type_size(type));
10413
+ assert(ggml_nrows(dst) == nr);
10336
10414
 
10337
- for (int i = 0; i < nr; ++i) {
10338
- const int r = ((int32_t *) src1->data)[i];
10415
+ // TODO: multi-thread
10416
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10417
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10418
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10419
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10339
10420
 
10340
- dequantize_row_q(
10341
- (const void *) ((char *) src0->data + r*src0->nb[1]),
10342
- (float *) ((char *) dst->data + i*dst->nb[1]), nc);
10421
+ dequantize_row_q(
10422
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10423
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10424
+ }
10425
+ }
10343
10426
  }
10344
10427
  }
10345
10428
 
@@ -10354,19 +10437,26 @@ static void ggml_compute_forward_get_rows_f16(
10354
10437
  return;
10355
10438
  }
10356
10439
 
10357
- const int nc = src0->ne[0];
10358
- const int nr = ggml_nelements(src1);
10440
+ GGML_TENSOR_BINARY_OP_LOCALS
10359
10441
 
10360
- assert( dst->ne[0] == nc);
10361
- assert( dst->ne[1] == nr);
10362
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
10442
+ const int64_t nc = ne00;
10443
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10363
10444
 
10364
- for (int i = 0; i < nr; ++i) {
10365
- const int r = ((int32_t *) src1->data)[i];
10445
+ assert(ne0 == nc);
10446
+ assert(ne02 == ne11);
10447
+ assert(nb00 == sizeof(ggml_fp16_t));
10448
+ assert(ggml_nrows(dst) == nr);
10366
10449
 
10367
- for (int j = 0; j < nc; ++j) {
10368
- ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
10369
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
10450
+ // TODO: multi-thread
10451
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10452
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10453
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10454
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10455
+
10456
+ ggml_fp16_to_fp32_row(
10457
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10458
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10459
+ }
10370
10460
  }
10371
10461
  }
10372
10462
  }
@@ -10382,19 +10472,27 @@ static void ggml_compute_forward_get_rows_f32(
10382
10472
  return;
10383
10473
  }
10384
10474
 
10385
- const int nc = src0->ne[0];
10386
- const int nr = ggml_nelements(src1);
10475
+ GGML_TENSOR_BINARY_OP_LOCALS
10387
10476
 
10388
- assert( dst->ne[0] == nc);
10389
- assert( dst->ne[1] == nr);
10390
- assert(src0->nb[0] == sizeof(float));
10477
+ const int64_t nc = ne00;
10478
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10391
10479
 
10392
- for (int i = 0; i < nr; ++i) {
10393
- const int r = ((int32_t *) src1->data)[i];
10480
+ assert(ne0 == nc);
10481
+ assert(ne02 == ne11);
10482
+ assert(nb00 == sizeof(float));
10483
+ assert(ggml_nrows(dst) == nr);
10394
10484
 
10395
- ggml_vec_cpy_f32(nc,
10396
- (float *) ((char *) dst->data + i*dst->nb[1]),
10397
- (float *) ((char *) src0->data + r*src0->nb[1]));
10485
+ // TODO: multi-thread
10486
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10487
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10488
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10489
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10490
+
10491
+ ggml_vec_cpy_f32(nc,
10492
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
10493
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
10494
+ }
10495
+ }
10398
10496
  }
10399
10497
  }
10400
10498
 
@@ -12114,6 +12212,7 @@ static void ggml_compute_forward_upscale_f32(
12114
12212
  GGML_ASSERT(src0->nb[0] == sizeof(float));
12115
12213
 
12116
12214
  const int ith = params->ith;
12215
+ const int nth = params->nth;
12117
12216
 
12118
12217
  GGML_TENSOR_UNARY_OP_LOCALS
12119
12218
 
@@ -12121,16 +12220,17 @@ static void ggml_compute_forward_upscale_f32(
12121
12220
 
12122
12221
  // TODO: optimize
12123
12222
 
12124
- for (int i03 = 0; i03 < ne03; i03++) {
12125
- for (int i02 = ith; i02 < ne02; i02++) {
12126
- for (int m = 0; m < dst->ne[1]; m++) {
12127
- int i01 = m / scale_factor;
12128
- for (int n = 0; n < dst->ne[0]; n++) {
12129
- int i00 = n / scale_factor;
12130
-
12131
- const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
12223
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
12224
+ const int64_t i03 = i3;
12225
+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
12226
+ const int64_t i02 = i2;
12227
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
12228
+ const int64_t i01 = i1 / scale_factor;
12229
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
12230
+ const int64_t i00 = i0 / scale_factor;
12132
12231
 
12133
- float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
12232
+ const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
12233
+ float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
12134
12234
 
12135
12235
  *y = *x;
12136
12236
  }
@@ -12155,6 +12255,64 @@ static void ggml_compute_forward_upscale(
12155
12255
  }
12156
12256
  }
12157
12257
 
12258
+ // ggml_compute_forward_pad
12259
+
12260
+ static void ggml_compute_forward_pad_f32(
12261
+ const struct ggml_compute_params * params,
12262
+ const struct ggml_tensor * src0,
12263
+ struct ggml_tensor * dst) {
12264
+
12265
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12266
+ return;
12267
+ }
12268
+
12269
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
12270
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
12271
+
12272
+ const int ith = params->ith;
12273
+ const int nth = params->nth;
12274
+
12275
+ GGML_TENSOR_UNARY_OP_LOCALS
12276
+
12277
+ float * dst_ptr = (float *) dst->data;
12278
+
12279
+ // TODO: optimize
12280
+
12281
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
12282
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
12283
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
12284
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
12285
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
12286
+
12287
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12288
+
12289
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
12290
+ dst_ptr[dst_idx] = *src_ptr;
12291
+ } else {
12292
+ dst_ptr[dst_idx] = 0;
12293
+ }
12294
+ }
12295
+ }
12296
+ }
12297
+ }
12298
+ }
12299
+
12300
+ static void ggml_compute_forward_pad(
12301
+ const struct ggml_compute_params * params,
12302
+ const struct ggml_tensor * src0,
12303
+ struct ggml_tensor * dst) {
12304
+ switch (src0->type) {
12305
+ case GGML_TYPE_F32:
12306
+ {
12307
+ ggml_compute_forward_pad_f32(params, src0, dst);
12308
+ } break;
12309
+ default:
12310
+ {
12311
+ GGML_ASSERT(false);
12312
+ } break;
12313
+ }
12314
+ }
12315
+
12158
12316
  // ggml_compute_forward_argsort
12159
12317
 
12160
12318
  static void ggml_compute_forward_argsort_f32(
@@ -13362,10 +13520,6 @@ static void ggml_compute_forward_unary(
13362
13520
  {
13363
13521
  ggml_compute_forward_silu(params, src0, dst);
13364
13522
  } break;
13365
- case GGML_UNARY_OP_LEAKY:
13366
- {
13367
- ggml_compute_forward_leaky(params, src0, dst);
13368
- } break;
13369
13523
  default:
13370
13524
  {
13371
13525
  GGML_ASSERT(false);
@@ -14037,11 +14191,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14037
14191
  } break;
14038
14192
  case GGML_OP_MUL_MAT:
14039
14193
  {
14040
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
14194
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
14041
14195
  } break;
14042
14196
  case GGML_OP_MUL_MAT_ID:
14043
14197
  {
14044
- ggml_compute_forward_mul_mat_id(params, tensor);
14198
+ ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
14045
14199
  } break;
14046
14200
  case GGML_OP_OUT_PROD:
14047
14201
  {
@@ -14147,10 +14301,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14147
14301
  {
14148
14302
  ggml_compute_forward_upscale(params, tensor->src[0], tensor);
14149
14303
  } break;
14304
+ case GGML_OP_PAD:
14305
+ {
14306
+ ggml_compute_forward_pad(params, tensor->src[0], tensor);
14307
+ } break;
14150
14308
  case GGML_OP_ARGSORT:
14151
14309
  {
14152
14310
  ggml_compute_forward_argsort(params, tensor->src[0], tensor);
14153
14311
  } break;
14312
+ case GGML_OP_LEAKY_RELU:
14313
+ {
14314
+ ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
14315
+ } break;
14154
14316
  case GGML_OP_FLASH_ATTN:
14155
14317
  {
14156
14318
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14405,7 +14567,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14405
14567
  return replacements->vals[i];
14406
14568
  }
14407
14569
 
14408
- struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14570
+ struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
14409
14571
 
14410
14572
  // insert clone into replacements
14411
14573
  GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@@ -14475,7 +14637,7 @@ void ggml_build_backward_gradient_checkpointing(
14475
14637
  // insert new tensors recomputing src, reusing already made replacements,
14476
14638
  // remember replacements: remember new tensors with mapping from corresponding gf nodes
14477
14639
  // recurse for input tensors,
14478
- // unless (i.e. terminating when) input tensors are replacments (like checkpoints)
14640
+ // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
14479
14641
  node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
14480
14642
  }
14481
14643
  // insert rewritten backward node with replacements made into resulting backward graph gb
@@ -15143,10 +15305,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15143
15305
  {
15144
15306
  GGML_ASSERT(false); // TODO: not implemented
15145
15307
  } break;
15308
+ case GGML_OP_PAD:
15309
+ {
15310
+ GGML_ASSERT(false); // TODO: not implemented
15311
+ } break;
15146
15312
  case GGML_OP_ARGSORT:
15147
15313
  {
15148
15314
  GGML_ASSERT(false); // TODO: not implemented
15149
15315
  } break;
15316
+ case GGML_OP_LEAKY_RELU:
15317
+ {
15318
+ GGML_ASSERT(false); // TODO: not implemented
15319
+ } break;
15150
15320
  case GGML_OP_FLASH_ATTN:
15151
15321
  {
15152
15322
  struct ggml_tensor * flash_grad = NULL;
@@ -15752,6 +15922,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15752
15922
  case GGML_OP_ARGMAX:
15753
15923
  case GGML_OP_REPEAT:
15754
15924
  case GGML_OP_REPEAT_BACK:
15925
+ case GGML_OP_LEAKY_RELU:
15755
15926
  {
15756
15927
  n_tasks = 1;
15757
15928
  } break;
@@ -15764,7 +15935,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15764
15935
  case GGML_UNARY_OP_TANH:
15765
15936
  case GGML_UNARY_OP_ELU:
15766
15937
  case GGML_UNARY_OP_RELU:
15767
- case GGML_UNARY_OP_LEAKY:
15768
15938
  {
15769
15939
  n_tasks = 1;
15770
15940
  } break;
@@ -15883,6 +16053,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15883
16053
  {
15884
16054
  n_tasks = n_threads;
15885
16055
  } break;
16056
+ case GGML_OP_PAD:
16057
+ {
16058
+ n_tasks = n_threads;
16059
+ } break;
15886
16060
  case GGML_OP_ARGSORT:
15887
16061
  {
15888
16062
  n_tasks = n_threads;
@@ -16146,7 +16320,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16146
16320
  } else
16147
16321
  #endif
16148
16322
  if (node->src[1]->type != vec_dot_type) {
16149
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16323
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
16150
16324
  }
16151
16325
  } break;
16152
16326
  case GGML_OP_MUL_MAT_ID:
@@ -16163,7 +16337,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16163
16337
  } else
16164
16338
  #endif
16165
16339
  if (b->type != vec_dot_type) {
16166
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
16340
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
16167
16341
  }
16168
16342
  } break;
16169
16343
  case GGML_OP_OUT_PROD:
@@ -16394,7 +16568,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
16394
16568
  fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
16395
16569
  ggml_type_name(tensor->type),
16396
16570
  ggml_op_name (tensor->op),
16397
- tensor->n_dims,
16571
+ ggml_n_dims(tensor),
16398
16572
  ne[0], ne[1], ne[2], ne[3],
16399
16573
  nb[0], nb[1], nb[2], nb[3],
16400
16574
  tensor->data,
@@ -16409,7 +16583,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16409
16583
  arg,
16410
16584
  ggml_type_name(tensor->type),
16411
16585
  ggml_op_name (tensor->op),
16412
- tensor->n_dims,
16586
+ ggml_n_dims(tensor),
16413
16587
  ne[0], ne[1], ne[2], ne[3],
16414
16588
  nb[0], nb[1], nb[2], nb[3],
16415
16589
  tensor->data,
@@ -16499,11 +16673,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16499
16673
 
16500
16674
  const uint32_t type = tensor->type;
16501
16675
  const uint32_t op = tensor->op;
16502
- const uint32_t n_dims = tensor->n_dims;
16503
16676
 
16504
16677
  fwrite(&type, sizeof(uint32_t), 1, fout);
16505
16678
  fwrite(&op, sizeof(uint32_t), 1, fout);
16506
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16507
16679
 
16508
16680
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16509
16681
  const uint64_t ne = tensor->ne[j];
@@ -16533,11 +16705,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16533
16705
 
16534
16706
  const uint32_t type = tensor->type;
16535
16707
  const uint32_t op = tensor->op;
16536
- const uint32_t n_dims = tensor->n_dims;
16537
16708
 
16538
16709
  fwrite(&type, sizeof(uint32_t), 1, fout);
16539
16710
  fwrite(&op, sizeof(uint32_t), 1, fout);
16540
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16541
16711
 
16542
16712
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16543
16713
  const uint64_t ne = tensor->ne[j];
@@ -16709,12 +16879,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16709
16879
  {
16710
16880
  uint32_t type;
16711
16881
  uint32_t op;
16712
- uint32_t n_dims;
16713
16882
 
16714
16883
  for (uint32_t i = 0; i < n_leafs; ++i) {
16715
16884
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16716
16885
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16717
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16718
16886
 
16719
16887
  int64_t ne[GGML_MAX_DIMS];
16720
16888
  size_t nb[GGML_MAX_DIMS];
@@ -16730,7 +16898,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16730
16898
  nb[j] = nb_cur;
16731
16899
  }
16732
16900
 
16733
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
16901
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16734
16902
 
16735
16903
  tensor->op = (enum ggml_op) op;
16736
16904
 
@@ -16747,7 +16915,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16747
16915
 
16748
16916
  ptr += ggml_nbytes(tensor);
16749
16917
 
16750
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
16918
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16751
16919
  }
16752
16920
  }
16753
16921
 
@@ -16757,12 +16925,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16757
16925
  {
16758
16926
  uint32_t type;
16759
16927
  uint32_t op;
16760
- uint32_t n_dims;
16761
16928
 
16762
16929
  for (uint32_t i = 0; i < n_nodes; ++i) {
16763
16930
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16764
16931
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16765
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16766
16932
 
16767
16933
  enum ggml_op eop = (enum ggml_op) op;
16768
16934
 
@@ -16833,7 +16999,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16833
16999
  } break;
16834
17000
  default:
16835
17001
  {
16836
- tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
17002
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16837
17003
 
16838
17004
  tensor->op = eop;
16839
17005
  } break;
@@ -16852,7 +17018,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16852
17018
 
16853
17019
  result->nodes[i] = tensor;
16854
17020
 
16855
- fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17021
+ fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16856
17022
  }
16857
17023
  }
16858
17024
  }
@@ -16990,7 +17156,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16990
17156
  fprintf(fp, "(%s)|", ggml_type_name(node->type));
16991
17157
  }
16992
17158
 
16993
- if (node->n_dims == 2) {
17159
+ if (ggml_is_matrix(node)) {
16994
17160
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
16995
17161
  } else {
16996
17162
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@@ -17257,7 +17423,7 @@ static enum ggml_opt_result ggml_opt_adam(
17257
17423
  int64_t i = 0;
17258
17424
  for (int p = 0; p < np; ++p) {
17259
17425
  const int64_t ne = ggml_nelements(ps[p]);
17260
- const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
17426
+ const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
17261
17427
  for (int64_t j = 0; j < ne; ++j) {
17262
17428
  float x = ggml_get_f32_1d(ps[p], j);
17263
17429
  float g_ = g[i]*gnorm;
@@ -18531,7 +18697,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18531
18697
  return NULL;
18532
18698
  }
18533
18699
 
18534
- const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
18700
+ const size_t size_cur = ggml_row_size(info->type, ne);
18535
18701
 
18536
18702
  ctx->size += GGML_PAD(size_cur, ctx->alignment);
18537
18703
  }
@@ -19035,8 +19201,8 @@ void gguf_add_tensor(
19035
19201
  ctx->infos[idx].ne[i] = 1;
19036
19202
  }
19037
19203
 
19038
- ctx->infos[idx].n_dims = tensor->n_dims;
19039
- for (int i = 0; i < tensor->n_dims; i++) {
19204
+ ctx->infos[idx].n_dims = ggml_n_dims(tensor);
19205
+ for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
19040
19206
  ctx->infos[idx].ne[i] = tensor->ne[i];
19041
19207
  }
19042
19208