llama_cpp 0.10.0 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
1
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
2
2
  #define _USE_MATH_DEFINES // For M_PI on MSVC
3
3
 
4
4
  #include "ggml-impl.h"
@@ -33,7 +33,7 @@
33
33
  // we should just be careful :)
34
34
  #pragma warning(disable: 4244 4267)
35
35
 
36
- // disable POSIX deprecation warnigns
36
+ // disable POSIX deprecation warnings
37
37
  // these functions are never going away, anyway
38
38
  #pragma warning(disable: 4996)
39
39
  #endif
@@ -1395,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1395
1395
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1396
1396
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1397
1397
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1398
- inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
1398
+ inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
1399
1399
 
1400
1400
  static const float GELU_COEF_A = 0.044715f;
1401
1401
  static const float GELU_QUICK_COEF = -1.702f;
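Note the algebra of the new kernel: ((x > 0) ? x : 0) + ns*((x < 0) ? x : 0) is exactly the classic leaky ReLU y = (x > 0) ? x : ns*x, with the previously hard-coded 0.1f slope replaced by the ns argument. A scalar reference sketch (hypothetical helper, not part of ggml):

    #include <stddef.h>

    // Reference semantics of ggml_vec_leaky_relu_f32: y = x      for x >  0
    //                                                 y = ns * x for x <= 0
    static void leaky_relu_ref(size_t n, float * y, const float * x, float ns) {
        for (size_t i = 0; i < n; ++i) {
            y[i] = (x[i] > 0.0f) ? x[i] : ns * x[i];
        }
    }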
@@ -1623,7 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1623
1623
  "POOL_1D",
1624
1624
  "POOL_2D",
1625
1625
  "UPSCALE",
1626
+ "PAD",
1626
1627
  "ARGSORT",
1628
+ "LEAKY_RELU",
1627
1629
 
1628
1630
  "FLASH_ATTN",
1629
1631
  "FLASH_FF",
@@ -1650,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1650
1652
  "CROSS_ENTROPY_LOSS_BACK",
1651
1653
  };
1652
1654
 
1653
- static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
1655
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1654
1656
 
1655
1657
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1656
1658
  "none",
@@ -1707,7 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1707
1709
  "pool_1d(x)",
1708
1710
  "pool_2d(x)",
1709
1711
  "upscale(x)",
1712
+ "pad(x)",
1710
1713
  "argsort(x)",
1714
+ "leaky_relu(x)",
1711
1715
 
1712
1716
  "flash_attn(x)",
1713
1717
  "flash_ff(x)",
@@ -1734,7 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1734
1738
  "cross_entropy_loss_back(x,y)",
1735
1739
  };
1736
1740
 
1737
- static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
1741
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1738
1742
 
1739
1743
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1740
1744
 
@@ -1750,17 +1754,16 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1750
1754
  "GELU",
1751
1755
  "GELU_QUICK",
1752
1756
  "SILU",
1753
- "LEAKY",
1754
1757
  };
1755
1758
 
1756
- static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
1759
+ static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
1757
1760
 
1758
1761
 
1759
1762
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
1760
1763
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
1761
1764
 
1762
1765
  // WARN:
1763
- // Mis-confguration can lead to problem that's hard to reason about:
1766
+ // Mis-configuration can lead to problem that's hard to reason about:
1764
1767
  // * At best it crash or talks nosense.
1765
1768
  // * At worst it talks slightly difference but hard to perceive.
1766
1769
  //
@@ -1994,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
1994
1997
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
1995
1998
  }
1996
1999
 
1997
- size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
1998
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1999
-
2000
- return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
2001
- }
2002
-
2003
2000
  int ggml_blck_size(enum ggml_type type) {
2004
2001
  return type_traits[type].blck_size;
2005
2002
  }
@@ -2008,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
2008
2005
  return type_traits[type].type_size;
2009
2006
  }
2010
2007
 
2011
- float ggml_type_sizef(enum ggml_type type) {
2012
- return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
2008
+ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2009
+ assert(ne % ggml_blck_size(type) == 0);
2010
+ return ggml_type_size(type)*ne/ggml_blck_size(type);
2011
+ }
2012
+
2013
+ double ggml_type_sizef(enum ggml_type type) {
2014
+ return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
2013
2015
  }
2014
2016
 
2015
2017
  const char * ggml_type_name(enum ggml_type type) {
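The new ggml_row_size(type, ne) returns the exact number of bytes occupied by a row of ne elements, including the block packing of quantized types, and replaces float-based size arithmetic (ggml_type_sizef itself now returns a double). A minimal usage sketch; the 4096-element width is only illustrative:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        const int64_t n = 4096; // hypothetical row width, must be a multiple of the block size
        printf("f32  row: %zu bytes\n", ggml_row_size(GGML_TYPE_F32,  n)); // n * 4
        printf("f16  row: %zu bytes\n", ggml_row_size(GGML_TYPE_F16,  n)); // n * 2
        printf("q4_0 row: %zu bytes\n", ggml_row_size(GGML_TYPE_Q4_0, n)); // n/32 blocks
        return 0;
    }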
@@ -2046,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
2046
2048
  return ggml_type_size(tensor->type);
2047
2049
  }
2048
2050
 
2049
- static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2051
+ bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2050
2052
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2051
2053
 
2052
2054
  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2053
2055
  }
2054
2056
 
2055
- static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
2057
+ bool ggml_is_vector(const struct ggml_tensor * tensor) {
2056
2058
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2057
2059
 
2058
2060
  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2059
2061
  }
2060
2062
 
2061
- static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2063
+ bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2062
2064
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2063
2065
 
2064
2066
  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
2065
2067
  }
2066
2068
 
2069
+ bool ggml_is_3d(const struct ggml_tensor * tensor) {
2070
+ return tensor->ne[3] == 1;
2071
+ }
2072
+
2073
+ int ggml_n_dims(const struct ggml_tensor * tensor) {
2074
+ for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
2075
+ if (tensor->ne[i] > 1) {
2076
+ return i + 1;
2077
+ }
2078
+ }
2079
+ return 1;
2080
+ }
2081
+
2067
2082
  static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
2068
2083
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2069
2084
 
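The shape predicates are now exported, and with the n_dims field dropped from ggml_tensor later in this diff, the effective rank is computed on demand with the new ggml_n_dims() (trailing dimensions of size 1 do not count). Classification sketch, assuming an initialized ggml_context * ctx:

    struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);      // ne = {4096, 1, 1, 1}
    struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 8);   // ne = {4096, 8, 1, 1}
    struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 3); // ne = {64, 64, 3, 1}

    // ggml_n_dims(v) == 1, ggml_n_dims(m) == 2, ggml_n_dims(t) == 3
    // ggml_is_vector(v), ggml_is_matrix(m) and ggml_is_3d(t) all return true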
@@ -2368,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
2368
2383
  size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
2369
2384
  size_t max_size = 0;
2370
2385
 
2371
- struct ggml_object * obj = ctx->objects_begin;
2372
-
2373
- while (obj != NULL) {
2374
- if (obj->type == GGML_OBJECT_TENSOR) {
2375
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
2376
-
2377
- const size_t size = ggml_nbytes(tensor);
2378
-
2379
- if (max_size < size) {
2380
- max_size = size;
2381
- }
2382
- }
2383
-
2384
- obj = obj->next;
2386
+ for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
2387
+ max_size = MAX(max_size, ggml_nbytes(tensor));
2385
2388
  }
2386
2389
 
2387
2390
  return max_size;
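The max-size scan now goes through the public iteration helpers (whose signatures also gain const on the context below). The same pattern is the intended way to walk a context from user code, e.g. to list tensors and their sizes (sketch, assuming an initialized ctx and <stdio.h>):

    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        printf("%-24s %8.2f KiB\n", t->name, ggml_nbytes(t)/1024.0);
    }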
@@ -2470,7 +2473,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2470
2473
  view_src = view_src->view_src;
2471
2474
  }
2472
2475
 
2473
- size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
2476
+ size_t data_size = ggml_row_size(type, ne[0]);
2474
2477
  for (int i = 1; i < n_dims; i++) {
2475
2478
  data_size *= ne[i];
2476
2479
  }
@@ -2513,7 +2516,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2513
2516
  /*.type =*/ type,
2514
2517
  /*.backend =*/ GGML_BACKEND_CPU,
2515
2518
  /*.buffer =*/ NULL,
2516
- /*.n_dims =*/ n_dims,
2517
2519
  /*.ne =*/ { 1, 1, 1, 1 },
2518
2520
  /*.nb =*/ { 0, 0, 0, 0 },
2519
2521
  /*.op =*/ GGML_OP_NONE,
@@ -2620,7 +2622,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
2620
2622
  }
2621
2623
 
2622
2624
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
2623
- return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
2625
+ return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
2624
2626
  }
2625
2627
 
2626
2628
  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3069,7 +3071,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
3069
3071
  struct ggml_tensor * ggml_view_tensor(
3070
3072
  struct ggml_context * ctx,
3071
3073
  struct ggml_tensor * src) {
3072
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
3074
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
3073
3075
  ggml_format_name(result, "%s (view)", src->name);
3074
3076
 
3075
3077
  for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3079,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
3079
3081
  return result;
3080
3082
  }
3081
3083
 
3082
- struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
3084
+ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
3083
3085
  struct ggml_object * obj = ctx->objects_begin;
3084
3086
 
3085
3087
  char * const mem_buffer = ctx->mem_buffer;
@@ -3095,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
3095
3097
  return NULL;
3096
3098
  }
3097
3099
 
3098
- struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
3100
+ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
3099
3101
  struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
3100
3102
  obj = obj->next;
3101
3103
 
@@ -3227,10 +3229,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
3227
3229
  is_node = true;
3228
3230
  }
3229
3231
 
3230
- struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
3232
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
3231
3233
 
3232
3234
  result->op = GGML_OP_ADD;
3233
- result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
3235
+ result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
3234
3236
  result->src[0] = a;
3235
3237
  result->src[1] = b;
3236
3238
 
@@ -3599,12 +3601,12 @@ struct ggml_tensor * ggml_sum_rows(
3599
3601
  is_node = true;
3600
3602
  }
3601
3603
 
3602
- int64_t ne[4] = {1,1,1,1};
3603
- for (int i=1; i<a->n_dims; ++i) {
3604
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
3605
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3604
3606
  ne[i] = a->ne[i];
3605
3607
  }
3606
3608
 
3607
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
3609
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
3608
3610
 
3609
3611
  result->op = GGML_OP_SUM_ROWS;
3610
3612
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3625,8 +3627,8 @@ struct ggml_tensor * ggml_mean(
3625
3627
  is_node = true;
3626
3628
  }
3627
3629
 
3628
- int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3629
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
3630
+ int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3631
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3630
3632
 
3631
3633
  result->op = GGML_OP_MEAN;
3632
3634
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3648,8 +3650,7 @@ struct ggml_tensor * ggml_argmax(
3648
3650
  is_node = true;
3649
3651
  }
3650
3652
 
3651
- int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
3652
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
3653
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
3653
3654
 
3654
3655
  result->op = GGML_OP_ARGMAX;
3655
3656
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3672,7 +3673,7 @@ struct ggml_tensor * ggml_repeat(
3672
3673
  is_node = true;
3673
3674
  }
3674
3675
 
3675
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3676
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3676
3677
 
3677
3678
  result->op = GGML_OP_REPEAT;
3678
3679
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3699,7 +3700,7 @@ struct ggml_tensor * ggml_repeat_back(
3699
3700
  return a;
3700
3701
  }
3701
3702
 
3702
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3703
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3703
3704
 
3704
3705
  result->op = GGML_OP_REPEAT_BACK;
3705
3706
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3830,12 +3831,25 @@ struct ggml_tensor * ggml_relu_inplace(
3830
3831
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3831
3832
  }
3832
3833
 
3833
- // ggml_leaky
3834
+ // ggml_leaky_relu
3834
3835
 
3835
- struct ggml_tensor * ggml_leaky(
3836
+ struct ggml_tensor * ggml_leaky_relu(
3836
3837
  struct ggml_context * ctx,
3837
- struct ggml_tensor * a) {
3838
- return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3838
+ struct ggml_tensor * a, float negative_slope, bool inplace) {
3839
+ bool is_node = false;
3840
+
3841
+ if (!inplace && (a->grad)) {
3842
+ is_node = true;
3843
+ }
3844
+
3845
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3846
+ ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
3847
+
3848
+ result->op = GGML_OP_LEAKY_RELU;
3849
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
3850
+ result->src[0] = a;
3851
+
3852
+ return result;
3839
3853
  }
3840
3854
 
3841
3855
  // ggml_gelu
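ggml_leaky() becomes ggml_leaky_relu() with an explicit negative slope (stored in op_params) and an inplace flag, backed by the new top-level GGML_OP_LEAKY_RELU instead of a unary-op flavor. Hedged migration sketch (cur is an illustrative activation tensor):

    // before (0.10.0): slope fixed at 0.1f inside the kernel
    // cur = ggml_leaky(ctx, cur);

    // after (0.10.2): slope is explicit, optionally computed in place
    cur = ggml_leaky_relu(ctx, cur, 0.1f, /*inplace=*/false);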
@@ -4022,8 +4036,9 @@ static struct ggml_tensor * ggml_group_norm_impl(
4022
4036
 
4023
4037
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4024
4038
 
4025
- result->op = GGML_OP_GROUP_NORM;
4026
4039
  result->op_params[0] = n_groups;
4040
+
4041
+ result->op = GGML_OP_GROUP_NORM;
4027
4042
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4028
4043
  result->src[0] = a;
4029
4044
  result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4061,7 +4076,7 @@ struct ggml_tensor * ggml_mul_mat(
4061
4076
  }
4062
4077
 
4063
4078
  const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4064
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4079
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4065
4080
 
4066
4081
  result->op = GGML_OP_MUL_MAT;
4067
4082
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4071,21 +4086,30 @@ struct ggml_tensor * ggml_mul_mat(
4071
4086
  return result;
4072
4087
  }
4073
4088
 
4089
+ void ggml_mul_mat_set_prec(
4090
+ struct ggml_tensor * a,
4091
+ enum ggml_prec prec) {
4092
+ const int32_t prec_i32 = (int32_t) prec;
4093
+
4094
+ ggml_set_op_params_i32(a, 0, prec_i32);
4095
+ }
4096
+
4074
4097
  // ggml_mul_mat_id
4075
4098
 
4076
4099
  struct ggml_tensor * ggml_mul_mat_id(
4077
4100
  struct ggml_context * ctx,
4078
- struct ggml_tensor * as[],
4101
+ struct ggml_tensor * const as[],
4102
+ int n_as,
4079
4103
  struct ggml_tensor * ids,
4080
4104
  int id,
4081
4105
  struct ggml_tensor * b) {
4082
4106
 
4083
- int64_t n_as = ids->ne[0];
4084
-
4085
4107
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
4086
- GGML_ASSERT(ggml_is_vector(ids));
4108
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
4109
+ GGML_ASSERT(ids->ne[1] == b->ne[1]);
4110
+ GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
4087
4111
  GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
4088
- GGML_ASSERT(id >= 0 && id < n_as);
4112
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]);
4089
4113
 
4090
4114
  bool is_node = false;
4091
4115
 
@@ -4094,16 +4118,17 @@ struct ggml_tensor * ggml_mul_mat_id(
4094
4118
  }
4095
4119
 
4096
4120
  const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4097
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
4121
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4098
4122
 
4099
4123
  ggml_set_op_params_i32(result, 0, id);
4124
+ ggml_set_op_params_i32(result, 1, n_as);
4100
4125
 
4101
4126
  result->op = GGML_OP_MUL_MAT_ID;
4102
4127
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4103
4128
  result->src[0] = ids;
4104
4129
  result->src[1] = b;
4105
4130
 
4106
- for (int64_t i = 0; i < n_as; i++) {
4131
+ for (int i = 0; i < n_as; i++) {
4107
4132
  struct ggml_tensor * a = as[i];
4108
4133
  GGML_ASSERT(ggml_are_same_shape(as[0], a));
4109
4134
  GGML_ASSERT(ggml_can_mul_mat(a, b));
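ggml_mul_mat_id() now takes the expert count n_as explicitly (stored in op_params alongside id) instead of deriving it from ids->ne[0], and ids may hold one row of expert indices per src1 column. A hedged call sketch for a mixture-of-experts feed-forward; the names n_expert, ffn_up_exps and selected_experts are illustrative only:

    // ffn_up_exps:      array of n_expert weight tensors, all the same shape
    // selected_experts: I32 tensor [n_expert_used, n_tokens] with the chosen experts per token
    // cur:              F32 tensor [n_embd, n_tokens]; id picks which selected-expert slot to use
    struct ggml_tensor * up = ggml_mul_mat_id(ctx, ffn_up_exps, n_expert, selected_experts, /*id=*/0, cur);

    // added in the same release for plain mul_mat results (GGML_PREC_F32 assumed from this version's ggml.h):
    // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);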
@@ -4131,7 +4156,7 @@ struct ggml_tensor * ggml_out_prod(
4131
4156
 
4132
4157
  // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
4133
4158
  const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
4134
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4159
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4135
4160
 
4136
4161
  result->op = GGML_OP_OUT_PROD;
4137
4162
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4146,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
4146
4171
  static struct ggml_tensor * ggml_scale_impl(
4147
4172
  struct ggml_context * ctx,
4148
4173
  struct ggml_tensor * a,
4149
- struct ggml_tensor * b,
4174
+ float s,
4150
4175
  bool inplace) {
4151
- GGML_ASSERT(ggml_is_scalar(b));
4152
4176
  GGML_ASSERT(ggml_is_padded_1d(a));
4153
4177
 
4154
4178
  bool is_node = false;
4155
4179
 
4156
- if (a->grad || b->grad) {
4180
+ if (a->grad) {
4157
4181
  is_node = true;
4158
4182
  }
4159
4183
 
4160
4184
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4161
4185
 
4186
+ ggml_set_op_params(result, &s, sizeof(s));
4187
+
4162
4188
  result->op = GGML_OP_SCALE;
4163
4189
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4164
4190
  result->src[0] = a;
4165
- result->src[1] = b;
4166
4191
 
4167
4192
  return result;
4168
4193
  }
@@ -4170,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
4170
4195
  struct ggml_tensor * ggml_scale(
4171
4196
  struct ggml_context * ctx,
4172
4197
  struct ggml_tensor * a,
4173
- struct ggml_tensor * b) {
4174
- return ggml_scale_impl(ctx, a, b, false);
4198
+ float s) {
4199
+ return ggml_scale_impl(ctx, a, s, false);
4175
4200
  }
4176
4201
 
4177
4202
  struct ggml_tensor * ggml_scale_inplace(
4178
4203
  struct ggml_context * ctx,
4179
4204
  struct ggml_tensor * a,
4180
- struct ggml_tensor * b) {
4181
- return ggml_scale_impl(ctx, a, b, true);
4205
+ float s) {
4206
+ return ggml_scale_impl(ctx, a, s, true);
4182
4207
  }
4183
4208
 
4184
4209
  // ggml_set
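ggml_scale() and ggml_scale_inplace() now take the factor as a plain float stored in op_params, so the former one-element scale tensor (and its gradient) goes away. Migration sketch; kq and n_embd_head are illustrative names:

    // before (0.10.0)
    // kq = ggml_scale(ctx, kq, ggml_new_f32(ctx, 1.0f/sqrtf((float) n_embd_head)));

    // after (0.10.2)
    kq = ggml_scale(ctx, kq, 1.0f/sqrtf((float) n_embd_head));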
@@ -4416,7 +4441,7 @@ struct ggml_tensor * ggml_reshape(
4416
4441
  //GGML_ASSERT(false);
4417
4442
  }
4418
4443
 
4419
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
4444
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
4420
4445
  ggml_format_name(result, "%s (reshaped)", a->name);
4421
4446
 
4422
4447
  result->op = GGML_OP_RESHAPE;
@@ -4731,7 +4756,9 @@ struct ggml_tensor * ggml_get_rows(
4731
4756
  struct ggml_context * ctx,
4732
4757
  struct ggml_tensor * a,
4733
4758
  struct ggml_tensor * b) {
4734
- GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
4759
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
4760
+ GGML_ASSERT(b->ne[3] == 1);
4761
+ GGML_ASSERT(b->type == GGML_TYPE_I32);
4735
4762
 
4736
4763
  bool is_node = false;
4737
4764
 
@@ -4741,7 +4768,7 @@ struct ggml_tensor * ggml_get_rows(
4741
4768
 
4742
4769
  // TODO: implement non F32 return
4743
4770
  //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
4744
- struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
4771
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
4745
4772
 
4746
4773
  result->op = GGML_OP_GET_ROWS;
4747
4774
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
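ggml_get_rows() is generalized from "matrix plus index vector" to batched lookups: b can carry up to three dimensions of I32 indices, and the result is always allocated as a 4-D F32 tensor of shape [a->ne[0], b->ne[0], b->ne[1], b->ne[2]]. Shape sketch with illustrative names:

    // tok_embd:  [n_embd, n_vocab, n_batch]  (F32 or quantized)
    // token_ids: [n_tokens, n_batch]         (I32); token_ids->ne[1] must equal tok_embd->ne[2]
    struct ggml_tensor * rows = ggml_get_rows(ctx, tok_embd, token_ids);
    // rows: [n_embd, n_tokens, n_batch, 1]   (F32)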
@@ -4792,7 +4819,7 @@ struct ggml_tensor * ggml_diag(
4792
4819
  }
4793
4820
 
4794
4821
  const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
4795
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
4822
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
4796
4823
 
4797
4824
  result->op = GGML_OP_DIAG;
4798
4825
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5439,7 +5466,7 @@ struct ggml_tensor * ggml_pool_1d(
5439
5466
  is_node = true;
5440
5467
  }
5441
5468
 
5442
- const int64_t ne[3] = {
5469
+ const int64_t ne[2] = {
5443
5470
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5444
5471
  a->ne[1],
5445
5472
  };
@@ -5519,6 +5546,30 @@ static struct ggml_tensor * ggml_upscale_impl(
5519
5546
  return result;
5520
5547
  }
5521
5548
 
5549
+ struct ggml_tensor * ggml_pad(
5550
+ struct ggml_context * ctx,
5551
+ struct ggml_tensor * a,
5552
+ int p0, int p1, int p2, int p3) {
5553
+ bool is_node = false;
5554
+
5555
+ if (a->grad) {
5556
+ GGML_ASSERT(false); // TODO: implement backward
5557
+ is_node = true;
5558
+ }
5559
+
5560
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
5561
+ a->ne[0] + p0,
5562
+ a->ne[1] + p1,
5563
+ a->ne[2] + p2,
5564
+ a->ne[3] + p3);
5565
+
5566
+ result->op = GGML_OP_PAD;
5567
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5568
+ result->src[0] = a;
5569
+
5570
+ return result;
5571
+ }
5572
+
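The new ggml_pad() appends p0..p3 zero-filled elements at the end of each of the four dimensions (no backward pass yet). For instance, rounding the first two dimensions up to a multiple of 32 might look like this sketch, reusing the existing GGML_PAD rounding macro:

    struct ggml_tensor * padded = ggml_pad(ctx, cur,
            (int) (GGML_PAD(cur->ne[0], 32) - cur->ne[0]),  // extra elements in dim 0
            (int) (GGML_PAD(cur->ne[1], 32) - cur->ne[1]),  // extra elements in dim 1
            0, 0);                                          // dims 2 and 3 unchanged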
5522
5573
  struct ggml_tensor * ggml_upscale(
5523
5574
  struct ggml_context * ctx,
5524
5575
  struct ggml_tensor * a,
@@ -5534,7 +5585,7 @@ struct ggml_tensor * ggml_argsort(
5534
5585
  enum ggml_sort_order order) {
5535
5586
  bool is_node = false;
5536
5587
 
5537
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
5588
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
5538
5589
 
5539
5590
  ggml_set_op_params_i32(result, 0, (int32_t) order);
5540
5591
 
@@ -5581,7 +5632,7 @@ struct ggml_tensor * ggml_flash_attn(
5581
5632
  }
5582
5633
 
5583
5634
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
5584
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
5635
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
5585
5636
 
5586
5637
  int32_t t = masked ? 1 : 0;
5587
5638
  ggml_set_op_params(result, &t, sizeof(t));
@@ -5614,7 +5665,7 @@ struct ggml_tensor * ggml_flash_ff(
5614
5665
  }
5615
5666
 
5616
5667
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5617
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
5668
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
5618
5669
 
5619
5670
  result->op = GGML_OP_FLASH_FF;
5620
5671
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5730,7 +5781,6 @@ struct ggml_tensor * ggml_win_part(
5730
5781
  const int np = npx*npy;
5731
5782
 
5732
5783
  const int64_t ne[4] = { a->ne[0], w, w, np, };
5733
-
5734
5784
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5735
5785
 
5736
5786
  int32_t params[] = { npx, npy, w };
@@ -7520,7 +7570,7 @@ static void ggml_compute_forward_acc_f32(
7520
7570
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
7521
7571
 
7522
7572
  // view src0 and dst with these strides and data offset inbytes during acc
7523
- // nb0 is implicitely element_size because src0 and dst are contiguous
7573
+ // nb0 is implicitly element_size because src0 and dst are contiguous
7524
7574
  size_t nb1 = ((int32_t *) dst->op_params)[0];
7525
7575
  size_t nb2 = ((int32_t *) dst->op_params)[1];
7526
7576
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7716,6 +7766,8 @@ static void ggml_compute_forward_mul_f32(
7716
7766
 
7717
7767
  #ifdef GGML_USE_CLBLAST
7718
7768
  if (src1->backend == GGML_BACKEND_GPU) {
7769
+ // TODO: OpenCL kernel support full broadcast
7770
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7719
7771
  if (ith == 0) {
7720
7772
  ggml_cl_mul(src0, src1, dst);
7721
7773
  }
@@ -8981,10 +9033,9 @@ static void ggml_compute_forward_silu(
8981
9033
  } break;
8982
9034
  }
8983
9035
  }
9036
+ // ggml_compute_forward_leaky_relu
8984
9037
 
8985
- // ggml_compute_forward_leaky
8986
-
8987
- static void ggml_compute_forward_leaky_f32(
9038
+ static void ggml_compute_forward_leaky_relu_f32(
8988
9039
  const struct ggml_compute_params * params,
8989
9040
  const struct ggml_tensor * src0,
8990
9041
  struct ggml_tensor * dst) {
@@ -8998,24 +9049,27 @@ static void ggml_compute_forward_leaky_f32(
8998
9049
  const int n = ggml_nrows(src0);
8999
9050
  const int nc = src0->ne[0];
9000
9051
 
9052
+ float negative_slope;
9053
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
9054
+
9001
9055
  assert(dst->nb[0] == sizeof(float));
9002
9056
  assert(src0->nb[0] == sizeof(float));
9003
9057
 
9004
9058
  for (int i = 0; i < n; i++) {
9005
- ggml_vec_leaky_f32(nc,
9059
+ ggml_vec_leaky_relu_f32(nc,
9006
9060
  (float *) ((char *) dst->data + i*( dst->nb[1])),
9007
- (float *) ((char *) src0->data + i*(src0->nb[1])));
9061
+ (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
9008
9062
  }
9009
9063
  }
9010
9064
 
9011
- static void ggml_compute_forward_leaky(
9065
+ static void ggml_compute_forward_leaky_relu(
9012
9066
  const struct ggml_compute_params * params,
9013
9067
  const struct ggml_tensor * src0,
9014
9068
  struct ggml_tensor * dst) {
9015
9069
  switch (src0->type) {
9016
9070
  case GGML_TYPE_F32:
9017
9071
  {
9018
- ggml_compute_forward_leaky_f32(params, src0, dst);
9072
+ ggml_compute_forward_leaky_relu_f32(params, src0, dst);
9019
9073
  } break;
9020
9074
  default:
9021
9075
  {
@@ -9110,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
9110
9164
  float eps;
9111
9165
  memcpy(&eps, dst->op_params, sizeof(float));
9112
9166
 
9167
+ GGML_ASSERT(eps > 0.0f);
9168
+
9113
9169
  // TODO: optimize
9114
9170
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9115
9171
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9179,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
9179
9235
  float eps;
9180
9236
  memcpy(&eps, dst->op_params, sizeof(float));
9181
9237
 
9238
+ GGML_ASSERT(eps > 0.0f);
9239
+
9182
9240
  // TODO: optimize
9183
9241
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9184
9242
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9504,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9504
9562
  const int64_t ne0 = dst->ne[0];
9505
9563
  const int64_t ne1 = dst->ne[1];
9506
9564
 
9565
+ // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
9566
+ // all the experts for each batch element and the processing would become incredibly slow
9507
9567
  // TODO: find the optimal values for these
9508
- if (ggml_is_contiguous(src0) &&
9568
+ if (dst->op != GGML_OP_MUL_MAT_ID &&
9569
+ ggml_is_contiguous(src0) &&
9509
9570
  ggml_is_contiguous(src1) &&
9510
9571
  //src0->type == GGML_TYPE_F32 &&
9511
9572
  src1->type == GGML_TYPE_F32 &&
@@ -9593,8 +9654,7 @@ static void ggml_compute_forward_mul_mat(
9593
9654
 
9594
9655
  const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9595
9656
  const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
9596
-
9597
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9657
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9598
9658
 
9599
9659
  if (type != GGML_TYPE_F32) {
9600
9660
  float * const wdata = params->wdata;
@@ -9611,10 +9671,10 @@ static void ggml_compute_forward_mul_mat(
9611
9671
  }
9612
9672
 
9613
9673
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9614
- ne11, ne01, ne10,
9615
- 1.0f, y, ne10,
9616
- x, ne00,
9617
- 0.0f, d, ne01);
9674
+ ne1, ne01, ne10,
9675
+ 1.0f, y, ne10,
9676
+ x, ne00,
9677
+ 0.0f, d, ne01);
9618
9678
  }
9619
9679
  }
9620
9680
 
@@ -9627,9 +9687,10 @@ static void ggml_compute_forward_mul_mat(
9627
9687
  if (params->type == GGML_TASK_INIT) {
9628
9688
  if (src1->type != vec_dot_type) {
9629
9689
  char * wdata = params->wdata;
9630
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9690
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9631
9691
 
9632
9692
  assert(params->wsize >= ne11*ne12*ne13*row_size);
9693
+ assert(src1->type == GGML_TYPE_F32);
9633
9694
 
9634
9695
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
9635
9696
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9649,10 +9710,10 @@ static void ggml_compute_forward_mul_mat(
9649
9710
  }
9650
9711
 
9651
9712
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9652
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9713
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9653
9714
 
9654
- const int64_t nr0 = ne01; // src0 rows
9655
- const int64_t nr1 = ne11*ne12*ne13; // src1 rows
9715
+ const int64_t nr0 = ne01; // src0 rows
9716
+ const int64_t nr1 = ne1*ne12*ne13; // src1 rows
9656
9717
 
9657
9718
  //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9658
9719
 
@@ -9694,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
9694
9755
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9695
9756
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9696
9757
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9697
- const int64_t i13 = (ir1/(ne12*ne11));
9698
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
9699
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
9758
+ const int64_t i13 = (ir1/(ne12*ne1));
9759
+ const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
9760
+ const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
9700
9761
 
9701
9762
  // broadcast src0 into src1
9702
9763
  const int64_t i03 = i13/r3;
@@ -9736,20 +9797,191 @@ static void ggml_compute_forward_mul_mat(
9736
9797
 
9737
9798
  static void ggml_compute_forward_mul_mat_id(
9738
9799
  const struct ggml_compute_params * params,
9800
+ const struct ggml_tensor * ids,
9801
+ const struct ggml_tensor * src1,
9739
9802
  struct ggml_tensor * dst) {
9740
9803
 
9741
- const struct ggml_tensor * ids = dst->src[0];
9742
- const struct ggml_tensor * src1 = dst->src[1];
9804
+ const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
9805
+
9806
+ GGML_TENSOR_BINARY_OP_LOCALS
9807
+
9808
+ const int ith = params->ith;
9809
+ const int nth = params->nth;
9810
+
9811
+ const enum ggml_type type = src0->type;
9812
+
9813
+ const bool src1_cont = ggml_is_contiguous(src1);
9814
+
9815
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
9816
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
9817
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
9818
+
9819
+ GGML_ASSERT(ne0 == ne01);
9820
+ GGML_ASSERT(ne1 == ne11);
9821
+ GGML_ASSERT(ne2 == ne12);
9822
+ GGML_ASSERT(ne3 == ne13);
9823
+
9824
+ // we don't support permuted src0 or src1
9825
+ GGML_ASSERT(nb00 == ggml_type_size(type));
9826
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9827
+
9828
+ // dst cannot be transposed or permuted
9829
+ GGML_ASSERT(nb0 == sizeof(float));
9830
+ GGML_ASSERT(nb0 <= nb1);
9831
+ GGML_ASSERT(nb1 <= nb2);
9832
+ GGML_ASSERT(nb2 <= nb3);
9833
+
9834
+ // broadcast factors
9835
+ const int64_t r2 = ne12/ne02;
9836
+ const int64_t r3 = ne13/ne03;
9837
+
9838
+ // row groups
9839
+ const int id = ggml_get_op_params_i32(dst, 0);
9840
+ const int n_as = ggml_get_op_params_i32(dst, 1);
9841
+
9842
+ char * wdata_src1_end = (src1->type == vec_dot_type) ?
9843
+ (char *) params->wdata :
9844
+ (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
9845
+
9846
+ int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
9847
+ int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
9848
+
9849
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
9850
+
9851
+ if (params->type == GGML_TASK_INIT) {
9852
+ char * wdata = params->wdata;
9853
+ if (src1->type != vec_dot_type) {
9854
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9855
+
9856
+ assert(params->wsize >= ne11*ne12*ne13*row_size);
9857
+ assert(src1->type == GGML_TYPE_F32);
9858
+
9859
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
9860
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
9861
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
9862
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
9863
+ wdata += row_size;
9864
+ }
9865
+ }
9866
+ }
9867
+ }
9868
+
9869
+ // initialize matrix_row_counts
9870
+ GGML_ASSERT(wdata == wdata_src1_end);
9871
+ memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
9872
+
9873
+ // group rows by src0 matrix
9874
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
9875
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
9876
+
9877
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
9878
+ MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
9879
+ matrix_row_counts[row_id] += 1;
9880
+ }
9881
+
9882
+ return;
9883
+ }
9884
+
9885
+ if (params->type == GGML_TASK_FINALIZE) {
9886
+ return;
9887
+ }
9888
+
9889
+ // compute each matrix multiplication in sequence
9890
+ for (int cur_a = 0; cur_a < n_as; ++cur_a) {
9891
+ const int64_t cne1 = matrix_row_counts[cur_a];
9892
+
9893
+ if (cne1 == 0) {
9894
+ continue;
9895
+ }
9743
9896
 
9744
- const int id = ggml_get_op_params_i32(dst, 0);
9897
+ const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
9745
9898
 
9746
- const int a_id = ((int32_t *)ids->data)[id];
9899
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9900
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9747
9901
 
9748
- GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
9902
+ const int64_t nr0 = ne01; // src0 rows
9903
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
9749
9904
 
9750
- const struct ggml_tensor * src0 = dst->src[a_id + 2];
9905
+ //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9751
9906
 
9752
- ggml_compute_forward_mul_mat(params, src0, src1, dst);
9907
+ // distribute the thread work across the inner or outer loop based on which one is larger
9908
+
9909
+ const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
9910
+ const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
9911
+
9912
+ const int64_t ith0 = ith % nth0;
9913
+ const int64_t ith1 = ith / nth0;
9914
+
9915
+ const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
9916
+ const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
9917
+
9918
+ const int64_t ir010 = dr0*ith0;
9919
+ const int64_t ir011 = MIN(ir010 + dr0, nr0);
9920
+
9921
+ const int64_t ir110 = dr1*ith1;
9922
+ const int64_t ir111 = MIN(ir110 + dr1, nr1);
9923
+
9924
+ //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
9925
+
9926
+ // threads with no work simply yield (not sure if it helps)
9927
+ if (ir010 >= ir011 || ir110 >= ir111) {
9928
+ sched_yield();
9929
+ continue;
9930
+ }
9931
+
9932
+ assert(ne12 % ne02 == 0);
9933
+ assert(ne13 % ne03 == 0);
9934
+
9935
+ // block-tiling attempt
9936
+ const int64_t blck_0 = 16;
9937
+ const int64_t blck_1 = 16;
9938
+
9939
+ // attempt to reduce false-sharing (does not seem to make a difference)
9940
+ float tmp[16];
9941
+
9942
+ for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9943
+ for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9944
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9945
+ const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
9946
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
9947
+ const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
9948
+ const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
9949
+
9950
+ // broadcast src0 into src1
9951
+ const int64_t i03 = i13/r3;
9952
+ const int64_t i02 = i12/r2;
9953
+
9954
+ const int64_t i1 = i11;
9955
+ const int64_t i2 = i12;
9956
+ const int64_t i3 = i13;
9957
+
9958
+ const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
9959
+
9960
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
9961
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
9962
+ // the original src1 data pointer, so we should index using the indices directly
9963
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
9964
+ const char * src1_col = (const char *) wdata +
9965
+ (src1_cont || src1->type != vec_dot_type
9966
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
9967
+ : (i11*nb11 + i12*nb12 + i13*nb13));
9968
+
9969
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
9970
+
9971
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
9972
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
9973
+ //}
9974
+
9975
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
9976
+ vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
9977
+ }
9978
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
9979
+ }
9980
+ }
9981
+ }
9982
+ }
9983
+
9984
+ #undef MMID_MATRIX_ROW
9753
9985
  }
9754
9986
 
9755
9987
  // ggml_compute_forward_out_prod
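The rewritten kernel no longer forwards a single expert chosen from ids; it first groups the src1 rows by the expert each one selected (the matrix_row_counts / MMID_MATRIX_ROW bookkeeping), then runs one multiplication per expert over only that expert's rows. A stripped-down sketch of the grouping step, as hypothetical standalone code rather than the actual kernel:

    #include <stdint.h>
    #include <string.h>

    // ids[t] is the expert chosen for row t; rows[e*n_rows + k] ends up holding the
    // k-th src1 row routed to expert e, and counts[e] how many such rows exist.
    static void group_rows_by_expert(const int32_t * ids, int n_rows, int n_as,
                                     int64_t * counts /* [n_as] */, int64_t * rows /* [n_as*n_rows] */) {
        memset(counts, 0, n_as*sizeof(int64_t));
        for (int t = 0; t < n_rows; ++t) {
            const int32_t e = ids[t];
            rows[e*n_rows + counts[e]] = t;
            counts[e] += 1;
        }
        // each expert e then multiplies its matrix against the counts[e] gathered rows
    }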
@@ -10093,19 +10325,17 @@ static void ggml_compute_forward_out_prod(
10093
10325
  static void ggml_compute_forward_scale_f32(
10094
10326
  const struct ggml_compute_params * params,
10095
10327
  const struct ggml_tensor * src0,
10096
- const struct ggml_tensor * src1,
10097
10328
  struct ggml_tensor * dst) {
10098
10329
  GGML_ASSERT(ggml_is_contiguous(src0));
10099
10330
  GGML_ASSERT(ggml_is_contiguous(dst));
10100
10331
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10101
- GGML_ASSERT(ggml_is_scalar(src1));
10102
10332
 
10103
10333
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10104
10334
  return;
10105
10335
  }
10106
10336
 
10107
10337
  // scale factor
10108
- const float v = *(float *) src1->data;
10338
+ const float v = *(float *) dst->op_params;
10109
10339
 
10110
10340
  const int ith = params->ith;
10111
10341
  const int nth = params->nth;
@@ -10136,12 +10366,11 @@ static void ggml_compute_forward_scale_f32(
10136
10366
  static void ggml_compute_forward_scale(
10137
10367
  const struct ggml_compute_params * params,
10138
10368
  const struct ggml_tensor * src0,
10139
- const struct ggml_tensor * src1,
10140
10369
  struct ggml_tensor * dst) {
10141
10370
  switch (src0->type) {
10142
10371
  case GGML_TYPE_F32:
10143
10372
  {
10144
- ggml_compute_forward_scale_f32(params, src0, src1, dst);
10373
+ ggml_compute_forward_scale_f32(params, src0, dst);
10145
10374
  } break;
10146
10375
  default:
10147
10376
  {
@@ -10161,7 +10390,7 @@ static void ggml_compute_forward_set_f32(
10161
10390
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
10162
10391
 
10163
10392
  // view src0 and dst with these strides and data offset inbytes during set
10164
- // nb0 is implicitely element_size because src0 and dst are contiguous
10393
+ // nb0 is implicitly element_size because src0 and dst are contiguous
10165
10394
  size_t nb1 = ((int32_t *) dst->op_params)[0];
10166
10395
  size_t nb2 = ((int32_t *) dst->op_params)[1];
10167
10396
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -10325,21 +10554,30 @@ static void ggml_compute_forward_get_rows_q(
10325
10554
  return;
10326
10555
  }
10327
10556
 
10328
- const int nc = src0->ne[0];
10329
- const int nr = ggml_nelements(src1);
10557
+ GGML_TENSOR_BINARY_OP_LOCALS
10558
+
10559
+ const int64_t nc = ne00;
10560
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10561
+
10330
10562
  const enum ggml_type type = src0->type;
10331
10563
  ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
10332
10564
 
10333
- assert( dst->ne[0] == nc);
10334
- assert( dst->ne[1] == nr);
10335
- assert(src0->nb[0] == ggml_type_size(type));
10565
+ assert(ne0 == nc);
10566
+ assert(ne02 == ne11);
10567
+ assert(nb00 == ggml_type_size(type));
10568
+ assert(ggml_nrows(dst) == nr);
10336
10569
 
10337
- for (int i = 0; i < nr; ++i) {
10338
- const int r = ((int32_t *) src1->data)[i];
10570
+ // TODO: multi-thread
10571
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10572
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10573
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10574
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10339
10575
 
10340
- dequantize_row_q(
10341
- (const void *) ((char *) src0->data + r*src0->nb[1]),
10342
- (float *) ((char *) dst->data + i*dst->nb[1]), nc);
10576
+ dequantize_row_q(
10577
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10578
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10579
+ }
10580
+ }
10343
10581
  }
10344
10582
  }
10345
10583
 
@@ -10354,19 +10592,26 @@ static void ggml_compute_forward_get_rows_f16(
10354
10592
  return;
10355
10593
  }
10356
10594
 
10357
- const int nc = src0->ne[0];
10358
- const int nr = ggml_nelements(src1);
10595
+ GGML_TENSOR_BINARY_OP_LOCALS
10359
10596
 
10360
- assert( dst->ne[0] == nc);
10361
- assert( dst->ne[1] == nr);
10362
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
10597
+ const int64_t nc = ne00;
10598
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10363
10599
 
10364
- for (int i = 0; i < nr; ++i) {
10365
- const int r = ((int32_t *) src1->data)[i];
10600
+ assert(ne0 == nc);
10601
+ assert(ne02 == ne11);
10602
+ assert(nb00 == sizeof(ggml_fp16_t));
10603
+ assert(ggml_nrows(dst) == nr);
10366
10604
 
10367
- for (int j = 0; j < nc; ++j) {
10368
- ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
10369
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
10605
+ // TODO: multi-thread
10606
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10607
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10608
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10609
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10610
+
10611
+ ggml_fp16_to_fp32_row(
10612
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10613
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10614
+ }
10370
10615
  }
10371
10616
  }
10372
10617
  }
@@ -10382,19 +10627,27 @@ static void ggml_compute_forward_get_rows_f32(
10382
10627
  return;
10383
10628
  }
10384
10629
 
10385
- const int nc = src0->ne[0];
10386
- const int nr = ggml_nelements(src1);
10630
+ GGML_TENSOR_BINARY_OP_LOCALS
10387
10631
 
10388
- assert( dst->ne[0] == nc);
10389
- assert( dst->ne[1] == nr);
10390
- assert(src0->nb[0] == sizeof(float));
10632
+ const int64_t nc = ne00;
10633
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10391
10634
 
10392
- for (int i = 0; i < nr; ++i) {
10393
- const int r = ((int32_t *) src1->data)[i];
10635
+ assert(ne0 == nc);
10636
+ assert(ne02 == ne11);
10637
+ assert(nb00 == sizeof(float));
10638
+ assert(ggml_nrows(dst) == nr);
10394
10639
 
10395
- ggml_vec_cpy_f32(nc,
10396
- (float *) ((char *) dst->data + i*dst->nb[1]),
10397
- (float *) ((char *) src0->data + r*src0->nb[1]));
10640
+ // TODO: multi-thread
10641
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10642
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10643
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10644
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10645
+
10646
+ ggml_vec_cpy_f32(nc,
10647
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
10648
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
10649
+ }
10650
+ }
10398
10651
  }
10399
10652
  }
10400
10653
 
@@ -11306,10 +11559,13 @@ static void ggml_compute_forward_rope_f32(
11306
11559
  }
11307
11560
  } else {
11308
11561
  // TODO: this might be wrong for ne0 != n_dims - need double check
11309
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
11562
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
11563
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
11310
11564
  theta_base *= freq_scale;
11311
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11312
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11565
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
11566
+ if (ic < n_dims) {
11567
+ const int64_t ib = 0;
11568
+
11313
11569
  // simplified from `(ib * n_dims + ic) * inv_ndims`
11314
11570
  float cur_rot = inv_ndims * ic - ib;
11315
11571
 
@@ -11332,6 +11588,14 @@ static void ggml_compute_forward_rope_f32(
11332
11588
 
11333
11589
  dst_data[0] = x0*cos_theta - x1*sin_theta;
11334
11590
  dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
11591
+ } else {
11592
+ const int64_t i0 = ic;
11593
+
11594
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11595
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11596
+
11597
+ dst_data[0] = src[0];
11598
+ dst_data[1] = src[1];
11335
11599
  }
11336
11600
  }
11337
11601
  }
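In the neox path the rotation is now applied only to the first n_dims elements of each row; the remaining ne0 - n_dims elements are copied through untouched. Indexing sketch for one row, with the per-pair angles precomputed for clarity (in the real kernel they come from the YaRN helpers):

    #include <math.h>
    #include <stdint.h>

    // Rotate pairs (i0, i0 + n_dims/2) inside the first n_dims elements by theta[i0],
    // and copy the tail beyond n_dims unchanged.
    static void rope_neox_row_ref(int64_t ne0, int64_t n_dims,
                                  const float * theta /* [n_dims/2] */,
                                  const float * x, float * y) {
        for (int64_t ic = 0; ic < ne0; ic += 2) {
            if (ic < n_dims) {
                const int64_t i0 = ic/2;
                const float x0 = x[i0];
                const float x1 = x[i0 + n_dims/2];
                y[i0]            = x0*cosf(theta[i0]) - x1*sinf(theta[i0]);
                y[i0 + n_dims/2] = x0*sinf(theta[i0]) + x1*cosf(theta[i0]);
            } else {
                y[ic]     = x[ic];
                y[ic + 1] = x[ic + 1];
            }
        }
    }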
@@ -11459,10 +11723,13 @@ static void ggml_compute_forward_rope_f16(
11459
11723
  }
11460
11724
  } else {
11461
11725
  // TODO: this might be wrong for ne0 != n_dims - need double check
11462
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
11726
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
11727
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
11463
11728
  theta_base *= freq_scale;
11464
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11465
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11729
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
11730
+ if (ic < n_dims) {
11731
+ const int64_t ib = 0;
11732
+
11466
11733
  // simplified from `(ib * n_dims + ic) * inv_ndims`
11467
11734
  float cur_rot = inv_ndims * ic - ib;
11468
11735
 
@@ -11485,6 +11752,14 @@ static void ggml_compute_forward_rope_f16(
11485
11752
 
11486
11753
  dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
11487
11754
  dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
11755
+ } else {
11756
+ const int64_t i0 = ic;
11757
+
11758
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11759
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11760
+
11761
+ dst_data[0] = src[0];
11762
+ dst_data[1] = src[1];
11488
11763
  }
11489
11764
  }
11490
11765
  }
@@ -12114,6 +12389,7 @@ static void ggml_compute_forward_upscale_f32(
12114
12389
  GGML_ASSERT(src0->nb[0] == sizeof(float));
12115
12390
 
12116
12391
  const int ith = params->ith;
12392
+ const int nth = params->nth;
12117
12393
 
12118
12394
  GGML_TENSOR_UNARY_OP_LOCALS
12119
12395
 
@@ -12121,16 +12397,17 @@ static void ggml_compute_forward_upscale_f32(
12121
12397
 
12122
12398
  // TODO: optimize
12123
12399
 
12124
- for (int i03 = 0; i03 < ne03; i03++) {
12125
- for (int i02 = ith; i02 < ne02; i02++) {
12126
- for (int m = 0; m < dst->ne[1]; m++) {
12127
- int i01 = m / scale_factor;
12128
- for (int n = 0; n < dst->ne[0]; n++) {
12129
- int i00 = n / scale_factor;
12130
-
12131
- const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
12400
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
12401
+ const int64_t i03 = i3;
12402
+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
12403
+ const int64_t i02 = i2;
12404
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
12405
+ const int64_t i01 = i1 / scale_factor;
12406
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
12407
+ const int64_t i00 = i0 / scale_factor;
12132
12408
 
12133
- float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
12409
+ const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
12410
+ float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
12134
12411
 
12135
12412
  *y = *x;
12136
12413
  }
@@ -12155,6 +12432,64 @@ static void ggml_compute_forward_upscale(
12155
12432
  }
12156
12433
  }
12157
12434
 
12435
+ // ggml_compute_forward_pad
12436
+
12437
+ static void ggml_compute_forward_pad_f32(
12438
+ const struct ggml_compute_params * params,
12439
+ const struct ggml_tensor * src0,
12440
+ struct ggml_tensor * dst) {
12441
+
12442
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12443
+ return;
12444
+ }
12445
+
12446
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
12447
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
12448
+
12449
+ const int ith = params->ith;
12450
+ const int nth = params->nth;
12451
+
12452
+ GGML_TENSOR_UNARY_OP_LOCALS
12453
+
12454
+ float * dst_ptr = (float *) dst->data;
12455
+
12456
+ // TODO: optimize
12457
+
12458
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
12459
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
12460
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
12461
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
12462
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
12463
+
12464
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12465
+
12466
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
12467
+ dst_ptr[dst_idx] = *src_ptr;
12468
+ } else {
12469
+ dst_ptr[dst_idx] = 0;
12470
+ }
12471
+ }
12472
+ }
12473
+ }
12474
+ }
12475
+ }
12476
+
12477
+ static void ggml_compute_forward_pad(
12478
+ const struct ggml_compute_params * params,
12479
+ const struct ggml_tensor * src0,
12480
+ struct ggml_tensor * dst) {
12481
+ switch (src0->type) {
12482
+ case GGML_TYPE_F32:
12483
+ {
12484
+ ggml_compute_forward_pad_f32(params, src0, dst);
12485
+ } break;
12486
+ default:
12487
+ {
12488
+ GGML_ASSERT(false);
12489
+ } break;
12490
+ }
12491
+ }
12492
+
12158
12493
  // ggml_compute_forward_argsort
12159
12494
 
12160
12495
  static void ggml_compute_forward_argsort_f32(
@@ -13362,10 +13697,6 @@ static void ggml_compute_forward_unary(
13362
13697
  {
13363
13698
  ggml_compute_forward_silu(params, src0, dst);
13364
13699
  } break;
13365
- case GGML_UNARY_OP_LEAKY:
13366
- {
13367
- ggml_compute_forward_leaky(params, src0, dst);
13368
- } break;
13369
13700
  default:
13370
13701
  {
13371
13702
  GGML_ASSERT(false);
@@ -14041,7 +14372,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14041
14372
  } break;
14042
14373
  case GGML_OP_MUL_MAT_ID:
14043
14374
  {
14044
- ggml_compute_forward_mul_mat_id(params, tensor);
14375
+ ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
14045
14376
  } break;
14046
14377
  case GGML_OP_OUT_PROD:
14047
14378
  {
@@ -14049,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14049
14380
  } break;
14050
14381
  case GGML_OP_SCALE:
14051
14382
  {
14052
- ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
14383
+ ggml_compute_forward_scale(params, tensor->src[0], tensor);
14053
14384
  } break;
14054
14385
  case GGML_OP_SET:
14055
14386
  {
@@ -14147,10 +14478,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14147
14478
  {
14148
14479
  ggml_compute_forward_upscale(params, tensor->src[0], tensor);
14149
14480
  } break;
14481
+ case GGML_OP_PAD:
14482
+ {
14483
+ ggml_compute_forward_pad(params, tensor->src[0], tensor);
14484
+ } break;
14150
14485
  case GGML_OP_ARGSORT:
14151
14486
  {
14152
14487
  ggml_compute_forward_argsort(params, tensor->src[0], tensor);
14153
14488
  } break;
14489
+ case GGML_OP_LEAKY_RELU:
14490
+ {
14491
+ ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
14492
+ } break;
14154
14493
  case GGML_OP_FLASH_ATTN:
14155
14494
  {
14156
14495
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14405,7 +14744,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14405
14744
  return replacements->vals[i];
14406
14745
  }
14407
14746
 
14408
- struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14747
+ struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
14409
14748
 
14410
14749
  // insert clone into replacements
14411
14750
  GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@@ -14475,7 +14814,7 @@ void ggml_build_backward_gradient_checkpointing(
14475
14814
  // insert new tensors recomputing src, reusing already made replacements,
14476
14815
  // remember replacements: remember new tensors with mapping from corresponding gf nodes
14477
14816
  // recurse for input tensors,
14478
- // unless (i.e. terminating when) input tensors are replacments (like checkpoints)
14817
+ // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
14479
14818
  node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
14480
14819
  }
14481
14820
  // insert rewritten backward node with replacements made into resulting backward graph gb
@@ -14497,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
14497
14836
 
14498
14837
  static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14499
14838
  if (ggml_hash_contains(zero_table, a)) {
14500
- struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
14839
+ struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
14501
14840
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
14502
14841
  } else {
14503
14842
  return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14633,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14633
14972
  src0->grad,
14634
14973
  ggml_scale(ctx,
14635
14974
  ggml_mul(ctx, src0, tensor->grad),
14636
- ggml_new_f32(ctx, 2.0f)),
14975
+ 2.0f),
14637
14976
  zero_table);
14638
14977
  }
14639
14978
  } break;
@@ -14647,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14647
14986
  ggml_div(ctx,
14648
14987
  tensor->grad,
14649
14988
  tensor),
14650
- ggml_new_f32(ctx, 0.5f)),
14989
+ 0.5f),
14651
14990
  zero_table);
14652
14991
  }
14653
14992
  } break;
@@ -14813,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14813
15152
  {
14814
15153
  // necessary for llama
14815
15154
  if (src0->grad) {
15155
+ const float s = ((float *) tensor->op_params)[0];
15156
+
14816
15157
  src0->grad =
14817
15158
  ggml_add_or_set(ctx,
14818
15159
  src0->grad,
14819
- ggml_scale_impl(ctx, tensor->grad, src1, false),
14820
- zero_table);
14821
- }
14822
- if (src1->grad) {
14823
- src1->grad =
14824
- ggml_add_or_set(ctx,
14825
- src1->grad,
14826
- ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
15160
+ ggml_scale_impl(ctx, tensor->grad, s, false),
14827
15161
  zero_table);
14828
15162
  }
14829
15163
  } break;
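
Call sites that used to build a one-element tensor for the scale factor now pass a plain float, and the backward pass recovers the factor from op_params, which also removes the old src1 gradient branch. A hedged before/after sketch of a typical call site (the attention-scaling framing and the names kq and n_embd_head are illustrative, not taken from this file):

    #include "ggml.h"
    #include <math.h>

    // illustrative helper: scale a score matrix by 1/sqrt(head size)
    static struct ggml_tensor * scale_scores(struct ggml_context * ctx,
                                             struct ggml_tensor  * kq,
                                             int                   n_embd_head) {
        // 0.10.0 shape of this call:
        //   return ggml_scale(ctx, kq, ggml_new_f32(ctx, 1.0f/sqrtf((float) n_embd_head)));
        // 0.10.2: the factor is passed by value and stored in the op's parameters
        return ggml_scale(ctx, kq, 1.0f/sqrtf((float) n_embd_head));
    }
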
@@ -15001,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15001
15335
  const int n_past = ((int32_t *) tensor->op_params)[0];
15002
15336
  src0->grad =
15003
15337
  ggml_add_or_set(ctx, src0->grad,
15338
+ /* ggml_diag_mask_inf_impl() shouldn't be here */
15339
+ /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
15004
15340
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15005
15341
  zero_table);
15006
15342
  }
@@ -15143,10 +15479,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15143
15479
  {
15144
15480
  GGML_ASSERT(false); // TODO: not implemented
15145
15481
  } break;
15482
+ case GGML_OP_PAD:
15483
+ {
15484
+ GGML_ASSERT(false); // TODO: not implemented
15485
+ } break;
15146
15486
  case GGML_OP_ARGSORT:
15147
15487
  {
15148
15488
  GGML_ASSERT(false); // TODO: not implemented
15149
15489
  } break;
15490
+ case GGML_OP_LEAKY_RELU:
15491
+ {
15492
+ GGML_ASSERT(false); // TODO: not implemented
15493
+ } break;
15150
15494
  case GGML_OP_FLASH_ATTN:
15151
15495
  {
15152
15496
  struct ggml_tensor * flash_grad = NULL;
@@ -15752,6 +16096,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15752
16096
  case GGML_OP_ARGMAX:
15753
16097
  case GGML_OP_REPEAT:
15754
16098
  case GGML_OP_REPEAT_BACK:
16099
+ case GGML_OP_LEAKY_RELU:
15755
16100
  {
15756
16101
  n_tasks = 1;
15757
16102
  } break;
@@ -15764,7 +16109,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15764
16109
  case GGML_UNARY_OP_TANH:
15765
16110
  case GGML_UNARY_OP_ELU:
15766
16111
  case GGML_UNARY_OP_RELU:
15767
- case GGML_UNARY_OP_LEAKY:
15768
16112
  {
15769
16113
  n_tasks = 1;
15770
16114
  } break;
@@ -15821,7 +16165,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15821
16165
  } break;
15822
16166
  case GGML_OP_MUL_MAT_ID:
15823
16167
  {
15824
- // FIXME: blas
15825
16168
  n_tasks = n_threads;
15826
16169
  } break;
15827
16170
  case GGML_OP_OUT_PROD:
@@ -15883,6 +16226,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15883
16226
  {
15884
16227
  n_tasks = n_threads;
15885
16228
  } break;
16229
+ case GGML_OP_PAD:
16230
+ {
16231
+ n_tasks = n_threads;
16232
+ } break;
15886
16233
  case GGML_OP_ARGSORT:
15887
16234
  {
15888
16235
  n_tasks = n_threads;
@@ -16146,25 +16493,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16146
16493
  } else
16147
16494
  #endif
16148
16495
  if (node->src[1]->type != vec_dot_type) {
16149
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16496
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
16150
16497
  }
16151
16498
  } break;
16152
16499
  case GGML_OP_MUL_MAT_ID:
16153
16500
  {
16154
- const struct ggml_tensor * a = node->src[2];
16155
- const struct ggml_tensor * b = node->src[1];
16156
- const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
16157
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16158
- if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
16159
- if (a->type != GGML_TYPE_F32) {
16160
- // here we need memory just for single 2D matrix from src0
16161
- cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
16162
- }
16163
- } else
16164
- #endif
16165
- if (b->type != vec_dot_type) {
16166
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
16501
+ const struct ggml_tensor * src0 = node->src[2];
16502
+ const struct ggml_tensor * src1 = node->src[1];
16503
+ const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
16504
+ if (src1->type != vec_dot_type) {
16505
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
16167
16506
  }
16507
+ const int n_as = ggml_get_op_params_i32(node, 1);
16508
+ cur = GGML_PAD(cur, sizeof(int64_t)); // align
16509
+ cur += n_as * sizeof(int64_t); // matrix_row_counts
16510
+ cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
16168
16511
  } break;
16169
16512
  case GGML_OP_OUT_PROD:
16170
16513
  {
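
ggml_graph_plan now sizes the MUL_MAT_ID work buffer without the BLAS special case: an optional conversion buffer when src1 has to be converted to vec_dot_type (ggml_row_size replaces the hand-written type_size*nelements/blck_size expression), plus int64_t bookkeeping for matrix_row_counts and matrix_rows. A sketch of that arithmetic as a standalone helper, with an illustrative worked example in the trailing comment:

    #include "ggml.h"

    // illustrative work-size computation for a MUL_MAT_ID node
    static size_t mul_mat_id_wsize(const struct ggml_tensor * src1, enum ggml_type vec_dot_type, int n_as) {
        size_t cur = 0;
        if (src1->type != vec_dot_type) {
            // same quantity the old code spelled as type_size*nelements/blck_size
            cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
        }
        cur  = GGML_PAD(cur, sizeof(int64_t));       // align for the int64_t bookkeeping
        cur += n_as * sizeof(int64_t);               // matrix_row_counts
        cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows: n_as entries per src1 row
        return cur;
    }
    // e.g. src1 = F32 [4096, 32], vec_dot_type = Q8_0 (34-byte blocks of 32 values), n_as = 8:
    // 34*131072/32 = 139264, plus 8*8 + 8*32*8 bookkeeping = 141376 bytes
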
@@ -16394,7 +16737,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
16394
16737
  fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
16395
16738
  ggml_type_name(tensor->type),
16396
16739
  ggml_op_name (tensor->op),
16397
- tensor->n_dims,
16740
+ ggml_n_dims(tensor),
16398
16741
  ne[0], ne[1], ne[2], ne[3],
16399
16742
  nb[0], nb[1], nb[2], nb[3],
16400
16743
  tensor->data,
@@ -16409,7 +16752,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16409
16752
  arg,
16410
16753
  ggml_type_name(tensor->type),
16411
16754
  ggml_op_name (tensor->op),
16412
- tensor->n_dims,
16755
+ ggml_n_dims(tensor),
16413
16756
  ne[0], ne[1], ne[2], ne[3],
16414
16757
  nb[0], nb[1], nb[2], nb[3],
16415
16758
  tensor->data,
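
With the n_dims field gone from ggml_tensor, the exporters derive the effective rank on demand via ggml_n_dims. A sketch of the behaviour that helper is expected to have, counting down from GGML_MAX_DIMS to the last dimension larger than 1 (an assumption, not copied from this file):

    #include "ggml.h"

    // ne = {4096, 32, 1, 1} -> 2,  ne = {7, 1, 1, 1} -> 1,  ne = {2, 3, 4, 5} -> 4
    static int n_dims_sketch(const struct ggml_tensor * t) {
        for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
            if (t->ne[i] > 1) {
                return i + 1;
            }
        }
        return 1;
    }
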
@@ -16499,11 +16842,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16499
16842
 
16500
16843
  const uint32_t type = tensor->type;
16501
16844
  const uint32_t op = tensor->op;
16502
- const uint32_t n_dims = tensor->n_dims;
16503
16845
 
16504
16846
  fwrite(&type, sizeof(uint32_t), 1, fout);
16505
16847
  fwrite(&op, sizeof(uint32_t), 1, fout);
16506
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16507
16848
 
16508
16849
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16509
16850
  const uint64_t ne = tensor->ne[j];
@@ -16533,11 +16874,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16533
16874
 
16534
16875
  const uint32_t type = tensor->type;
16535
16876
  const uint32_t op = tensor->op;
16536
- const uint32_t n_dims = tensor->n_dims;
16537
16877
 
16538
16878
  fwrite(&type, sizeof(uint32_t), 1, fout);
16539
16879
  fwrite(&op, sizeof(uint32_t), 1, fout);
16540
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16541
16880
 
16542
16881
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16543
16882
  const uint64_t ne = tensor->ne[j];
@@ -16709,12 +17048,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16709
17048
  {
16710
17049
  uint32_t type;
16711
17050
  uint32_t op;
16712
- uint32_t n_dims;
16713
17051
 
16714
17052
  for (uint32_t i = 0; i < n_leafs; ++i) {
16715
17053
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16716
17054
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16717
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16718
17055
 
16719
17056
  int64_t ne[GGML_MAX_DIMS];
16720
17057
  size_t nb[GGML_MAX_DIMS];
@@ -16730,7 +17067,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16730
17067
  nb[j] = nb_cur;
16731
17068
  }
16732
17069
 
16733
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
17070
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16734
17071
 
16735
17072
  tensor->op = (enum ggml_op) op;
16736
17073
 
@@ -16747,7 +17084,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16747
17084
 
16748
17085
  ptr += ggml_nbytes(tensor);
16749
17086
 
16750
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17087
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16751
17088
  }
16752
17089
  }
16753
17090
 
@@ -16757,12 +17094,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16757
17094
  {
16758
17095
  uint32_t type;
16759
17096
  uint32_t op;
16760
- uint32_t n_dims;
16761
17097
 
16762
17098
  for (uint32_t i = 0; i < n_nodes; ++i) {
16763
17099
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16764
17100
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16765
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16766
17101
 
16767
17102
  enum ggml_op eop = (enum ggml_op) op;
16768
17103
 
@@ -16833,7 +17168,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16833
17168
  } break;
16834
17169
  default:
16835
17170
  {
16836
- tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
17171
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16837
17172
 
16838
17173
  tensor->op = eop;
16839
17174
  } break;
@@ -16852,7 +17187,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16852
17187
 
16853
17188
  result->nodes[i] = tensor;
16854
17189
 
16855
- fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17190
+ fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16856
17191
  }
16857
17192
  }
16858
17193
  }
@@ -16990,7 +17325,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16990
17325
  fprintf(fp, "(%s)|", ggml_type_name(node->type));
16991
17326
  }
16992
17327
 
16993
- if (node->n_dims == 2) {
17328
+ if (ggml_is_matrix(node)) {
16994
17329
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
16995
17330
  } else {
16996
17331
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
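
The DOT dumper likewise stops reading n_dims and asks ggml_is_matrix, which should amount to checking that the two trailing dimensions are 1. A one-line sketch of that predicate (again an assumption, not copied from this file):

    #include "ggml.h"

    static bool is_matrix_sketch(const struct ggml_tensor * t) {
        return t->ne[2] == 1 && t->ne[3] == 1; // i.e. shape [rows, cols, 1, 1]
    }
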
@@ -17257,7 +17592,7 @@ static enum ggml_opt_result ggml_opt_adam(
17257
17592
  int64_t i = 0;
17258
17593
  for (int p = 0; p < np; ++p) {
17259
17594
  const int64_t ne = ggml_nelements(ps[p]);
17260
- const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
17595
+ const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
17261
17596
  for (int64_t j = 0; j < ne; ++j) {
17262
17597
  float x = ggml_get_f32_1d(ps[p], j);
17263
17598
  float g_ = g[i]*gnorm;
@@ -18531,7 +18866,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18531
18866
  return NULL;
18532
18867
  }
18533
18868
 
18534
- const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
18869
+ const size_t size_cur = ggml_row_size(info->type, ne);
18535
18870
 
18536
18871
  ctx->size += GGML_PAD(size_cur, ctx->alignment);
18537
18872
  }
@@ -18860,6 +19195,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
18860
19195
  return ctx->infos[i].name.data;
18861
19196
  }
18862
19197
 
19198
+ enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
19199
+ return ctx->infos[i].type;
19200
+ }
19201
+
18863
19202
  // returns the index
18864
19203
  static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
18865
19204
  const int idx = gguf_find_key(ctx, key);
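
The new gguf_get_tensor_type accessor exposes a tensor's stored type next to the existing gguf_get_tensor_name. A minimal usage sketch, assuming the existing gguf_init_from_file / gguf_get_n_tensors / gguf_free API; "model.gguf" is a placeholder path:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_context * meta = NULL;
        struct gguf_init_params ip = { /*.no_alloc =*/ true, /*.ctx =*/ &meta };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", ip); // placeholder path
        if (!ctx) {
            return 1;
        }

        for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
            printf("%-40s %s\n",
                gguf_get_tensor_name(ctx, i),
                ggml_type_name(gguf_get_tensor_type(ctx, i))); // accessor added in this release
        }

        gguf_free(ctx);
        ggml_free(meta);
        return 0;
    }
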
@@ -19035,8 +19374,8 @@ void gguf_add_tensor(
19035
19374
  ctx->infos[idx].ne[i] = 1;
19036
19375
  }
19037
19376
 
19038
- ctx->infos[idx].n_dims = tensor->n_dims;
19039
- for (int i = 0; i < tensor->n_dims; i++) {
19377
+ ctx->infos[idx].n_dims = ggml_n_dims(tensor);
19378
+ for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
19040
19379
  ctx->infos[idx].ne[i] = tensor->ne[i];
19041
19380
  }
19042
19381