llama_cpp 0.9.5 → 0.10.1 (diff of the vendored ggml.c)

@@ -1,4 +1,4 @@
1
- #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
1
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
2
2
  #define _USE_MATH_DEFINES // For M_PI on MSVC
3
3
 
4
4
  #include "ggml-impl.h"
@@ -33,7 +33,7 @@
33
33
  // we should just be careful :)
34
34
  #pragma warning(disable: 4244 4267)
35
35
 
36
- // disable POSIX deprecation warnigns
36
+ // disable POSIX deprecation warnings
37
37
  // these functions are never going away, anyway
38
38
  #pragma warning(disable: 4996)
39
39
  #endif
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
233
233
  #define UNUSED GGML_UNUSED
234
234
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
235
235
 
236
- //
237
- // tensor access macros
238
- //
239
-
240
- #define GGML_TENSOR_UNARY_OP_LOCALS \
241
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
242
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
243
- GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
244
- GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
245
-
246
- #define GGML_TENSOR_BINARY_OP_LOCALS \
247
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
248
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
249
- GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
250
- GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
251
- GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
252
- GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
253
-
254
236
  #if defined(GGML_USE_ACCELERATE)
255
237
  #include <Accelerate/Accelerate.h>
256
238
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1413,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1413
1395
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1414
1396
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1415
1397
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1416
- inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
1398
+ inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
1417
1399
 
1418
1400
  static const float GELU_COEF_A = 0.044715f;
1419
1401
  static const float GELU_QUICK_COEF = -1.702f;
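Note on the reworked kernel above: ggml_vec_leaky_f32 hard-coded a 0.1 slope, while ggml_vec_leaky_relu_f32 takes the slope ns explicitly. A standalone sketch (leaky_ref/leaky_new are illustrative names, not library code) checking that ((x>0)?x:0) + ns*((x<0)?x:0) matches the usual leaky-ReLU definition:

    #include <assert.h>

    // Illustrative check only: the new formulation equals x > 0 ? x : ns*x.
    static float leaky_ref(float x, float ns) { return x > 0.0f ? x : ns*x; }
    static float leaky_new(float x, float ns) {
        return ((x > 0.0f) ? x : 0.0f) + ns*((x < 0.0f) ? x : 0.0f);
    }

    int main(void) {
        const float xs[] = { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f };
        for (int i = 0; i < 5; ++i) {
            assert(leaky_ref(xs[i], 0.1f) == leaky_new(xs[i], 0.1f));
        }
        return 0;
    }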
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1613
1595
  "GROUP_NORM",
1614
1596
 
1615
1597
  "MUL_MAT",
1598
+ "MUL_MAT_ID",
1616
1599
  "OUT_PROD",
1617
1600
 
1618
1601
  "SCALE",
@@ -1640,6 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1640
1623
  "POOL_1D",
1641
1624
  "POOL_2D",
1642
1625
  "UPSCALE",
1626
+ "PAD",
1627
+ "ARGSORT",
1628
+ "LEAKY_RELU",
1643
1629
 
1644
1630
  "FLASH_ATTN",
1645
1631
  "FLASH_FF",
@@ -1666,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1666
1652
  "CROSS_ENTROPY_LOSS_BACK",
1667
1653
  };
1668
1654
 
1669
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1655
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1670
1656
 
1671
1657
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1672
1658
  "none",
@@ -1695,6 +1681,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1695
1681
  "group_norm(x)",
1696
1682
 
1697
1683
  "X*Y",
1684
+ "X[i]*Y",
1698
1685
  "X*Y",
1699
1686
 
1700
1687
  "x*v",
@@ -1722,6 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1722
1709
  "pool_1d(x)",
1723
1710
  "pool_2d(x)",
1724
1711
  "upscale(x)",
1712
+ "pad(x)",
1713
+ "argsort(x)",
1714
+ "leaky_relu(x)",
1725
1715
 
1726
1716
  "flash_attn(x)",
1727
1717
  "flash_ff(x)",
@@ -1748,15 +1738,32 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1748
1738
  "cross_entropy_loss_back(x,y)",
1749
1739
  };
1750
1740
 
1751
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1741
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1752
1742
 
1753
1743
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1754
1744
 
1745
+
1746
+ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1747
+ "ABS",
1748
+ "SGN",
1749
+ "NEG",
1750
+ "STEP",
1751
+ "TANH",
1752
+ "ELU",
1753
+ "RELU",
1754
+ "GELU",
1755
+ "GELU_QUICK",
1756
+ "SILU",
1757
+ };
1758
+
1759
+ static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
1760
+
1761
+
1755
1762
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
1756
1763
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
1757
1764
 
1758
1765
  // WARN:
1759
- // Mis-confguration can lead to problem that's hard to reason about:
1766
+ // Mis-configuration can lead to problem that's hard to reason about:
1760
1767
  // * At best it crash or talks nosense.
1761
1768
  // * At worst it talks slightly difference but hard to perceive.
1762
1769
  //
@@ -1771,6 +1778,7 @@ static void ggml_setup_op_has_task_pass(void) {
1771
1778
 
1772
1779
  p[GGML_OP_ACC ] = true;
1773
1780
  p[GGML_OP_MUL_MAT ] = true;
1781
+ p[GGML_OP_MUL_MAT_ID ] = true;
1774
1782
  p[GGML_OP_OUT_PROD ] = true;
1775
1783
  p[GGML_OP_SET ] = true;
1776
1784
  p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -1989,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
1989
1997
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
1990
1998
  }
1991
1999
 
1992
- size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
1993
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1994
-
1995
- return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
1996
- }
1997
-
1998
2000
  int ggml_blck_size(enum ggml_type type) {
1999
2001
  return type_traits[type].blck_size;
2000
2002
  }
@@ -2003,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
2003
2005
  return type_traits[type].type_size;
2004
2006
  }
2005
2007
 
2006
- float ggml_type_sizef(enum ggml_type type) {
2007
- return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
2008
+ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2009
+ assert(ne % ggml_blck_size(type) == 0);
2010
+ return ggml_type_size(type)*ne/ggml_blck_size(type);
2011
+ }
2012
+
2013
+ double ggml_type_sizef(enum ggml_type type) {
2014
+ return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
2008
2015
  }
2009
2016
 
2010
2017
  const char * ggml_type_name(enum ggml_type type) {
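Note: ggml_type_sizef now returns a double and is effectively superseded for buffer sizing by the new ggml_row_size, which gives the exact byte count of a row of ne elements and asserts that ne is a multiple of the block size. A minimal sketch, assuming a Q4_0-like layout of 32-element, 18-byte blocks for the example numbers:

    #include "ggml.h"

    // ggml_row_size(type, ne) == ggml_type_size(type) * ne / ggml_blck_size(type).
    // With 18-byte blocks of 32 elements (assumed figures), a 4096-element row
    // occupies (4096/32)*18 = 2304 bytes, with no floating-point rounding involved.
    size_t row_bytes(enum ggml_type type, int64_t ne) {
        return ggml_row_size(type, ne); // replaces (size_t)(ggml_type_sizef(type)*ne)
    }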
@@ -2023,28 +2030,55 @@ const char * ggml_op_symbol(enum ggml_op op) {
2023
2030
  return GGML_OP_SYMBOL[op];
2024
2031
  }
2025
2032
 
2033
+ const char * ggml_unary_op_name(enum ggml_unary_op op) {
2034
+ return GGML_UNARY_OP_NAME[op];
2035
+ }
2036
+
2037
+ const char * ggml_op_desc(const struct ggml_tensor * t) {
2038
+ if (t->op == GGML_OP_UNARY) {
2039
+ enum ggml_unary_op uop = ggml_get_unary_op(t);
2040
+ return ggml_unary_op_name(uop);
2041
+ }
2042
+ else {
2043
+ return ggml_op_name(t->op);
2044
+ }
2045
+ }
2046
+
2026
2047
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
2027
2048
  return ggml_type_size(tensor->type);
2028
2049
  }
2029
2050
 
2030
- static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2051
+ bool ggml_is_scalar(const struct ggml_tensor * tensor) {
2031
2052
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2032
2053
 
2033
2054
  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2034
2055
  }
2035
2056
 
2036
- static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
2057
+ bool ggml_is_vector(const struct ggml_tensor * tensor) {
2037
2058
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2038
2059
 
2039
2060
  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
2040
2061
  }
2041
2062
 
2042
- static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2063
+ bool ggml_is_matrix(const struct ggml_tensor * tensor) {
2043
2064
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2044
2065
 
2045
2066
  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
2046
2067
  }
2047
2068
 
2069
+ bool ggml_is_3d(const struct ggml_tensor * tensor) {
2070
+ return tensor->ne[3] == 1;
2071
+ }
2072
+
2073
+ int ggml_n_dims(const struct ggml_tensor * tensor) {
2074
+ for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
2075
+ if (tensor->ne[i] > 1) {
2076
+ return i + 1;
2077
+ }
2078
+ }
2079
+ return 1;
2080
+ }
2081
+
2048
2082
  static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
2049
2083
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2050
2084
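Note: with the n_dims field removed from ggml_tensor elsewhere in this release, dimensionality is now derived from ne[], ignoring trailing dimensions of size 1, and ggml_op_desc reports the unary sub-op name for GGML_OP_UNARY nodes. A small standalone sketch of the ggml_n_dims convention (illustrative only):

    #include <stdint.h>

    // Mirrors the new ggml_n_dims: ne = {4096, 32, 1, 1} -> 2,
    // ne = {7, 1, 1, 1} -> 1, and a scalar {1, 1, 1, 1} also reports 1.
    static int n_dims_of(const int64_t ne[4]) {
        for (int i = 4 - 1; i >= 1; --i) {
            if (ne[i] > 1) {
                return i + 1;
            }
        }
        return 1;
    }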
 
@@ -2451,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2451
2485
  view_src = view_src->view_src;
2452
2486
  }
2453
2487
 
2454
- size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
2488
+ size_t data_size = ggml_row_size(type, ne[0]);
2455
2489
  for (int i = 1; i < n_dims; i++) {
2456
2490
  data_size *= ne[i];
2457
2491
  }
@@ -2494,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2494
2528
  /*.type =*/ type,
2495
2529
  /*.backend =*/ GGML_BACKEND_CPU,
2496
2530
  /*.buffer =*/ NULL,
2497
- /*.n_dims =*/ n_dims,
2498
2531
  /*.ne =*/ { 1, 1, 1, 1 },
2499
2532
  /*.nb =*/ { 0, 0, 0, 0 },
2500
2533
  /*.op =*/ GGML_OP_NONE,
@@ -2601,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
2601
2634
  }
2602
2635
 
2603
2636
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
2604
- return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
2637
+ return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
2605
2638
  }
2606
2639
 
2607
2640
  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3050,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
3050
3083
  struct ggml_tensor * ggml_view_tensor(
3051
3084
  struct ggml_context * ctx,
3052
3085
  struct ggml_tensor * src) {
3053
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
3086
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
3054
3087
  ggml_format_name(result, "%s (view)", src->name);
3055
3088
 
3056
3089
  for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3154,9 +3187,7 @@ static struct ggml_tensor * ggml_add_impl(
3154
3187
  struct ggml_tensor * a,
3155
3188
  struct ggml_tensor * b,
3156
3189
  bool inplace) {
3157
- // TODO: support less-strict constraint
3158
- // GGML_ASSERT(ggml_can_repeat(b, a));
3159
- GGML_ASSERT(ggml_can_repeat_rows(b, a));
3190
+ GGML_ASSERT(ggml_can_repeat(b, a));
3160
3191
 
3161
3192
  bool is_node = false;
3162
3193
 
@@ -3210,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
3210
3241
  is_node = true;
3211
3242
  }
3212
3243
 
3213
- struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
3244
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
3214
3245
 
3215
3246
  result->op = GGML_OP_ADD;
3216
- result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
3247
+ result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
3217
3248
  result->src[0] = a;
3218
3249
  result->src[1] = b;
3219
3250
 
@@ -3371,9 +3402,7 @@ static struct ggml_tensor * ggml_mul_impl(
3371
3402
  struct ggml_tensor * a,
3372
3403
  struct ggml_tensor * b,
3373
3404
  bool inplace) {
3374
- // TODO: support less-strict constraint
3375
- // GGML_ASSERT(ggml_can_repeat(b, a));
3376
- GGML_ASSERT(ggml_can_repeat_rows(b, a));
3405
+ GGML_ASSERT(ggml_can_repeat(b, a));
3377
3406
 
3378
3407
  bool is_node = false;
3379
3408
 
@@ -3418,7 +3447,7 @@ static struct ggml_tensor * ggml_div_impl(
3418
3447
  struct ggml_tensor * a,
3419
3448
  struct ggml_tensor * b,
3420
3449
  bool inplace) {
3421
- GGML_ASSERT(ggml_are_same_shape(a, b));
3450
+ GGML_ASSERT(ggml_can_repeat(b, a));
3422
3451
 
3423
3452
  bool is_node = false;
3424
3453
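Note: ggml_add, ggml_mul and ggml_div (the hunks above) now only require ggml_can_repeat(b, a), i.e. every dimension of a must be a multiple of the matching dimension of b, so the second operand can also broadcast along ne0. A hedged sketch with illustrative shapes:

    #include "ggml.h"

    // Divide each column of x ([n_embd, n_tokens]) by a per-token scalar
    // s ([1, n_tokens]); the stricter ggml_can_repeat_rows check used to
    // reject this because ne0 of the two operands differed.
    struct ggml_tensor * scale_columns(struct ggml_context * ctx,
                                       struct ggml_tensor * x,   // [n_embd, n_tokens]
                                       struct ggml_tensor * s) { // [1,      n_tokens]
        return ggml_div(ctx, x, s);
    }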
 
@@ -3584,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows(
3584
3613
  is_node = true;
3585
3614
  }
3586
3615
 
3587
- int64_t ne[4] = {1,1,1,1};
3588
- for (int i=1; i<a->n_dims; ++i) {
3616
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
3617
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3589
3618
  ne[i] = a->ne[i];
3590
3619
  }
3591
3620
 
3592
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
3621
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
3593
3622
 
3594
3623
  result->op = GGML_OP_SUM_ROWS;
3595
3624
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3610,8 +3639,8 @@ struct ggml_tensor * ggml_mean(
3610
3639
  is_node = true;
3611
3640
  }
3612
3641
 
3613
- int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3614
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
3642
+ int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
3643
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3615
3644
 
3616
3645
  result->op = GGML_OP_MEAN;
3617
3646
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3633,8 +3662,7 @@ struct ggml_tensor * ggml_argmax(
3633
3662
  is_node = true;
3634
3663
  }
3635
3664
 
3636
- int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
3637
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
3665
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
3638
3666
 
3639
3667
  result->op = GGML_OP_ARGMAX;
3640
3668
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3657,7 +3685,7 @@ struct ggml_tensor * ggml_repeat(
3657
3685
  is_node = true;
3658
3686
  }
3659
3687
 
3660
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3688
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3661
3689
 
3662
3690
  result->op = GGML_OP_REPEAT;
3663
3691
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3684,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back(
3684
3712
  return a;
3685
3713
  }
3686
3714
 
3687
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
3715
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
3688
3716
 
3689
3717
  result->op = GGML_OP_REPEAT_BACK;
3690
3718
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3815,12 +3843,25 @@ struct ggml_tensor * ggml_relu_inplace(
3815
3843
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3816
3844
  }
3817
3845
 
3818
- // ggml_leaky
3846
+ // ggml_leaky_relu
3819
3847
 
3820
- struct ggml_tensor * ggml_leaky(
3848
+ struct ggml_tensor * ggml_leaky_relu(
3821
3849
  struct ggml_context * ctx,
3822
- struct ggml_tensor * a) {
3823
- return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3850
+ struct ggml_tensor * a, float negative_slope, bool inplace) {
3851
+ bool is_node = false;
3852
+
3853
+ if (!inplace && (a->grad)) {
3854
+ is_node = true;
3855
+ }
3856
+
3857
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3858
+ ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
3859
+
3860
+ result->op = GGML_OP_LEAKY_RELU;
3861
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
3862
+ result->src[0] = a;
3863
+
3864
+ return result;
3824
3865
  }
3825
3866
 
3826
3867
  // ggml_gelu
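Note: the fixed-slope GGML_UNARY_OP_LEAKY unary is replaced by a dedicated GGML_OP_LEAKY_RELU op whose slope is stored in op_params. A minimal usage sketch (0.1f reproduces the old behaviour; apply_leaky is an illustrative name):

    #include "ggml.h"

    struct ggml_tensor * apply_leaky(struct ggml_context * ctx, struct ggml_tensor * x) {
        // old: ggml_leaky(ctx, x);                     // slope fixed at 0.1f
        return ggml_leaky_relu(ctx, x, 0.1f, /*inplace =*/ false);
    }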
@@ -4007,8 +4048,9 @@ static struct ggml_tensor * ggml_group_norm_impl(
4007
4048
 
4008
4049
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4009
4050
 
4010
- result->op = GGML_OP_GROUP_NORM;
4011
4051
  result->op_params[0] = n_groups;
4052
+
4053
+ result->op = GGML_OP_GROUP_NORM;
4012
4054
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4013
4055
  result->src[0] = a;
4014
4056
  result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4046,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat(
4046
4088
  }
4047
4089
 
4048
4090
  const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4049
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4091
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4050
4092
 
4051
4093
  result->op = GGML_OP_MUL_MAT;
4052
4094
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4056,6 +4098,51 @@ struct ggml_tensor * ggml_mul_mat(
4056
4098
  return result;
4057
4099
  }
4058
4100
 
4101
+ // ggml_mul_mat_id
4102
+
4103
+ struct ggml_tensor * ggml_mul_mat_id(
4104
+ struct ggml_context * ctx,
4105
+ struct ggml_tensor * const as[],
4106
+ int n_as,
4107
+ struct ggml_tensor * ids,
4108
+ int id,
4109
+ struct ggml_tensor * b) {
4110
+
4111
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
4112
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
4113
+ GGML_ASSERT(ids->ne[1] == b->ne[1]);
4114
+ GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
4115
+ GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
4116
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]);
4117
+
4118
+ bool is_node = false;
4119
+
4120
+ if (as[0]->grad || b->grad) {
4121
+ is_node = true;
4122
+ }
4123
+
4124
+ const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
4125
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4126
+
4127
+ ggml_set_op_params_i32(result, 0, id);
4128
+ ggml_set_op_params_i32(result, 1, n_as);
4129
+
4130
+ result->op = GGML_OP_MUL_MAT_ID;
4131
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4132
+ result->src[0] = ids;
4133
+ result->src[1] = b;
4134
+
4135
+ for (int i = 0; i < n_as; i++) {
4136
+ struct ggml_tensor * a = as[i];
4137
+ GGML_ASSERT(ggml_are_same_shape(as[0], a));
4138
+ GGML_ASSERT(ggml_can_mul_mat(a, b));
4139
+ GGML_ASSERT(!ggml_is_transposed(a));
4140
+ result->src[i + 2] = a;
4141
+ }
4142
+
4143
+ return result;
4144
+ }
4145
+
4059
4146
  // ggml_out_prod
4060
4147
 
4061
4148
  struct ggml_tensor * ggml_out_prod(
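Note: ggml_mul_mat_id is the building block for mixture-of-experts routing: column j of b is multiplied by as[ids[id, j]] rather than by a single shared matrix. A hedged sketch (names and shapes are illustrative; the actual model wiring may differ):

    #include "ggml.h"

    // experts: n_expert weight tensors of identical shape
    // ids:     GGML_TYPE_I32, [n_choices, n_tokens] - per-token expert choices
    // cur:     activations, [n_embd, n_tokens]
    // id = 0 selects each token's first-ranked expert.
    struct ggml_tensor * moe_matmul(struct ggml_context * ctx,
                                    struct ggml_tensor * const experts[],
                                    int                  n_expert,
                                    struct ggml_tensor * ids,
                                    struct ggml_tensor * cur) {
        return ggml_mul_mat_id(ctx, experts, n_expert, ids, /*id =*/ 0, cur);
    }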
@@ -4073,7 +4160,7 @@ struct ggml_tensor * ggml_out_prod(
4073
4160
 
4074
4161
  // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
4075
4162
  const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
4076
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
4163
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4077
4164
 
4078
4165
  result->op = GGML_OP_OUT_PROD;
4079
4166
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4209,7 +4296,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
4209
4296
  struct ggml_tensor * b,
4210
4297
  size_t nb1,
4211
4298
  size_t offset) {
4212
- return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
4299
+ return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
4213
4300
  }
4214
4301
 
4215
4302
  // ggml_cpy
@@ -4358,7 +4445,7 @@ struct ggml_tensor * ggml_reshape(
4358
4445
  //GGML_ASSERT(false);
4359
4446
  }
4360
4447
 
4361
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
4448
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
4362
4449
  ggml_format_name(result, "%s (reshaped)", a->name);
4363
4450
 
4364
4451
  result->op = GGML_OP_RESHAPE;
@@ -4673,7 +4760,9 @@ struct ggml_tensor * ggml_get_rows(
4673
4760
  struct ggml_context * ctx,
4674
4761
  struct ggml_tensor * a,
4675
4762
  struct ggml_tensor * b) {
4676
- GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
4763
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
4764
+ GGML_ASSERT(b->ne[3] == 1);
4765
+ GGML_ASSERT(b->type == GGML_TYPE_I32);
4677
4766
 
4678
4767
  bool is_node = false;
4679
4768
 
@@ -4683,7 +4772,7 @@ struct ggml_tensor * ggml_get_rows(
4683
4772
 
4684
4773
  // TODO: implement non F32 return
4685
4774
  //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
4686
- struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
4775
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
4687
4776
 
4688
4777
  result->op = GGML_OP_GET_ROWS;
4689
4778
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
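Note: ggml_get_rows now accepts a batched source and up to 3-D indices instead of requiring a matrix plus an I32 vector; the result is always F32 with shape [a->ne[0], b->ne[0], b->ne[1], b->ne[2]]. A small sketch with illustrative shapes:

    #include "ggml.h"

    // src: [n_embd, n_rows, n_batch, 1], ids: GGML_TYPE_I32 [n_ids, n_batch, 1, 1]
    // result: F32 [n_embd, n_ids, n_batch, 1]; note src->ne[2] must equal ids->ne[1].
    struct ggml_tensor * gather_rows(struct ggml_context * ctx,
                                     struct ggml_tensor * src,
                                     struct ggml_tensor * ids) {
        return ggml_get_rows(ctx, src, ids);
    }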
@@ -4734,7 +4823,7 @@ struct ggml_tensor * ggml_diag(
4734
4823
  }
4735
4824
 
4736
4825
  const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
4737
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
4826
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
4738
4827
 
4739
4828
  result->op = GGML_OP_DIAG;
4740
4829
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5381,7 +5470,7 @@ struct ggml_tensor * ggml_pool_1d(
5381
5470
  is_node = true;
5382
5471
  }
5383
5472
 
5384
- const int64_t ne[3] = {
5473
+ const int64_t ne[2] = {
5385
5474
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5386
5475
  a->ne[1],
5387
5476
  };
@@ -5461,6 +5550,30 @@ static struct ggml_tensor * ggml_upscale_impl(
5461
5550
  return result;
5462
5551
  }
5463
5552
 
5553
+ struct ggml_tensor * ggml_pad(
5554
+ struct ggml_context * ctx,
5555
+ struct ggml_tensor * a,
5556
+ int p0, int p1, int p2, int p3) {
5557
+ bool is_node = false;
5558
+
5559
+ if (a->grad) {
5560
+ GGML_ASSERT(false); // TODO: implement backward
5561
+ is_node = true;
5562
+ }
5563
+
5564
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
5565
+ a->ne[0] + p0,
5566
+ a->ne[1] + p1,
5567
+ a->ne[2] + p2,
5568
+ a->ne[3] + p3);
5569
+
5570
+ result->op = GGML_OP_PAD;
5571
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5572
+ result->src[0] = a;
5573
+
5574
+ return result;
5575
+ }
5576
+
5464
5577
  struct ggml_tensor * ggml_upscale(
5465
5578
  struct ggml_context * ctx,
5466
5579
  struct ggml_tensor * a,
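Note: the new ggml_pad appends p0..p3 zero-filled elements to the end of each dimension (there is no leading padding). A hedged sketch rounding the first two dimensions up to a multiple (pad_to_multiple is an illustrative helper):

    #include "ggml.h"

    struct ggml_tensor * pad_to_multiple(struct ggml_context * ctx,
                                         struct ggml_tensor * a, int mult) {
        const int p0 = (int)((mult - a->ne[0] % mult) % mult);
        const int p1 = (int)((mult - a->ne[1] % mult) % mult);
        return ggml_pad(ctx, a, p0, p1, 0, 0); // result: [ne0+p0, ne1+p1, ne2, ne3]
    }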
@@ -5468,6 +5581,43 @@ struct ggml_tensor * ggml_upscale(
5468
5581
  return ggml_upscale_impl(ctx, a, scale_factor);
5469
5582
  }
5470
5583
 
5584
+ // ggml_argsort
5585
+
5586
+ struct ggml_tensor * ggml_argsort(
5587
+ struct ggml_context * ctx,
5588
+ struct ggml_tensor * a,
5589
+ enum ggml_sort_order order) {
5590
+ bool is_node = false;
5591
+
5592
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
5593
+
5594
+ ggml_set_op_params_i32(result, 0, (int32_t) order);
5595
+
5596
+ result->op = GGML_OP_ARGSORT;
5597
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5598
+ result->src[0] = a;
5599
+
5600
+ return result;
5601
+ }
5602
+
5603
+ // ggml_top_k
5604
+
5605
+ struct ggml_tensor * ggml_top_k(
5606
+ struct ggml_context * ctx,
5607
+ struct ggml_tensor * a,
5608
+ int k) {
5609
+ GGML_ASSERT(a->ne[0] >= k);
5610
+
5611
+ struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
5612
+
5613
+ result = ggml_view_4d(ctx, result,
5614
+ k, result->ne[1], result->ne[2], result->ne[3],
5615
+ result->nb[1], result->nb[2], result->nb[3],
5616
+ 0);
5617
+
5618
+ return result;
5619
+ }
5620
+
5471
5621
  // ggml_flash_attn
5472
5622
 
5473
5623
  struct ggml_tensor * ggml_flash_attn(
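Note: ggml_argsort returns I32 indices sorting each row of a, and ggml_top_k is a thin wrapper that sorts descending and keeps a view of the first k indices per row. A hedged sketch, e.g. picking the k best-scoring experts per token from router logits of shape [n_expert, n_tokens]:

    #include "ggml.h"

    struct ggml_tensor * pick_top_k(struct ggml_context * ctx,
                                    struct ggml_tensor * logits, int k) {
        return ggml_top_k(ctx, logits, k); // GGML_TYPE_I32, shape [k, n_tokens, 1, 1]
    }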
@@ -5486,7 +5636,7 @@ struct ggml_tensor * ggml_flash_attn(
5486
5636
  }
5487
5637
 
5488
5638
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
5489
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
5639
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
5490
5640
 
5491
5641
  int32_t t = masked ? 1 : 0;
5492
5642
  ggml_set_op_params(result, &t, sizeof(t));
@@ -5519,7 +5669,7 @@ struct ggml_tensor * ggml_flash_ff(
5519
5669
  }
5520
5670
 
5521
5671
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5522
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
5672
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
5523
5673
 
5524
5674
  result->op = GGML_OP_FLASH_FF;
5525
5675
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5635,7 +5785,6 @@ struct ggml_tensor * ggml_win_part(
5635
5785
  const int np = npx*npy;
5636
5786
 
5637
5787
  const int64_t ne[4] = { a->ne[0], w, w, np, };
5638
-
5639
5788
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5640
5789
 
5641
5790
  int32_t params[] = { npx, npy, w };
@@ -6827,7 +6976,7 @@ static void ggml_compute_forward_add_f32(
6827
6976
  const struct ggml_tensor * src0,
6828
6977
  const struct ggml_tensor * src1,
6829
6978
  struct ggml_tensor * dst) {
6830
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
6979
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
6831
6980
 
6832
6981
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6833
6982
  return;
@@ -6860,16 +7009,19 @@ static void ggml_compute_forward_add_f32(
6860
7009
  const int64_t i13 = i03 % ne13;
6861
7010
  const int64_t i12 = i02 % ne12;
6862
7011
  const int64_t i11 = i01 % ne11;
7012
+ const int64_t nr0 = ne00 / ne10;
6863
7013
 
6864
7014
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
6865
7015
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
6866
7016
  float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
6867
7017
 
7018
+ for (int64_t r = 0; r < nr0; ++r) {
6868
7019
  #ifdef GGML_USE_ACCELERATE
6869
- vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
7020
+ vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
6870
7021
  #else
6871
- ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
7022
+ ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
6872
7023
  #endif
7024
+ }
6873
7025
  }
6874
7026
  } else {
6875
7027
  // src1 is not contiguous
@@ -6886,8 +7038,9 @@ static void ggml_compute_forward_add_f32(
6886
7038
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
6887
7039
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
6888
7040
 
6889
- for (int i0 = 0; i0 < ne0; i0++) {
6890
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
7041
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
7042
+ const int64_t i10 = i0 % ne10;
7043
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
6891
7044
 
6892
7045
  dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
6893
7046
  }
@@ -7421,7 +7574,7 @@ static void ggml_compute_forward_acc_f32(
7421
7574
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
7422
7575
 
7423
7576
  // view src0 and dst with these strides and data offset inbytes during acc
7424
- // nb0 is implicitely element_size because src0 and dst are contiguous
7577
+ // nb0 is implicitly element_size because src0 and dst are contiguous
7425
7578
  size_t nb1 = ((int32_t *) dst->op_params)[0];
7426
7579
  size_t nb2 = ((int32_t *) dst->op_params)[1];
7427
7580
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7607,7 +7760,7 @@ static void ggml_compute_forward_mul_f32(
7607
7760
  const struct ggml_tensor * src0,
7608
7761
  const struct ggml_tensor * src1,
7609
7762
  struct ggml_tensor * dst) {
7610
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
7763
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7611
7764
 
7612
7765
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7613
7766
  return;
@@ -7617,6 +7770,8 @@ static void ggml_compute_forward_mul_f32(
7617
7770
 
7618
7771
  #ifdef GGML_USE_CLBLAST
7619
7772
  if (src1->backend == GGML_BACKEND_GPU) {
7773
+ // TODO: OpenCL kernel support full broadcast
7774
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7620
7775
  if (ith == 0) {
7621
7776
  ggml_cl_mul(src0, src1, dst);
7622
7777
  }
@@ -7630,7 +7785,6 @@ static void ggml_compute_forward_mul_f32(
7630
7785
 
7631
7786
  GGML_ASSERT( nb0 == sizeof(float));
7632
7787
  GGML_ASSERT(nb00 == sizeof(float));
7633
- GGML_ASSERT(ne00 == ne10);
7634
7788
 
7635
7789
  if (nb10 == sizeof(float)) {
7636
7790
  for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7642,20 +7796,21 @@ static void ggml_compute_forward_mul_f32(
7642
7796
  const int64_t i13 = i03 % ne13;
7643
7797
  const int64_t i12 = i02 % ne12;
7644
7798
  const int64_t i11 = i01 % ne11;
7799
+ const int64_t nr0 = ne00 / ne10;
7645
7800
 
7646
7801
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7647
7802
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7648
7803
  float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
7649
7804
 
7805
+ for (int64_t r = 0 ; r < nr0; ++r) {
7650
7806
  #ifdef GGML_USE_ACCELERATE
7651
- UNUSED(ggml_vec_mul_f32);
7807
+ UNUSED(ggml_vec_mul_f32);
7652
7808
 
7653
- vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
7809
+ vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
7654
7810
  #else
7655
- ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
7811
+ ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
7656
7812
  #endif
7657
- // }
7658
- // }
7813
+ }
7659
7814
  }
7660
7815
  } else {
7661
7816
  // src1 is not contiguous
@@ -7673,8 +7828,9 @@ static void ggml_compute_forward_mul_f32(
7673
7828
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7674
7829
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7675
7830
 
7676
- for (int64_t i0 = 0; i0 < ne00; i0++) {
7677
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
7831
+ for (int64_t i0 = 0; i0 < ne00; ++i0) {
7832
+ const int64_t i10 = i0 % ne10;
7833
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
7678
7834
 
7679
7835
  dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
7680
7836
  }
@@ -7708,14 +7864,16 @@ static void ggml_compute_forward_div_f32(
7708
7864
  const struct ggml_tensor * src0,
7709
7865
  const struct ggml_tensor * src1,
7710
7866
  struct ggml_tensor * dst) {
7711
- assert(params->ith == 0);
7712
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7867
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7713
7868
 
7714
7869
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7715
7870
  return;
7716
7871
  }
7717
7872
 
7718
- const int nr = ggml_nrows(src0);
7873
+ const int ith = params->ith;
7874
+ const int nth = params->nth;
7875
+
7876
+ const int64_t nr = ggml_nrows(src0);
7719
7877
 
7720
7878
  GGML_TENSOR_BINARY_OP_LOCALS
7721
7879
 
@@ -7723,41 +7881,50 @@ static void ggml_compute_forward_div_f32(
7723
7881
  GGML_ASSERT(nb00 == sizeof(float));
7724
7882
 
7725
7883
  if (nb10 == sizeof(float)) {
7726
- for (int ir = 0; ir < nr; ++ir) {
7727
- // src0, src1 and dst are same shape => same indices
7728
- const int i3 = ir/(ne2*ne1);
7729
- const int i2 = (ir - i3*ne2*ne1)/ne1;
7730
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
7884
+ for (int64_t ir = ith; ir < nr; ir += nth) {
7885
+ // src0 and dst are same shape => same indices
7886
+ const int64_t i03 = ir/(ne02*ne01);
7887
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
7888
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
7889
+
7890
+ const int64_t i13 = i03 % ne13;
7891
+ const int64_t i12 = i02 % ne12;
7892
+ const int64_t i11 = i01 % ne11;
7893
+ const int64_t nr0 = ne00 / ne10;
7894
+
7895
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7896
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7897
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
7731
7898
 
7899
+ for (int64_t r = 0; r < nr0; ++r) {
7732
7900
  #ifdef GGML_USE_ACCELERATE
7733
- UNUSED(ggml_vec_div_f32);
7901
+ UNUSED(ggml_vec_div_f32);
7734
7902
 
7735
- vDSP_vdiv(
7736
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
7737
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
7738
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
7739
- ne0);
7903
+ vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
7740
7904
  #else
7741
- ggml_vec_div_f32(ne0,
7742
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
7743
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
7744
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
7905
+ ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
7745
7906
  #endif
7746
- // }
7747
- // }
7907
+ }
7748
7908
  }
7749
7909
  } else {
7750
7910
  // src1 is not contiguous
7751
- for (int ir = 0; ir < nr; ++ir) {
7752
- // src0, src1 and dst are same shape => same indices
7753
- const int i3 = ir/(ne2*ne1);
7754
- const int i2 = (ir - i3*ne2*ne1)/ne1;
7755
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
7911
+ for (int64_t ir = ith; ir < nr; ir += nth) {
7912
+ // src0 and dst are same shape => same indices
7913
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
7914
+ const int64_t i03 = ir/(ne02*ne01);
7915
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
7916
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
7756
7917
 
7757
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
7758
- float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
7759
- for (int i0 = 0; i0 < ne0; i0++) {
7760
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
7918
+ const int64_t i13 = i03 % ne13;
7919
+ const int64_t i12 = i02 % ne12;
7920
+ const int64_t i11 = i01 % ne11;
7921
+
7922
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7923
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7924
+
7925
+ for (int64_t i0 = 0; i0 < ne00; ++i0) {
7926
+ const int64_t i10 = i0 % ne10;
7927
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
7761
7928
 
7762
7929
  dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
7763
7930
  }
@@ -8203,7 +8370,7 @@ static void ggml_compute_forward_repeat_f16(
8203
8370
  return;
8204
8371
  }
8205
8372
 
8206
- GGML_TENSOR_UNARY_OP_LOCALS;
8373
+ GGML_TENSOR_UNARY_OP_LOCALS
8207
8374
 
8208
8375
  // guaranteed to be an integer due to the check in ggml_can_repeat
8209
8376
  const int nr0 = (int)(ne0/ne00);
@@ -8348,6 +8515,7 @@ static void ggml_compute_forward_concat_f32(
8348
8515
  GGML_ASSERT(src0->nb[0] == sizeof(float));
8349
8516
 
8350
8517
  const int ith = params->ith;
8518
+ const int nth = params->nth;
8351
8519
 
8352
8520
  GGML_TENSOR_BINARY_OP_LOCALS
8353
8521
 
@@ -8357,7 +8525,7 @@ static void ggml_compute_forward_concat_f32(
8357
8525
  GGML_ASSERT(nb10 == sizeof(float));
8358
8526
 
8359
8527
  for (int i3 = 0; i3 < ne3; i3++) {
8360
- for (int i2 = ith; i2 < ne2; i2++) {
8528
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
8361
8529
  if (i2 < ne02) { // src0
8362
8530
  for (int i1 = 0; i1 < ne1; i1++) {
8363
8531
  for (int i0 = 0; i0 < ne0; i0++) {
@@ -8869,10 +9037,9 @@ static void ggml_compute_forward_silu(
8869
9037
  } break;
8870
9038
  }
8871
9039
  }
9040
+ // ggml_compute_forward_leaky_relu
8872
9041
 
8873
- // ggml_compute_forward_leaky
8874
-
8875
- static void ggml_compute_forward_leaky_f32(
9042
+ static void ggml_compute_forward_leaky_relu_f32(
8876
9043
  const struct ggml_compute_params * params,
8877
9044
  const struct ggml_tensor * src0,
8878
9045
  struct ggml_tensor * dst) {
@@ -8886,24 +9053,27 @@ static void ggml_compute_forward_leaky_f32(
8886
9053
  const int n = ggml_nrows(src0);
8887
9054
  const int nc = src0->ne[0];
8888
9055
 
9056
+ float negative_slope;
9057
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
9058
+
8889
9059
  assert(dst->nb[0] == sizeof(float));
8890
9060
  assert(src0->nb[0] == sizeof(float));
8891
9061
 
8892
9062
  for (int i = 0; i < n; i++) {
8893
- ggml_vec_leaky_f32(nc,
9063
+ ggml_vec_leaky_relu_f32(nc,
8894
9064
  (float *) ((char *) dst->data + i*( dst->nb[1])),
8895
- (float *) ((char *) src0->data + i*(src0->nb[1])));
9065
+ (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
8896
9066
  }
8897
9067
  }
8898
9068
 
8899
- static void ggml_compute_forward_leaky(
9069
+ static void ggml_compute_forward_leaky_relu(
8900
9070
  const struct ggml_compute_params * params,
8901
9071
  const struct ggml_tensor * src0,
8902
9072
  struct ggml_tensor * dst) {
8903
9073
  switch (src0->type) {
8904
9074
  case GGML_TYPE_F32:
8905
9075
  {
8906
- ggml_compute_forward_leaky_f32(params, src0, dst);
9076
+ ggml_compute_forward_leaky_relu_f32(params, src0, dst);
8907
9077
  } break;
8908
9078
  default:
8909
9079
  {
@@ -9392,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9392
9562
  const int64_t ne0 = dst->ne[0];
9393
9563
  const int64_t ne1 = dst->ne[1];
9394
9564
 
9565
+ // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
9566
+ // all the experts for each batch element and the processing would become incredibly slow
9395
9567
  // TODO: find the optimal values for these
9396
- if (ggml_is_contiguous(src0) &&
9568
+ if (dst->op != GGML_OP_MUL_MAT_ID &&
9569
+ ggml_is_contiguous(src0) &&
9397
9570
  ggml_is_contiguous(src1) &&
9398
9571
  //src0->type == GGML_TYPE_F32 &&
9399
9572
  src1->type == GGML_TYPE_F32 &&
@@ -9407,11 +9580,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9407
9580
  }
9408
9581
  #endif
9409
9582
 
9583
+ // off1 = offset in i11 and i1
9584
+ // cne1 = ne11 and ne1
9585
+ // in a normal matrix multiplication, off1 = 0 and cne1 = ne1
9586
+ // during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
9410
9587
  static void ggml_compute_forward_mul_mat(
9411
9588
  const struct ggml_compute_params * params,
9412
9589
  const struct ggml_tensor * src0,
9413
9590
  const struct ggml_tensor * src1,
9414
- struct ggml_tensor * dst) {
9591
+ struct ggml_tensor * dst,
9592
+ int64_t off1, int64_t cne1) {
9415
9593
  int64_t t0 = ggml_perf_time_us();
9416
9594
  UNUSED(t0);
9417
9595
 
@@ -9479,10 +9657,9 @@ static void ggml_compute_forward_mul_mat(
9479
9657
  const int64_t i03 = i13/r3;
9480
9658
  const int64_t i02 = i12/r2;
9481
9659
 
9482
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9483
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
9484
-
9485
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9660
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9661
+ const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
9662
+ float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
9486
9663
 
9487
9664
  if (type != GGML_TYPE_F32) {
9488
9665
  float * const wdata = params->wdata;
@@ -9499,10 +9676,10 @@ static void ggml_compute_forward_mul_mat(
9499
9676
  }
9500
9677
 
9501
9678
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9502
- ne11, ne01, ne10,
9503
- 1.0f, y, ne10,
9504
- x, ne00,
9505
- 0.0f, d, ne01);
9679
+ cne1, ne01, ne10,
9680
+ 1.0f, y, ne10,
9681
+ x, ne00,
9682
+ 0.0f, d, ne01);
9506
9683
  }
9507
9684
  }
9508
9685
 
@@ -9515,7 +9692,10 @@ static void ggml_compute_forward_mul_mat(
9515
9692
  if (params->type == GGML_TASK_INIT) {
9516
9693
  if (src1->type != vec_dot_type) {
9517
9694
  char * wdata = params->wdata;
9518
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9695
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9696
+
9697
+ assert(params->wsize >= ne11*ne12*ne13*row_size);
9698
+ assert(src1->type == GGML_TYPE_F32);
9519
9699
 
9520
9700
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
9521
9701
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9535,10 +9715,10 @@ static void ggml_compute_forward_mul_mat(
9535
9715
  }
9536
9716
 
9537
9717
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9538
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9718
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9539
9719
 
9540
9720
  const int64_t nr0 = ne01; // src0 rows
9541
- const int64_t nr1 = ne11*ne12*ne13; // src1 rows
9721
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
9542
9722
 
9543
9723
  //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9544
9724
 
@@ -9580,9 +9760,9 @@ static void ggml_compute_forward_mul_mat(
9580
9760
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9581
9761
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9582
9762
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9583
- const int64_t i13 = (ir1/(ne12*ne11));
9584
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
9585
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
9763
+ const int64_t i13 = (ir1/(ne12*cne1));
9764
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
9765
+ const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
9586
9766
 
9587
9767
  // broadcast src0 into src1
9588
9768
  const int64_t i03 = i13/r3;
@@ -9618,6 +9798,34 @@ static void ggml_compute_forward_mul_mat(
9618
9798
  }
9619
9799
  }
9620
9800
 
9801
+ // ggml_compute_forward_mul_mat_id
9802
+
9803
+ static void ggml_compute_forward_mul_mat_id(
9804
+ const struct ggml_compute_params * params,
9805
+ const struct ggml_tensor * src0,
9806
+ const struct ggml_tensor * src1,
9807
+ struct ggml_tensor * dst) {
9808
+
9809
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9810
+ // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
9811
+ ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
9812
+ return;
9813
+ }
9814
+
9815
+ const struct ggml_tensor * ids = src0;
9816
+ const int id = ggml_get_op_params_i32(dst, 0);
9817
+ const int n_as = ggml_get_op_params_i32(dst, 1);
9818
+
9819
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
9820
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
9821
+
9822
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
9823
+
9824
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
9825
+ ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
9826
+ }
9827
+ }
9828
+
9621
9829
  // ggml_compute_forward_out_prod
9622
9830
 
9623
9831
  static void ggml_compute_forward_out_prod_f32(
@@ -10027,7 +10235,7 @@ static void ggml_compute_forward_set_f32(
10027
10235
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
10028
10236
 
10029
10237
  // view src0 and dst with these strides and data offset inbytes during set
10030
- // nb0 is implicitely element_size because src0 and dst are contiguous
10238
+ // nb0 is implicitly element_size because src0 and dst are contiguous
10031
10239
  size_t nb1 = ((int32_t *) dst->op_params)[0];
10032
10240
  size_t nb2 = ((int32_t *) dst->op_params)[1];
10033
10241
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -10191,21 +10399,30 @@ static void ggml_compute_forward_get_rows_q(
10191
10399
  return;
10192
10400
  }
10193
10401
 
10194
- const int nc = src0->ne[0];
10195
- const int nr = ggml_nelements(src1);
10402
+ GGML_TENSOR_BINARY_OP_LOCALS
10403
+
10404
+ const int64_t nc = ne00;
10405
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10406
+
10196
10407
  const enum ggml_type type = src0->type;
10197
10408
  ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
10198
10409
 
10199
- assert( dst->ne[0] == nc);
10200
- assert( dst->ne[1] == nr);
10201
- assert(src0->nb[0] == ggml_type_size(type));
10410
+ assert(ne0 == nc);
10411
+ assert(ne02 == ne11);
10412
+ assert(nb00 == ggml_type_size(type));
10413
+ assert(ggml_nrows(dst) == nr);
10202
10414
 
10203
- for (int i = 0; i < nr; ++i) {
10204
- const int r = ((int32_t *) src1->data)[i];
10415
+ // TODO: multi-thread
10416
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10417
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10418
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10419
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10205
10420
 
10206
- dequantize_row_q(
10207
- (const void *) ((char *) src0->data + r*src0->nb[1]),
10208
- (float *) ((char *) dst->data + i*dst->nb[1]), nc);
10421
+ dequantize_row_q(
10422
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10423
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10424
+ }
10425
+ }
10209
10426
  }
10210
10427
  }
10211
10428
 
@@ -10220,19 +10437,26 @@ static void ggml_compute_forward_get_rows_f16(
10220
10437
  return;
10221
10438
  }
10222
10439
 
10223
- const int nc = src0->ne[0];
10224
- const int nr = ggml_nelements(src1);
10440
+ GGML_TENSOR_BINARY_OP_LOCALS
10225
10441
 
10226
- assert( dst->ne[0] == nc);
10227
- assert( dst->ne[1] == nr);
10228
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
10442
+ const int64_t nc = ne00;
10443
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10229
10444
 
10230
- for (int i = 0; i < nr; ++i) {
10231
- const int r = ((int32_t *) src1->data)[i];
10445
+ assert(ne0 == nc);
10446
+ assert(ne02 == ne11);
10447
+ assert(nb00 == sizeof(ggml_fp16_t));
10448
+ assert(ggml_nrows(dst) == nr);
10232
10449
 
10233
- for (int j = 0; j < nc; ++j) {
10234
- ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
10235
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
10450
+ // TODO: multi-thread
10451
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10452
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10453
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10454
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10455
+
10456
+ ggml_fp16_to_fp32_row(
10457
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10458
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10459
+ }
10236
10460
  }
10237
10461
  }
10238
10462
  }
@@ -10248,19 +10472,27 @@ static void ggml_compute_forward_get_rows_f32(
10248
10472
  return;
10249
10473
  }
10250
10474
 
10251
- const int nc = src0->ne[0];
10252
- const int nr = ggml_nelements(src1);
10475
+ GGML_TENSOR_BINARY_OP_LOCALS
10253
10476
 
10254
- assert( dst->ne[0] == nc);
10255
- assert( dst->ne[1] == nr);
10256
- assert(src0->nb[0] == sizeof(float));
10477
+ const int64_t nc = ne00;
10478
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10257
10479
 
10258
- for (int i = 0; i < nr; ++i) {
10259
- const int r = ((int32_t *) src1->data)[i];
10480
+ assert(ne0 == nc);
10481
+ assert(ne02 == ne11);
10482
+ assert(nb00 == sizeof(float));
10483
+ assert(ggml_nrows(dst) == nr);
10260
10484
 
10261
- ggml_vec_cpy_f32(nc,
10262
- (float *) ((char *) dst->data + i*dst->nb[1]),
10263
- (float *) ((char *) src0->data + r*src0->nb[1]));
10485
+ // TODO: multi-thread
10486
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10487
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10488
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10489
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10490
+
10491
+ ggml_vec_cpy_f32(nc,
10492
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
10493
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
10494
+ }
10495
+ }
10264
10496
  }
10265
10497
  }
10266
10498
 
@@ -11980,6 +12212,7 @@ static void ggml_compute_forward_upscale_f32(
11980
12212
  GGML_ASSERT(src0->nb[0] == sizeof(float));
11981
12213
 
11982
12214
  const int ith = params->ith;
12215
+ const int nth = params->nth;
11983
12216
 
11984
12217
  GGML_TENSOR_UNARY_OP_LOCALS
11985
12218
 
@@ -11987,16 +12220,17 @@ static void ggml_compute_forward_upscale_f32(
11987
12220
 
11988
12221
  // TODO: optimize
11989
12222
 
11990
- for (int i03 = 0; i03 < ne03; i03++) {
11991
- for (int i02 = ith; i02 < ne02; i02++) {
11992
- for (int m = 0; m < dst->ne[1]; m++) {
11993
- int i01 = m / scale_factor;
11994
- for (int n = 0; n < dst->ne[0]; n++) {
11995
- int i00 = n / scale_factor;
11996
-
11997
- const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
12223
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
12224
+ const int64_t i03 = i3;
12225
+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
12226
+ const int64_t i02 = i2;
12227
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
12228
+ const int64_t i01 = i1 / scale_factor;
12229
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
12230
+ const int64_t i00 = i0 / scale_factor;
11998
12231
 
11999
- float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
12232
+ const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
12233
+ float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
12000
12234
 
12001
12235
  *y = *x;
12002
12236
  }
@@ -12021,6 +12255,125 @@ static void ggml_compute_forward_upscale(
12021
12255
  }
12022
12256
  }
12023
12257
 
12258
+ // ggml_compute_forward_pad
12259
+
12260
+ static void ggml_compute_forward_pad_f32(
12261
+ const struct ggml_compute_params * params,
12262
+ const struct ggml_tensor * src0,
12263
+ struct ggml_tensor * dst) {
12264
+
12265
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12266
+ return;
12267
+ }
12268
+
12269
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
12270
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
12271
+
12272
+ const int ith = params->ith;
12273
+ const int nth = params->nth;
12274
+
12275
+ GGML_TENSOR_UNARY_OP_LOCALS
12276
+
12277
+ float * dst_ptr = (float *) dst->data;
12278
+
12279
+ // TODO: optimize
12280
+
12281
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
12282
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
12283
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
12284
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
12285
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
12286
+
12287
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12288
+
12289
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
12290
+ dst_ptr[dst_idx] = *src_ptr;
12291
+ } else {
12292
+ dst_ptr[dst_idx] = 0;
12293
+ }
12294
+ }
12295
+ }
12296
+ }
12297
+ }
12298
+ }
12299
+
12300
+ static void ggml_compute_forward_pad(
12301
+ const struct ggml_compute_params * params,
12302
+ const struct ggml_tensor * src0,
12303
+ struct ggml_tensor * dst) {
12304
+ switch (src0->type) {
12305
+ case GGML_TYPE_F32:
12306
+ {
12307
+ ggml_compute_forward_pad_f32(params, src0, dst);
12308
+ } break;
12309
+ default:
12310
+ {
12311
+ GGML_ASSERT(false);
12312
+ } break;
12313
+ }
12314
+ }
12315
+
12316
+ // ggml_compute_forward_argsort
12317
+
12318
+ static void ggml_compute_forward_argsort_f32(
12319
+ const struct ggml_compute_params * params,
12320
+ const struct ggml_tensor * src0,
12321
+ struct ggml_tensor * dst) {
12322
+
12323
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12324
+ return;
12325
+ }
12326
+
12327
+ GGML_TENSOR_UNARY_OP_LOCALS
12328
+
12329
+ GGML_ASSERT(nb0 == sizeof(float));
12330
+
12331
+ const int ith = params->ith;
12332
+ const int nth = params->nth;
12333
+
12334
+ const int64_t nr = ggml_nrows(src0);
12335
+
12336
+ enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
12337
+
12338
+ for (int64_t i = ith; i < nr; i += nth) {
12339
+ int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
12340
+ const float * src_data = (float *)((char *) src0->data + i*nb01);
12341
+
12342
+ for (int64_t j = 0; j < ne0; j++) {
12343
+ dst_data[j] = j;
12344
+ }
12345
+
12346
+ // C doesn't have a functional sort, so we do a bubble sort instead
12347
+ for (int64_t j = 0; j < ne0; j++) {
12348
+ for (int64_t k = j + 1; k < ne0; k++) {
12349
+ if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
12350
+ (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
12351
+ int32_t tmp = dst_data[j];
12352
+ dst_data[j] = dst_data[k];
12353
+ dst_data[k] = tmp;
12354
+ }
12355
+ }
12356
+ }
12357
+ }
12358
+ }
12359
+
12360
+ static void ggml_compute_forward_argsort(
12361
+ const struct ggml_compute_params * params,
12362
+ const struct ggml_tensor * src0,
12363
+ struct ggml_tensor * dst) {
12364
+
12365
+ switch (src0->type) {
12366
+ case GGML_TYPE_F32:
12367
+ {
12368
+ ggml_compute_forward_argsort_f32(params, src0, dst);
12369
+ } break;
12370
+ default:
12371
+ {
12372
+ GGML_ASSERT(false);
12373
+ } break;
12374
+ }
12375
+ }
12376
+
12024
12377
  // ggml_compute_forward_flash_attn
12025
12378
 
12026
12379
  static void ggml_compute_forward_flash_attn_f32(
@@ -13167,10 +13520,6 @@ static void ggml_compute_forward_unary(
13167
13520
  {
13168
13521
  ggml_compute_forward_silu(params, src0, dst);
13169
13522
  } break;
13170
- case GGML_UNARY_OP_LEAKY:
13171
- {
13172
- ggml_compute_forward_leaky(params, src0, dst);
13173
- } break;
13174
13523
  default:
13175
13524
  {
13176
13525
  GGML_ASSERT(false);
@@ -13842,7 +14191,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
13842
14191
  } break;
13843
14192
  case GGML_OP_MUL_MAT:
13844
14193
  {
13845
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
14194
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
14195
+ } break;
14196
+ case GGML_OP_MUL_MAT_ID:
14197
+ {
14198
+ ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
13846
14199
  } break;
13847
14200
  case GGML_OP_OUT_PROD:
13848
14201
  {
@@ -13948,6 +14301,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
13948
14301
  {
13949
14302
  ggml_compute_forward_upscale(params, tensor->src[0], tensor);
13950
14303
  } break;
14304
+ case GGML_OP_PAD:
14305
+ {
14306
+ ggml_compute_forward_pad(params, tensor->src[0], tensor);
14307
+ } break;
14308
+ case GGML_OP_ARGSORT:
14309
+ {
14310
+ ggml_compute_forward_argsort(params, tensor->src[0], tensor);
14311
+ } break;
14312
+ case GGML_OP_LEAKY_RELU:
14313
+ {
14314
+ ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
14315
+ } break;
13951
14316
  case GGML_OP_FLASH_ATTN:
13952
14317
  {
13953
14318
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14202,7 +14567,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14202
14567
  return replacements->vals[i];
14203
14568
  }
14204
14569
 
14205
- struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14570
+ struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
14206
14571
 
14207
14572
  // insert clone into replacements
14208
14573
  GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@@ -14272,7 +14637,7 @@ void ggml_build_backward_gradient_checkpointing(
14272
14637
  // insert new tensors recomputing src, reusing already made replacements,
14273
14638
  // remember replacements: remember new tensors with mapping from corresponding gf nodes
14274
14639
  // recurse for input tensors,
14275
- // unless (i.e. terminating when) input tensors are replacments (like checkpoints)
14640
+ // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
14276
14641
  node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
14277
14642
  }
14278
14643
  // insert rewritten backward node with replacements made into resulting backward graph gb
@@ -14598,6 +14963,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14598
14963
  zero_table);
14599
14964
  }
14600
14965
  } break;
14966
+ case GGML_OP_MUL_MAT_ID:
14967
+ {
14968
+ GGML_ASSERT(false); // TODO: not implemented
14969
+ } break;
14601
14970
  case GGML_OP_OUT_PROD:
14602
14971
  {
14603
14972
  GGML_ASSERT(false); // TODO: not implemented
@@ -14936,6 +15305,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14936
15305
  {
14937
15306
  GGML_ASSERT(false); // TODO: not implemented
14938
15307
  } break;
15308
+ case GGML_OP_PAD:
15309
+ {
15310
+ GGML_ASSERT(false); // TODO: not implemented
15311
+ } break;
15312
+ case GGML_OP_ARGSORT:
15313
+ {
15314
+ GGML_ASSERT(false); // TODO: not implemented
15315
+ } break;
15316
+ case GGML_OP_LEAKY_RELU:
15317
+ {
15318
+ GGML_ASSERT(false); // TODO: not implemented
15319
+ } break;
14939
15320
  case GGML_OP_FLASH_ATTN:
14940
15321
  {
14941
15322
  struct ggml_tensor * flash_grad = NULL;
@@ -15296,12 +15677,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15296
15677
  return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15297
15678
  }
15298
15679
 
15299
- struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15300
- const size_t obj_size = sizeof(struct ggml_cgraph);
15301
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15302
- struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15303
-
15304
- *cgraph = (struct ggml_cgraph) {
15680
+ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
15681
+ struct ggml_cgraph cgraph = {
15305
15682
  /*.size =*/ 0,
15306
15683
  /*.n_nodes =*/ i1 - i0,
15307
15684
  /*.n_leafs =*/ 0,
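
ggml_graph_view now returns the view by value instead of allocating a ggml_cgraph object inside the context, so taking a sub-range of an existing graph costs nothing and needs no cleanup. A hedged usage sketch (assumes ggml.h and a graph gf built elsewhere; the actual compute call is elided):

    #include "ggml.h"

    // Run a built graph in two halves using by-value views.
    static void compute_in_halves(struct ggml_cgraph * gf) {
        const int mid = gf->n_nodes / 2;
        struct ggml_cgraph g0 = ggml_graph_view(gf, 0,   mid);
        struct ggml_cgraph g1 = ggml_graph_view(gf, mid, gf->n_nodes);
        (void) g0; (void) g1; // hand each view to ggml_graph_compute(...) with its own plan
    }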
@@ -15536,7 +15913,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15536
15913
  n_tasks = n_threads;
15537
15914
  } break;
15538
15915
  case GGML_OP_SUB:
15539
- case GGML_OP_DIV:
15540
15916
  case GGML_OP_SQR:
15541
15917
  case GGML_OP_SQRT:
15542
15918
  case GGML_OP_LOG:
@@ -15546,6 +15922,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15546
15922
  case GGML_OP_ARGMAX:
15547
15923
  case GGML_OP_REPEAT:
15548
15924
  case GGML_OP_REPEAT_BACK:
15925
+ case GGML_OP_LEAKY_RELU:
15549
15926
  {
15550
15927
  n_tasks = 1;
15551
15928
  } break;
@@ -15558,7 +15935,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15558
15935
  case GGML_UNARY_OP_TANH:
15559
15936
  case GGML_UNARY_OP_ELU:
15560
15937
  case GGML_UNARY_OP_RELU:
15561
- case GGML_UNARY_OP_LEAKY:
15562
15938
  {
15563
15939
  n_tasks = 1;
15564
15940
  } break;
@@ -15569,10 +15945,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15569
15945
  {
15570
15946
  n_tasks = n_threads;
15571
15947
  } break;
15948
+ default:
15949
+ GGML_ASSERT(false);
15572
15950
  }
15573
15951
  break;
15574
15952
  case GGML_OP_SILU_BACK:
15575
15953
  case GGML_OP_MUL:
15954
+ case GGML_OP_DIV:
15576
15955
  case GGML_OP_NORM:
15577
15956
  case GGML_OP_RMS_NORM:
15578
15957
  case GGML_OP_RMS_NORM_BACK:
@@ -15610,6 +15989,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15610
15989
  }
15611
15990
  #endif
15612
15991
  } break;
15992
+ case GGML_OP_MUL_MAT_ID:
15993
+ {
15994
+ // FIXME: blas
15995
+ n_tasks = n_threads;
15996
+ } break;
15613
15997
  case GGML_OP_OUT_PROD:
15614
15998
  {
15615
15999
  n_tasks = n_threads;
@@ -15629,7 +16013,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15629
16013
  } break;
15630
16014
  case GGML_OP_DIAG_MASK_ZERO:
15631
16015
  case GGML_OP_DIAG_MASK_INF:
15632
- case GGML_OP_SOFT_MAX:
15633
16016
  case GGML_OP_SOFT_MAX_BACK:
15634
16017
  case GGML_OP_ROPE:
15635
16018
  case GGML_OP_ROPE_BACK:
@@ -15645,6 +16028,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15645
16028
  {
15646
16029
  n_tasks = 1; //TODO
15647
16030
  } break;
16031
+ case GGML_OP_SOFT_MAX:
16032
+ {
16033
+ n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
16034
+ } break;
15648
16035
  case GGML_OP_CONV_TRANSPOSE_1D:
15649
16036
  {
15650
16037
  n_tasks = n_threads;
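
The SOFT_MAX case added in this hunk caps its task count at MIN(MIN(4, n_threads), nrows): softmax rows are cheap and largely memory-bound, so spreading one batch over many threads presumably costs more in synchronization than it saves. What that cap evaluates to, as a tiny standalone helper:

    #include <stdint.h>

    // n_tasks for soft_max as computed above, with plain integers instead of tensors.
    static int soft_max_n_tasks(int n_threads, int64_t n_rows) {
        const int n = n_threads < 4 ? n_threads : 4;        // MIN(4, n_threads)
        return (int64_t) n < n_rows ? n : (int) n_rows;     // MIN(..., ggml_nrows(src0))
    }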
@@ -15666,6 +16053,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15666
16053
  {
15667
16054
  n_tasks = n_threads;
15668
16055
  } break;
16056
+ case GGML_OP_PAD:
16057
+ {
16058
+ n_tasks = n_threads;
16059
+ } break;
16060
+ case GGML_OP_ARGSORT:
16061
+ {
16062
+ n_tasks = n_threads;
16063
+ } break;
15669
16064
  case GGML_OP_FLASH_ATTN:
15670
16065
  {
15671
16066
  n_tasks = n_threads;
@@ -15728,6 +16123,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15728
16123
  {
15729
16124
  n_tasks = 1;
15730
16125
  } break;
16126
+ case GGML_OP_COUNT:
16127
+ {
16128
+ GGML_ASSERT(false);
16129
+ } break;
15731
16130
  default:
15732
16131
  {
15733
16132
  fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -15876,18 +16275,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15876
16275
 
15877
16276
  // thread scheduling for the different operations + work buffer size estimation
15878
16277
  for (int i = 0; i < cgraph->n_nodes; i++) {
15879
- int n_tasks = 1;
15880
-
15881
16278
  struct ggml_tensor * node = cgraph->nodes[i];
15882
16279
 
16280
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16281
+
15883
16282
  size_t cur = 0;
15884
16283
 
15885
16284
  switch (node->op) {
15886
16285
  case GGML_OP_CPY:
15887
16286
  case GGML_OP_DUP:
15888
16287
  {
15889
- n_tasks = n_threads;
15890
-
15891
16288
  if (ggml_is_quantized(node->type)) {
15892
16289
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
15893
16290
  }
@@ -15895,16 +16292,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15895
16292
  case GGML_OP_ADD:
15896
16293
  case GGML_OP_ADD1:
15897
16294
  {
15898
- n_tasks = n_threads;
15899
-
15900
16295
  if (ggml_is_quantized(node->src[0]->type)) {
15901
16296
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
15902
16297
  }
15903
16298
  } break;
15904
16299
  case GGML_OP_ACC:
15905
16300
  {
15906
- n_tasks = n_threads;
15907
-
15908
16301
  if (ggml_is_quantized(node->src[0]->type)) {
15909
16302
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
15910
16303
  }
@@ -15927,21 +16320,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15927
16320
  } else
15928
16321
  #endif
15929
16322
  if (node->src[1]->type != vec_dot_type) {
15930
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16323
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
16324
+ }
16325
+ } break;
16326
+ case GGML_OP_MUL_MAT_ID:
16327
+ {
16328
+ const struct ggml_tensor * a = node->src[2];
16329
+ const struct ggml_tensor * b = node->src[1];
16330
+ const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
16331
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16332
+ if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
16333
+ if (a->type != GGML_TYPE_F32) {
16334
+ // here we need memory just for single 2D matrix from src0
16335
+ cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
16336
+ }
16337
+ } else
16338
+ #endif
16339
+ if (b->type != vec_dot_type) {
16340
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
15931
16341
  }
15932
16342
  } break;
15933
16343
  case GGML_OP_OUT_PROD:
15934
16344
  {
15935
- n_tasks = n_threads;
15936
-
15937
16345
  if (ggml_is_quantized(node->src[0]->type)) {
15938
16346
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
15939
16347
  }
15940
16348
  } break;
15941
16349
  case GGML_OP_SOFT_MAX:
15942
16350
  {
15943
- n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
15944
-
15945
16351
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
15946
16352
  } break;
15947
16353
  case GGML_OP_CONV_TRANSPOSE_1D:
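
Work-buffer sizing for MUL_MAT and the new MUL_MAT_ID now goes through ggml_row_size rather than spelling out type_size * nelements / blck_size at every call site. Assuming the helper keeps the same semantics as the expression it replaces, it amounts to:

    #include <stddef.h>
    #include <stdint.h>

    // Bytes needed for n elements of a (possibly block-quantized) type; for quantized
    // types n is expected to be a multiple of the block size. Sketch only.
    static size_t row_size_sketch(size_t type_size, int64_t blck_size, int64_t n) {
        return type_size * (size_t) n / (size_t) blck_size;
    }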
@@ -15969,10 +16375,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15969
16375
  GGML_ASSERT(false);
15970
16376
  }
15971
16377
  } break;
15972
- case GGML_OP_IM2COL:
15973
- {
15974
- n_tasks = n_threads;
15975
- } break;
15976
16378
  case GGML_OP_CONV_TRANSPOSE_2D:
15977
16379
  {
15978
16380
  const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -15989,8 +16391,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15989
16391
  } break;
15990
16392
  case GGML_OP_FLASH_ATTN:
15991
16393
  {
15992
- n_tasks = n_threads;
15993
-
15994
16394
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
15995
16395
 
15996
16396
  if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16003,8 +16403,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16003
16403
  } break;
16004
16404
  case GGML_OP_FLASH_FF:
16005
16405
  {
16006
- n_tasks = n_threads;
16007
-
16008
16406
  if (node->src[1]->type == GGML_TYPE_F32) {
16009
16407
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16010
16408
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16015,8 +16413,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16015
16413
  } break;
16016
16414
  case GGML_OP_FLASH_ATTN_BACK:
16017
16415
  {
16018
- n_tasks = n_threads;
16019
-
16020
16416
  const int64_t D = node->src[0]->ne[0];
16021
16417
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16022
16418
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16031,8 +16427,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16031
16427
 
16032
16428
  case GGML_OP_CROSS_ENTROPY_LOSS:
16033
16429
  {
16034
- n_tasks = n_threads;
16035
-
16036
16430
  cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16037
16431
  } break;
16038
16432
  case GGML_OP_COUNT:
@@ -16174,7 +16568,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
16174
16568
  fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
16175
16569
  ggml_type_name(tensor->type),
16176
16570
  ggml_op_name (tensor->op),
16177
- tensor->n_dims,
16571
+ ggml_n_dims(tensor),
16178
16572
  ne[0], ne[1], ne[2], ne[3],
16179
16573
  nb[0], nb[1], nb[2], nb[3],
16180
16574
  tensor->data,
@@ -16189,7 +16583,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16189
16583
  arg,
16190
16584
  ggml_type_name(tensor->type),
16191
16585
  ggml_op_name (tensor->op),
16192
- tensor->n_dims,
16586
+ ggml_n_dims(tensor),
16193
16587
  ne[0], ne[1], ne[2], ne[3],
16194
16588
  nb[0], nb[1], nb[2], nb[3],
16195
16589
  tensor->data,
@@ -16279,11 +16673,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16279
16673
 
16280
16674
  const uint32_t type = tensor->type;
16281
16675
  const uint32_t op = tensor->op;
16282
- const uint32_t n_dims = tensor->n_dims;
16283
16676
 
16284
16677
  fwrite(&type, sizeof(uint32_t), 1, fout);
16285
16678
  fwrite(&op, sizeof(uint32_t), 1, fout);
16286
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16287
16679
 
16288
16680
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16289
16681
  const uint64_t ne = tensor->ne[j];
@@ -16313,11 +16705,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16313
16705
 
16314
16706
  const uint32_t type = tensor->type;
16315
16707
  const uint32_t op = tensor->op;
16316
- const uint32_t n_dims = tensor->n_dims;
16317
16708
 
16318
16709
  fwrite(&type, sizeof(uint32_t), 1, fout);
16319
16710
  fwrite(&op, sizeof(uint32_t), 1, fout);
16320
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16321
16711
 
16322
16712
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16323
16713
  const uint64_t ne = tensor->ne[j];
@@ -16489,12 +16879,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16489
16879
  {
16490
16880
  uint32_t type;
16491
16881
  uint32_t op;
16492
- uint32_t n_dims;
16493
16882
 
16494
16883
  for (uint32_t i = 0; i < n_leafs; ++i) {
16495
16884
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16496
16885
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16497
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16498
16886
 
16499
16887
  int64_t ne[GGML_MAX_DIMS];
16500
16888
  size_t nb[GGML_MAX_DIMS];
@@ -16510,7 +16898,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16510
16898
  nb[j] = nb_cur;
16511
16899
  }
16512
16900
 
16513
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
16901
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16514
16902
 
16515
16903
  tensor->op = (enum ggml_op) op;
16516
16904
 
@@ -16527,7 +16915,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16527
16915
 
16528
16916
  ptr += ggml_nbytes(tensor);
16529
16917
 
16530
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
16918
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16531
16919
  }
16532
16920
  }
16533
16921
 
@@ -16537,12 +16925,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16537
16925
  {
16538
16926
  uint32_t type;
16539
16927
  uint32_t op;
16540
- uint32_t n_dims;
16541
16928
 
16542
16929
  for (uint32_t i = 0; i < n_nodes; ++i) {
16543
16930
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16544
16931
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16545
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16546
16932
 
16547
16933
  enum ggml_op eop = (enum ggml_op) op;
16548
16934
 
@@ -16613,7 +16999,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16613
16999
  } break;
16614
17000
  default:
16615
17001
  {
16616
- tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
17002
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16617
17003
 
16618
17004
  tensor->op = eop;
16619
17005
  } break;
@@ -16632,7 +17018,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16632
17018
 
16633
17019
  result->nodes[i] = tensor;
16634
17020
 
16635
- fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17021
+ fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16636
17022
  }
16637
17023
  }
16638
17024
  }
@@ -16770,7 +17156,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16770
17156
  fprintf(fp, "(%s)|", ggml_type_name(node->type));
16771
17157
  }
16772
17158
 
16773
- if (node->n_dims == 2) {
17159
+ if (ggml_is_matrix(node)) {
16774
17160
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
16775
17161
  } else {
16776
17162
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@@ -17037,7 +17423,7 @@ static enum ggml_opt_result ggml_opt_adam(
17037
17423
  int64_t i = 0;
17038
17424
  for (int p = 0; p < np; ++p) {
17039
17425
  const int64_t ne = ggml_nelements(ps[p]);
17040
- const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
17426
+ const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
17041
17427
  for (int64_t j = 0; j < ne; ++j) {
17042
17428
  float x = ggml_get_f32_1d(ps[p], j);
17043
17429
  float g_ = g[i]*gnorm;
@@ -17819,8 +18205,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
17819
18205
  memcpy(&qh, &y[i].qh, sizeof(qh));
17820
18206
 
17821
18207
  for (int j = 0; j < QK5_0; j += 2) {
17822
- const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
17823
- const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
18208
+ const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
18209
+ const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
17824
18210
 
17825
18211
  // cast to 16 bins
17826
18212
  const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17849,8 +18235,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
17849
18235
  memcpy(&qh, &y[i].qh, sizeof(qh));
17850
18236
 
17851
18237
  for (int j = 0; j < QK5_1; j += 2) {
17852
- const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
17853
- const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
18238
+ const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
18239
+ const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
17854
18240
 
17855
18241
  // cast to 16 bins
17856
18242
  const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
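
Both hunks above fix the high-bit extraction in the Q5_0/Q5_1 re-quantization loops. Each qs byte packs two 4-bit values (elements j/2 and j/2 + 16 of the block), so their fifth bits sit at positions j/2 and j/2 + 16 of the 32-bit qh word; the old code shifted by j and picked up the wrong bits for every element past the first pair. A minimal sketch of the corrected indexing (scale and offset handling of Q5_0/Q5_1 omitted):

    #include <stdint.h>

    // Recover the two 5-bit values stored in one qs byte plus the qh high-bit word.
    static void unpack_q5_pair(uint8_t qs_byte, uint32_t qh, int j, uint8_t * v0, uint8_t * v1) {
        const uint8_t vh0 = (uint8_t) (((qh >> (j/2 + 0 )) & 1u) << 4); // 5th bit of the low-nibble element
        const uint8_t vh1 = (uint8_t) (((qh >> (j/2 + 16)) & 1u) << 4); // 5th bit of the high-nibble element
        *v0 = (uint8_t) ((qs_byte & 0x0F) | vh0);
        *v1 = (uint8_t) ((qs_byte >>   4) | vh1);
    }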
@@ -18040,6 +18426,7 @@ struct gguf_kv {
18040
18426
 
18041
18427
  struct gguf_header {
18042
18428
  char magic[4];
18429
+
18043
18430
  uint32_t version;
18044
18431
  uint64_t n_tensors; // GGUFv2
18045
18432
  uint64_t n_kv; // GGUFv2
@@ -18129,7 +18516,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18129
18516
 
18130
18517
  for (uint32_t i = 0; i < sizeof(magic); i++) {
18131
18518
  if (magic[i] != GGUF_MAGIC[i]) {
18132
- fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
18519
+ fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
18133
18520
  fclose(file);
18134
18521
  return NULL;
18135
18522
  }
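
The magic-mismatch message no longer passes the raw 4-byte magic buffer to %s: the buffer is not NUL-terminated, so the old fprintf could read past it. Printing the four characters individually (or using a bounded "%.4s") keeps the read inside the array; the same pattern applies to any fixed-width tag:

    #include <stdio.h>

    // Safely print a 4-character file tag that carries no terminating NUL.
    static void print_tag(FILE * f, const char tag[4]) {
        fprintf(f, "'%c%c%c%c'", tag[0], tag[1], tag[2], tag[3]);
    }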
@@ -18144,7 +18531,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18144
18531
  {
18145
18532
  strncpy(ctx->header.magic, magic, 4);
18146
18533
 
18147
-
18148
18534
  ctx->kv = NULL;
18149
18535
  ctx->infos = NULL;
18150
18536
  ctx->data = NULL;
@@ -18311,7 +18697,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18311
18697
  return NULL;
18312
18698
  }
18313
18699
 
18314
- const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
18700
+ const size_t size_cur = ggml_row_size(info->type, ne);
18315
18701
 
18316
18702
  ctx->size += GGML_PAD(size_cur, ctx->alignment);
18317
18703
  }
@@ -18815,8 +19201,8 @@ void gguf_add_tensor(
18815
19201
  ctx->infos[idx].ne[i] = 1;
18816
19202
  }
18817
19203
 
18818
- ctx->infos[idx].n_dims = tensor->n_dims;
18819
- for (int i = 0; i < tensor->n_dims; i++) {
19204
+ ctx->infos[idx].n_dims = ggml_n_dims(tensor);
19205
+ for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
18820
19206
  ctx->infos[idx].ne[i] = tensor->ne[i];
18821
19207
  }
18822
19208
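
gguf_add_tensor, the graph exporter, and the Adam weight-decay check all switch from the removed tensor->n_dims field to ggml_n_dims(tensor), deriving the effective rank from the shape itself. Assuming the helper simply ignores trailing extents of 1, it behaves like:

    #include <stdint.h>

    #define SKETCH_MAX_DIMS 4

    // Effective rank of a shape: trailing dimensions of size 1 do not count,
    // and a scalar or vector still reports at least 1.
    static int n_dims_sketch(const int64_t ne[SKETCH_MAX_DIMS]) {
        for (int i = SKETCH_MAX_DIMS - 1; i >= 1; --i) {
            if (ne[i] > 1) {
                return i + 1;
            }
        }
        return 1;
    }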