llama_cpp 0.9.5 → 0.10.1

This diff compares the contents of two publicly released versions of the package as published to its public registry, and is provided for informational purposes only.
@@ -1,4 +1,4 @@
- #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
  #define _USE_MATH_DEFINES // For M_PI on MSVC

  #include "ggml-impl.h"
@@ -33,7 +33,7 @@
  // we should just be careful :)
  #pragma warning(disable: 4244 4267)

- // disable POSIX deprecation warnigns
+ // disable POSIX deprecation warnings
  // these functions are never going away, anyway
  #pragma warning(disable: 4996)
  #endif
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
  #define UNUSED GGML_UNUSED
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)

- //
- // tensor access macros
- //
-
- #define GGML_TENSOR_UNARY_OP_LOCALS \
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
- GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
- GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
- #define GGML_TENSOR_BINARY_OP_LOCALS \
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
- GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
- GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
- GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
- GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
  #if defined(GGML_USE_ACCELERATE)
  #include <Accelerate/Accelerate.h>
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1413,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
- inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
+ inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }

  static const float GELU_COEF_A = 0.044715f;
  static const float GELU_QUICK_COEF = -1.702f;
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "GROUP_NORM",

  "MUL_MAT",
+ "MUL_MAT_ID",
  "OUT_PROD",

  "SCALE",
@@ -1640,6 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "POOL_1D",
  "POOL_2D",
  "UPSCALE",
+ "PAD",
+ "ARGSORT",
+ "LEAKY_RELU",

  "FLASH_ATTN",
  "FLASH_FF",
@@ -1666,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -1695,6 +1681,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "group_norm(x)",

  "X*Y",
+ "X[i]*Y",
  "X*Y",

  "x*v",
@@ -1722,6 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "pool_1d(x)",
  "pool_2d(x)",
  "upscale(x)",
+ "pad(x)",
+ "argsort(x)",
+ "leaky_relu(x)",

  "flash_attn(x)",
  "flash_ff(x)",
@@ -1748,15 +1738,32 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+ static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

+
+ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+ "ABS",
+ "SGN",
+ "NEG",
+ "STEP",
+ "TANH",
+ "ELU",
+ "RELU",
+ "GELU",
+ "GELU_QUICK",
+ "SILU",
+ };
+
+ static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
+
+
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

  // WARN:
- // Mis-confguration can lead to problem that's hard to reason about:
+ // Mis-configuration can lead to problem that's hard to reason about:
  // * At best it crash or talks nosense.
  // * At worst it talks slightly difference but hard to perceive.
  //
@@ -1771,6 +1778,7 @@ static void ggml_setup_op_has_task_pass(void) {

  p[GGML_OP_ACC ] = true;
  p[GGML_OP_MUL_MAT ] = true;
+ p[GGML_OP_MUL_MAT_ID ] = true;
  p[GGML_OP_OUT_PROD ] = true;
  p[GGML_OP_SET ] = true;
  p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -1989,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
  }

- size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
- return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
- }
-
  int ggml_blck_size(enum ggml_type type) {
  return type_traits[type].blck_size;
  }
@@ -2003,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
  return type_traits[type].type_size;
  }

- float ggml_type_sizef(enum ggml_type type) {
- return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
+ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+ assert(ne % ggml_blck_size(type) == 0);
+ return ggml_type_size(type)*ne/ggml_blck_size(type);
+ }
+
+ double ggml_type_sizef(enum ggml_type type) {
+ return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
  }

  const char * ggml_type_name(enum ggml_type type) {
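
Note: the hunk above adds ggml_row_size(), which gives an exact integer byte count for a row of ne elements, replacing size arithmetic built on the float-valued ggml_type_sizef(). A minimal sketch of the intended call pattern, assuming "ggml.h" from this release; the function name and shapes below are illustrative only.

#include "ggml.h"

// Sketch: size a buffer for n_rows rows without floating-point rounding.
static size_t example_rows_nbytes(enum ggml_type type, int64_t ne0, int64_t n_rows) {
    const size_t row = ggml_row_size(type, ne0); // exact bytes per row
    return row * (size_t) n_rows;
}
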
@@ -2023,28 +2030,55 @@ const char * ggml_op_symbol(enum ggml_op op) {
  return GGML_OP_SYMBOL[op];
  }

+ const char * ggml_unary_op_name(enum ggml_unary_op op) {
+ return GGML_UNARY_OP_NAME[op];
+ }
+
+ const char * ggml_op_desc(const struct ggml_tensor * t) {
+ if (t->op == GGML_OP_UNARY) {
+ enum ggml_unary_op uop = ggml_get_unary_op(t);
+ return ggml_unary_op_name(uop);
+ }
+ else {
+ return ggml_op_name(t->op);
+ }
+ }
+
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
  return ggml_type_size(tensor->type);
  }

- static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
+ bool ggml_is_scalar(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
  }

- static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
+ bool ggml_is_vector(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
  }

- static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
+ bool ggml_is_matrix(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
  }

+ bool ggml_is_3d(const struct ggml_tensor * tensor) {
+ return tensor->ne[3] == 1;
+ }
+
+ int ggml_n_dims(const struct ggml_tensor * tensor) {
+ for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+ if (tensor->ne[i] > 1) {
+ return i + 1;
+ }
+ }
+ return 1;
+ }
+
  static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -2451,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  view_src = view_src->view_src;
  }

- size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+ size_t data_size = ggml_row_size(type, ne[0]);
  for (int i = 1; i < n_dims; i++) {
  data_size *= ne[i];
  }
@@ -2494,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  /*.type =*/ type,
  /*.backend =*/ GGML_BACKEND_CPU,
  /*.buffer =*/ NULL,
- /*.n_dims =*/ n_dims,
  /*.ne =*/ { 1, 1, 1, 1 },
  /*.nb =*/ { 0, 0, 0, 0 },
  /*.op =*/ GGML_OP_NONE,
@@ -2601,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
  }

  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
- return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+ return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
  }

  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
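
Note: the hunks above and the ggml_is_3d()/ggml_n_dims() additions earlier show that the /*.n_dims =*/ field is removed from struct ggml_tensor in this release; the rank is now derived on demand, and ggml_op_desc()/ggml_unary_op_name() give readable op labels. A minimal sketch of the replacement pattern, assuming "ggml.h" from this release; the helper name is hypothetical.

#include <stdio.h>
#include "ggml.h"

// Hypothetical debug helper: callers that used to read tensor->n_dims
// now call ggml_n_dims(), which ignores trailing dimensions of size 1.
static void debug_print_tensor(const struct ggml_tensor * t) {
    printf("%s: %d dims, op %s\n", t->name, ggml_n_dims(t), ggml_op_desc(t));
}
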
@@ -3050,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
  struct ggml_tensor * ggml_view_tensor(
  struct ggml_context * ctx,
  struct ggml_tensor * src) {
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
  ggml_format_name(result, "%s (view)", src->name);

  for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3154,9 +3187,7 @@ static struct ggml_tensor * ggml_add_impl(
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  bool inplace) {
- // TODO: support less-strict constraint
- // GGML_ASSERT(ggml_can_repeat(b, a));
- GGML_ASSERT(ggml_can_repeat_rows(b, a));
+ GGML_ASSERT(ggml_can_repeat(b, a));

  bool is_node = false;

@@ -3210,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
  is_node = true;
  }

- struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);

  result->op = GGML_OP_ADD;
- result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
+ result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
  result->src[0] = a;
  result->src[1] = b;

@@ -3371,9 +3402,7 @@ static struct ggml_tensor * ggml_mul_impl(
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  bool inplace) {
- // TODO: support less-strict constraint
- // GGML_ASSERT(ggml_can_repeat(b, a));
- GGML_ASSERT(ggml_can_repeat_rows(b, a));
+ GGML_ASSERT(ggml_can_repeat(b, a));

  bool is_node = false;

@@ -3418,7 +3447,7 @@ static struct ggml_tensor * ggml_div_impl(
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  bool inplace) {
- GGML_ASSERT(ggml_are_same_shape(a, b));
+ GGML_ASSERT(ggml_can_repeat(b, a));

  bool is_node = false;

@@ -3584,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows(
  is_node = true;
  }

- int64_t ne[4] = {1,1,1,1};
- for (int i=1; i<a->n_dims; ++i) {
+ int64_t ne[GGML_MAX_DIMS] = { 1 };
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
  ne[i] = a->ne[i];
  }

- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);

  result->op = GGML_OP_SUM_ROWS;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3610,8 +3639,8 @@ struct ggml_tensor * ggml_mean(
  is_node = true;
  }

- int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
+ int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

  result->op = GGML_OP_MEAN;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3633,8 +3662,7 @@ struct ggml_tensor * ggml_argmax(
  is_node = true;
  }

- int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);

  result->op = GGML_OP_ARGMAX;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
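
Note: the ggml_add_impl, ggml_mul_impl and ggml_div_impl hunks a few hunks above relax the old ggml_can_repeat_rows / same-shape checks to the general ggml_can_repeat(b, a), so the second operand can now be broadcast along any dimension whose size divides the first operand's. A minimal sketch of the new usage; the context and shapes are illustrative assumptions, not taken from the package.

#include "ggml.h"

// Sketch: a bias broadcast over dim 1; before this release the caller had to
// ggml_repeat() the bias to the full shape first.
static struct ggml_tensor * broadcast_add_demo(struct ggml_context * ctx) {
    struct ggml_tensor * x    = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 8);
    struct ggml_tensor * bias = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64,  1, 8);
    return ggml_add(ctx, x, bias); // ggml_mul and ggml_div broadcast the same way
}
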
@@ -3657,7 +3685,7 @@ struct ggml_tensor * ggml_repeat(
  is_node = true;
  }

- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

  result->op = GGML_OP_REPEAT;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3684,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back(
  return a;
  }

- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

  result->op = GGML_OP_REPEAT_BACK;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3815,12 +3843,25 @@ struct ggml_tensor * ggml_relu_inplace(
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
  }

- // ggml_leaky
+ // ggml_leaky_relu

- struct ggml_tensor * ggml_leaky(
+ struct ggml_tensor * ggml_leaky_relu(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
+ struct ggml_tensor * a, float negative_slope, bool inplace) {
+ bool is_node = false;
+
+ if (!inplace && (a->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+
+ result->op = GGML_OP_LEAKY_RELU;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
+ return result;
  }

  // ggml_gelu
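
Note: the hunk above promotes leaky ReLU from GGML_UNARY_OP_LEAKY with a hard-coded 0.1 slope to its own GGML_OP_LEAKY_RELU with a caller-supplied negative slope stored in op_params. A minimal sketch of the new call; the slope value is illustrative.

#include "ggml.h"

// Sketch: y = x for x > 0, y = 0.01f * x otherwise.
// The final argument selects in-place vs. out-of-place computation.
static struct ggml_tensor * leaky_relu_demo(struct ggml_context * ctx, struct ggml_tensor * x) {
    return ggml_leaky_relu(ctx, x, 0.01f, false);
}
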
@@ -4007,8 +4048,9 @@ static struct ggml_tensor * ggml_group_norm_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

- result->op = GGML_OP_GROUP_NORM;
  result->op_params[0] = n_groups;
+
+ result->op = GGML_OP_GROUP_NORM;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
  result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4046,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat(
  }

  const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

  result->op = GGML_OP_MUL_MAT;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4056,6 +4098,51 @@ struct ggml_tensor * ggml_mul_mat(
  return result;
  }

+ // ggml_mul_mat_id
+
+ struct ggml_tensor * ggml_mul_mat_id(
+ struct ggml_context * ctx,
+ struct ggml_tensor * const as[],
+ int n_as,
+ struct ggml_tensor * ids,
+ int id,
+ struct ggml_tensor * b) {
+
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+ GGML_ASSERT(ids->ne[1] == b->ne[1]);
+ GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
+ GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+
+ bool is_node = false;
+
+ if (as[0]->grad || b->grad) {
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ ggml_set_op_params_i32(result, 0, id);
+ ggml_set_op_params_i32(result, 1, n_as);
+
+ result->op = GGML_OP_MUL_MAT_ID;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = ids;
+ result->src[1] = b;
+
+ for (int i = 0; i < n_as; i++) {
+ struct ggml_tensor * a = as[i];
+ GGML_ASSERT(ggml_are_same_shape(as[0], a));
+ GGML_ASSERT(ggml_can_mul_mat(a, b));
+ GGML_ASSERT(!ggml_is_transposed(a));
+ result->src[i + 2] = a;
+ }
+
+ return result;
+ }
+
  // ggml_out_prod

  struct ggml_tensor * ggml_out_prod(
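
Note: the hunk above adds ggml_mul_mat_id(), a mixture-of-experts matmul: each column of b is multiplied by one of the n_as expert matrices, chosen by row id of the I32 ids tensor. A minimal sketch of the call; the expert count, names and shapes below are illustrative assumptions, not taken from the package.

#include "ggml.h"

#define N_EXPERTS 4

// Sketch: experts[i] all share one shape; ids has one column per column of
// cur, and slot 0 of each column picks the expert for that column.
static struct ggml_tensor * moe_matmul_demo(struct ggml_context * ctx,
                                            struct ggml_tensor * experts[N_EXPERTS],
                                            struct ggml_tensor * ids,   // GGML_TYPE_I32
                                            struct ggml_tensor * cur) {
    return ggml_mul_mat_id(ctx, experts, N_EXPERTS, ids, /*id slot*/ 0, cur);
}
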
@@ -4073,7 +4160,7 @@ struct ggml_tensor * ggml_out_prod(

  // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
  const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

  result->op = GGML_OP_OUT_PROD;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4209,7 +4296,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
  struct ggml_tensor * b,
  size_t nb1,
  size_t offset) {
- return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+ return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
  }

  // ggml_cpy
@@ -4358,7 +4445,7 @@ struct ggml_tensor * ggml_reshape(
  //GGML_ASSERT(false);
  }

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -4673,7 +4760,9 @@ struct ggml_tensor * ggml_get_rows(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b) {
- GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
+ GGML_ASSERT(b->ne[3] == 1);
+ GGML_ASSERT(b->type == GGML_TYPE_I32);

  bool is_node = false;

@@ -4683,7 +4772,7 @@ struct ggml_tensor * ggml_get_rows(

  // TODO: implement non F32 return
  //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
- struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);

  result->op = GGML_OP_GET_ROWS;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
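
Note: per the hunks above, ggml_get_rows() no longer requires a 2-D matrix and a 1-D index vector; it gathers rows per batch (a->ne[2] must match b->ne[1]) and always returns an F32 4-D result. A minimal sketch; the context and sizes are illustrative assumptions.

#include "ggml.h"

// Sketch: gather 16 rows from each of the 8 batches of `a`.
// The result has shape { 128, 16, 8, 1 } and type GGML_TYPE_F32.
static struct ggml_tensor * batched_get_rows_demo(struct ggml_context * ctx) {
    struct ggml_tensor * a   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 50, 8);
    struct ggml_tensor * idx = ggml_new_tensor_2d(ctx, GGML_TYPE_I32,  16,  8);
    return ggml_get_rows(ctx, a, idx);
}
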
@@ -4734,7 +4823,7 @@ struct ggml_tensor * ggml_diag(
  }

  const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
- struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);

  result->op = GGML_OP_DIAG;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5381,7 +5470,7 @@ struct ggml_tensor * ggml_pool_1d(
  is_node = true;
  }

- const int64_t ne[3] = {
+ const int64_t ne[2] = {
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
  a->ne[1],
  };
@@ -5461,6 +5550,30 @@ static struct ggml_tensor * ggml_upscale_impl(
  return result;
  }

+ struct ggml_tensor * ggml_pad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0, int p1, int p2, int p3) {
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] + p0,
+ a->ne[1] + p1,
+ a->ne[2] + p2,
+ a->ne[3] + p3);
+
+ result->op = GGML_OP_PAD;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
+ return result;
+ }
+
  struct ggml_tensor * ggml_upscale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
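
Note: the new ggml_pad() above grows each dimension by p0..p3 elements at the high end, and the forward kernel later in this diff fills the new cells with zeros. A minimal sketch with illustrative sizes.

#include "ggml.h"

// Sketch: round a 30 x 7 tensor up to 32 x 8; the added cells are zeroed.
static struct ggml_tensor * pad_demo(struct ggml_context * ctx) {
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 30, 7);
    return ggml_pad(ctx, x, 2, 1, 0, 0);
}
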
@@ -5468,6 +5581,43 @@ struct ggml_tensor * ggml_upscale(
  return ggml_upscale_impl(ctx, a, scale_factor);
  }

+ // ggml_argsort
+
+ struct ggml_tensor * ggml_argsort(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_sort_order order) {
+ bool is_node = false;
+
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
+
+ ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+ result->op = GGML_OP_ARGSORT;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
+ return result;
+ }
+
+ // ggml_top_k
+
+ struct ggml_tensor * ggml_top_k(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int k) {
+ GGML_ASSERT(a->ne[0] >= k);
+
+ struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
+
+ result = ggml_view_4d(ctx, result,
+ k, result->ne[1], result->ne[2], result->ne[3],
+ result->nb[1], result->nb[2], result->nb[3],
+ 0);
+
+ return result;
+ }
+
  // ggml_flash_attn

  struct ggml_tensor * ggml_flash_attn(
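
Note: ggml_argsort() above returns per-row I32 sort indices, and ggml_top_k() is a thin wrapper that sorts descending and views the first k indices of each row. A minimal sketch; the value of k is illustrative.

#include "ggml.h"

// Sketch: indices of the 10 largest entries in each row of `logits`,
// plus the full ascending order for comparison.
static struct ggml_tensor * top_k_demo(struct ggml_context * ctx, struct ggml_tensor * logits) {
    struct ggml_tensor * order = ggml_argsort(ctx, logits, GGML_SORT_ASC); // all indices, ascending
    (void) order;
    return ggml_top_k(ctx, logits, 10); // I32 view, 10 indices per row
}
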
@@ -5486,7 +5636,7 @@ struct ggml_tensor * ggml_flash_attn(
5486
5636
  }
5487
5637
 
5488
5638
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
5489
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
5639
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
5490
5640
 
5491
5641
  int32_t t = masked ? 1 : 0;
5492
5642
  ggml_set_op_params(result, &t, sizeof(t));
@@ -5519,7 +5669,7 @@ struct ggml_tensor * ggml_flash_ff(
5519
5669
  }
5520
5670
 
5521
5671
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5522
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
5672
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
5523
5673
 
5524
5674
  result->op = GGML_OP_FLASH_FF;
5525
5675
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5635,7 +5785,6 @@ struct ggml_tensor * ggml_win_part(
5635
5785
  const int np = npx*npy;
5636
5786
 
5637
5787
  const int64_t ne[4] = { a->ne[0], w, w, np, };
5638
-
5639
5788
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5640
5789
 
5641
5790
  int32_t params[] = { npx, npy, w };
@@ -6827,7 +6976,7 @@ static void ggml_compute_forward_add_f32(
6827
6976
  const struct ggml_tensor * src0,
6828
6977
  const struct ggml_tensor * src1,
6829
6978
  struct ggml_tensor * dst) {
6830
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
6979
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
6831
6980
 
6832
6981
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6833
6982
  return;
@@ -6860,16 +7009,19 @@ static void ggml_compute_forward_add_f32(
6860
7009
  const int64_t i13 = i03 % ne13;
6861
7010
  const int64_t i12 = i02 % ne12;
6862
7011
  const int64_t i11 = i01 % ne11;
7012
+ const int64_t nr0 = ne00 / ne10;
6863
7013
 
6864
7014
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
6865
7015
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
6866
7016
  float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
6867
7017
 
7018
+ for (int64_t r = 0; r < nr0; ++r) {
6868
7019
  #ifdef GGML_USE_ACCELERATE
6869
- vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
7020
+ vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
6870
7021
  #else
6871
- ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
7022
+ ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
6872
7023
  #endif
7024
+ }
6873
7025
  }
6874
7026
  } else {
6875
7027
  // src1 is not contiguous
@@ -6886,8 +7038,9 @@ static void ggml_compute_forward_add_f32(
6886
7038
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
6887
7039
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
6888
7040
 
6889
- for (int i0 = 0; i0 < ne0; i0++) {
6890
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
7041
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
7042
+ const int64_t i10 = i0 % ne10;
7043
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
6891
7044
 
6892
7045
  dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
6893
7046
  }
@@ -7421,7 +7574,7 @@ static void ggml_compute_forward_acc_f32(
7421
7574
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
7422
7575
 
7423
7576
  // view src0 and dst with these strides and data offset inbytes during acc
7424
- // nb0 is implicitely element_size because src0 and dst are contiguous
7577
+ // nb0 is implicitly element_size because src0 and dst are contiguous
7425
7578
  size_t nb1 = ((int32_t *) dst->op_params)[0];
7426
7579
  size_t nb2 = ((int32_t *) dst->op_params)[1];
7427
7580
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7607,7 +7760,7 @@ static void ggml_compute_forward_mul_f32(
7607
7760
  const struct ggml_tensor * src0,
7608
7761
  const struct ggml_tensor * src1,
7609
7762
  struct ggml_tensor * dst) {
7610
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
7763
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7611
7764
 
7612
7765
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7613
7766
  return;
@@ -7617,6 +7770,8 @@ static void ggml_compute_forward_mul_f32(
7617
7770
 
7618
7771
  #ifdef GGML_USE_CLBLAST
7619
7772
  if (src1->backend == GGML_BACKEND_GPU) {
7773
+ // TODO: OpenCL kernel support full broadcast
7774
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7620
7775
  if (ith == 0) {
7621
7776
  ggml_cl_mul(src0, src1, dst);
7622
7777
  }
@@ -7630,7 +7785,6 @@ static void ggml_compute_forward_mul_f32(
7630
7785
 
7631
7786
  GGML_ASSERT( nb0 == sizeof(float));
7632
7787
  GGML_ASSERT(nb00 == sizeof(float));
7633
- GGML_ASSERT(ne00 == ne10);
7634
7788
 
7635
7789
  if (nb10 == sizeof(float)) {
7636
7790
  for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7642,20 +7796,21 @@ static void ggml_compute_forward_mul_f32(
7642
7796
  const int64_t i13 = i03 % ne13;
7643
7797
  const int64_t i12 = i02 % ne12;
7644
7798
  const int64_t i11 = i01 % ne11;
7799
+ const int64_t nr0 = ne00 / ne10;
7645
7800
 
7646
7801
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7647
7802
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7648
7803
  float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
7649
7804
 
7805
+ for (int64_t r = 0 ; r < nr0; ++r) {
7650
7806
  #ifdef GGML_USE_ACCELERATE
7651
- UNUSED(ggml_vec_mul_f32);
7807
+ UNUSED(ggml_vec_mul_f32);
7652
7808
 
7653
- vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
7809
+ vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
7654
7810
  #else
7655
- ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
7811
+ ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
7656
7812
  #endif
7657
- // }
7658
- // }
7813
+ }
7659
7814
  }
7660
7815
  } else {
7661
7816
  // src1 is not contiguous
@@ -7673,8 +7828,9 @@ static void ggml_compute_forward_mul_f32(
7673
7828
  float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7674
7829
  float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7675
7830
 
7676
- for (int64_t i0 = 0; i0 < ne00; i0++) {
7677
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
7831
+ for (int64_t i0 = 0; i0 < ne00; ++i0) {
7832
+ const int64_t i10 = i0 % ne10;
7833
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
7678
7834
 
7679
7835
  dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
7680
7836
  }
@@ -7708,14 +7864,16 @@ static void ggml_compute_forward_div_f32(
7708
7864
  const struct ggml_tensor * src0,
7709
7865
  const struct ggml_tensor * src1,
7710
7866
  struct ggml_tensor * dst) {
7711
- assert(params->ith == 0);
7712
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7867
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7713
7868
 
7714
7869
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7715
7870
  return;
7716
7871
  }
7717
7872
 
7718
- const int nr = ggml_nrows(src0);
7873
+ const int ith = params->ith;
7874
+ const int nth = params->nth;
7875
+
7876
+ const int64_t nr = ggml_nrows(src0);
7719
7877
 
7720
7878
  GGML_TENSOR_BINARY_OP_LOCALS
7721
7879
 
@@ -7723,41 +7881,50 @@ static void ggml_compute_forward_div_f32(
7723
7881
  GGML_ASSERT(nb00 == sizeof(float));
7724
7882
 
7725
7883
  if (nb10 == sizeof(float)) {
7726
- for (int ir = 0; ir < nr; ++ir) {
7727
- // src0, src1 and dst are same shape => same indices
7728
- const int i3 = ir/(ne2*ne1);
7729
- const int i2 = (ir - i3*ne2*ne1)/ne1;
7730
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
7884
+ for (int64_t ir = ith; ir < nr; ir += nth) {
7885
+ // src0 and dst are same shape => same indices
7886
+ const int64_t i03 = ir/(ne02*ne01);
7887
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
7888
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
7889
+
7890
+ const int64_t i13 = i03 % ne13;
7891
+ const int64_t i12 = i02 % ne12;
7892
+ const int64_t i11 = i01 % ne11;
7893
+ const int64_t nr0 = ne00 / ne10;
7894
+
7895
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7896
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7897
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
7731
7898
 
7899
+ for (int64_t r = 0; r < nr0; ++r) {
7732
7900
  #ifdef GGML_USE_ACCELERATE
7733
- UNUSED(ggml_vec_div_f32);
7901
+ UNUSED(ggml_vec_div_f32);
7734
7902
 
7735
- vDSP_vdiv(
7736
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
7737
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
7738
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
7739
- ne0);
7903
+ vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
7740
7904
  #else
7741
- ggml_vec_div_f32(ne0,
7742
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
7743
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
7744
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
7905
+ ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
7745
7906
  #endif
7746
- // }
7747
- // }
7907
+ }
7748
7908
  }
7749
7909
  } else {
7750
7910
  // src1 is not contiguous
7751
- for (int ir = 0; ir < nr; ++ir) {
7752
- // src0, src1 and dst are same shape => same indices
7753
- const int i3 = ir/(ne2*ne1);
7754
- const int i2 = (ir - i3*ne2*ne1)/ne1;
7755
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
7911
+ for (int64_t ir = ith; ir < nr; ir += nth) {
7912
+ // src0 and dst are same shape => same indices
7913
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
7914
+ const int64_t i03 = ir/(ne02*ne01);
7915
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
7916
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
7756
7917
 
7757
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
7758
- float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
7759
- for (int i0 = 0; i0 < ne0; i0++) {
7760
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
7918
+ const int64_t i13 = i03 % ne13;
7919
+ const int64_t i12 = i02 % ne12;
7920
+ const int64_t i11 = i01 % ne11;
7921
+
7922
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
7923
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
7924
+
7925
+ for (int64_t i0 = 0; i0 < ne00; ++i0) {
7926
+ const int64_t i10 = i0 % ne10;
7927
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
7761
7928
 
7762
7929
  dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
7763
7930
  }
@@ -8203,7 +8370,7 @@ static void ggml_compute_forward_repeat_f16(
8203
8370
  return;
8204
8371
  }
8205
8372
 
8206
- GGML_TENSOR_UNARY_OP_LOCALS;
8373
+ GGML_TENSOR_UNARY_OP_LOCALS
8207
8374
 
8208
8375
  // guaranteed to be an integer due to the check in ggml_can_repeat
8209
8376
  const int nr0 = (int)(ne0/ne00);
@@ -8348,6 +8515,7 @@ static void ggml_compute_forward_concat_f32(
8348
8515
  GGML_ASSERT(src0->nb[0] == sizeof(float));
8349
8516
 
8350
8517
  const int ith = params->ith;
8518
+ const int nth = params->nth;
8351
8519
 
8352
8520
  GGML_TENSOR_BINARY_OP_LOCALS
8353
8521
 
@@ -8357,7 +8525,7 @@ static void ggml_compute_forward_concat_f32(
8357
8525
  GGML_ASSERT(nb10 == sizeof(float));
8358
8526
 
8359
8527
  for (int i3 = 0; i3 < ne3; i3++) {
8360
- for (int i2 = ith; i2 < ne2; i2++) {
8528
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
8361
8529
  if (i2 < ne02) { // src0
8362
8530
  for (int i1 = 0; i1 < ne1; i1++) {
8363
8531
  for (int i0 = 0; i0 < ne0; i0++) {
@@ -8869,10 +9037,9 @@ static void ggml_compute_forward_silu(
8869
9037
  } break;
8870
9038
  }
8871
9039
  }
9040
+ // ggml_compute_forward_leaky_relu
8872
9041
 
8873
- // ggml_compute_forward_leaky
8874
-
8875
- static void ggml_compute_forward_leaky_f32(
9042
+ static void ggml_compute_forward_leaky_relu_f32(
8876
9043
  const struct ggml_compute_params * params,
8877
9044
  const struct ggml_tensor * src0,
8878
9045
  struct ggml_tensor * dst) {
@@ -8886,24 +9053,27 @@ static void ggml_compute_forward_leaky_f32(
8886
9053
  const int n = ggml_nrows(src0);
8887
9054
  const int nc = src0->ne[0];
8888
9055
 
9056
+ float negative_slope;
9057
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
9058
+
8889
9059
  assert(dst->nb[0] == sizeof(float));
8890
9060
  assert(src0->nb[0] == sizeof(float));
8891
9061
 
8892
9062
  for (int i = 0; i < n; i++) {
8893
- ggml_vec_leaky_f32(nc,
9063
+ ggml_vec_leaky_relu_f32(nc,
8894
9064
  (float *) ((char *) dst->data + i*( dst->nb[1])),
8895
- (float *) ((char *) src0->data + i*(src0->nb[1])));
9065
+ (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
8896
9066
  }
8897
9067
  }
8898
9068
 
8899
- static void ggml_compute_forward_leaky(
9069
+ static void ggml_compute_forward_leaky_relu(
8900
9070
  const struct ggml_compute_params * params,
8901
9071
  const struct ggml_tensor * src0,
8902
9072
  struct ggml_tensor * dst) {
8903
9073
  switch (src0->type) {
8904
9074
  case GGML_TYPE_F32:
8905
9075
  {
8906
- ggml_compute_forward_leaky_f32(params, src0, dst);
9076
+ ggml_compute_forward_leaky_relu_f32(params, src0, dst);
8907
9077
  } break;
8908
9078
  default:
8909
9079
  {
@@ -9392,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9392
9562
  const int64_t ne0 = dst->ne[0];
9393
9563
  const int64_t ne1 = dst->ne[1];
9394
9564
 
9565
+ // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
9566
+ // all the experts for each batch element and the processing would become incredibly slow
9395
9567
  // TODO: find the optimal values for these
9396
- if (ggml_is_contiguous(src0) &&
9568
+ if (dst->op != GGML_OP_MUL_MAT_ID &&
9569
+ ggml_is_contiguous(src0) &&
9397
9570
  ggml_is_contiguous(src1) &&
9398
9571
  //src0->type == GGML_TYPE_F32 &&
9399
9572
  src1->type == GGML_TYPE_F32 &&
@@ -9407,11 +9580,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9407
9580
  }
9408
9581
  #endif
9409
9582
 
9583
+ // off1 = offset in i11 and i1
9584
+ // cne1 = ne11 and ne1
9585
+ // in a normal matrix multiplication, off1 = 0 and cne1 = ne1
9586
+ // during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
9410
9587
  static void ggml_compute_forward_mul_mat(
9411
9588
  const struct ggml_compute_params * params,
9412
9589
  const struct ggml_tensor * src0,
9413
9590
  const struct ggml_tensor * src1,
9414
- struct ggml_tensor * dst) {
9591
+ struct ggml_tensor * dst,
9592
+ int64_t off1, int64_t cne1) {
9415
9593
  int64_t t0 = ggml_perf_time_us();
9416
9594
  UNUSED(t0);
9417
9595
 
@@ -9479,10 +9657,9 @@ static void ggml_compute_forward_mul_mat(
9479
9657
  const int64_t i03 = i13/r3;
9480
9658
  const int64_t i02 = i12/r2;
9481
9659
 
9482
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9483
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
9484
-
9485
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9660
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
9661
+ const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
9662
+ float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
9486
9663
 
9487
9664
  if (type != GGML_TYPE_F32) {
9488
9665
  float * const wdata = params->wdata;
@@ -9499,10 +9676,10 @@ static void ggml_compute_forward_mul_mat(
9499
9676
  }
9500
9677
 
9501
9678
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9502
- ne11, ne01, ne10,
9503
- 1.0f, y, ne10,
9504
- x, ne00,
9505
- 0.0f, d, ne01);
9679
+ cne1, ne01, ne10,
9680
+ 1.0f, y, ne10,
9681
+ x, ne00,
9682
+ 0.0f, d, ne01);
9506
9683
  }
9507
9684
  }
9508
9685
 
@@ -9515,7 +9692,10 @@ static void ggml_compute_forward_mul_mat(
9515
9692
  if (params->type == GGML_TASK_INIT) {
9516
9693
  if (src1->type != vec_dot_type) {
9517
9694
  char * wdata = params->wdata;
9518
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9695
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9696
+
9697
+ assert(params->wsize >= ne11*ne12*ne13*row_size);
9698
+ assert(src1->type == GGML_TYPE_F32);
9519
9699
 
9520
9700
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
9521
9701
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9535,10 +9715,10 @@ static void ggml_compute_forward_mul_mat(
9535
9715
  }
9536
9716
 
9537
9717
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9538
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
9718
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9539
9719
 
9540
9720
  const int64_t nr0 = ne01; // src0 rows
9541
- const int64_t nr1 = ne11*ne12*ne13; // src1 rows
9721
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
9542
9722
 
9543
9723
  //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
9544
9724
 
@@ -9580,9 +9760,9 @@ static void ggml_compute_forward_mul_mat(
9580
9760
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
9581
9761
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
9582
9762
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
9583
- const int64_t i13 = (ir1/(ne12*ne11));
9584
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
9585
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
9763
+ const int64_t i13 = (ir1/(ne12*cne1));
9764
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
9765
+ const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
9586
9766
 
9587
9767
  // broadcast src0 into src1
9588
9768
  const int64_t i03 = i13/r3;
@@ -9618,6 +9798,34 @@ static void ggml_compute_forward_mul_mat(
9618
9798
  }
9619
9799
  }
9620
9800
 
9801
+ // ggml_compute_forward_mul_mat_id
9802
+
9803
+ static void ggml_compute_forward_mul_mat_id(
9804
+ const struct ggml_compute_params * params,
9805
+ const struct ggml_tensor * src0,
9806
+ const struct ggml_tensor * src1,
9807
+ struct ggml_tensor * dst) {
9808
+
9809
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9810
+ // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
9811
+ ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
9812
+ return;
9813
+ }
9814
+
9815
+ const struct ggml_tensor * ids = src0;
9816
+ const int id = ggml_get_op_params_i32(dst, 0);
9817
+ const int n_as = ggml_get_op_params_i32(dst, 1);
9818
+
9819
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
9820
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
9821
+
9822
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
9823
+
9824
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
9825
+ ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
9826
+ }
9827
+ }
9828
+
9621
9829
  // ggml_compute_forward_out_prod
9622
9830
 
9623
9831
  static void ggml_compute_forward_out_prod_f32(
@@ -10027,7 +10235,7 @@ static void ggml_compute_forward_set_f32(
10027
10235
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
10028
10236
 
10029
10237
  // view src0 and dst with these strides and data offset inbytes during set
10030
- // nb0 is implicitely element_size because src0 and dst are contiguous
10238
+ // nb0 is implicitly element_size because src0 and dst are contiguous
10031
10239
  size_t nb1 = ((int32_t *) dst->op_params)[0];
10032
10240
  size_t nb2 = ((int32_t *) dst->op_params)[1];
10033
10241
  size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -10191,21 +10399,30 @@ static void ggml_compute_forward_get_rows_q(
10191
10399
  return;
10192
10400
  }
10193
10401
 
10194
- const int nc = src0->ne[0];
10195
- const int nr = ggml_nelements(src1);
10402
+ GGML_TENSOR_BINARY_OP_LOCALS
10403
+
10404
+ const int64_t nc = ne00;
10405
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10406
+
10196
10407
  const enum ggml_type type = src0->type;
10197
10408
  ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
10198
10409
 
10199
- assert( dst->ne[0] == nc);
10200
- assert( dst->ne[1] == nr);
10201
- assert(src0->nb[0] == ggml_type_size(type));
10410
+ assert(ne0 == nc);
10411
+ assert(ne02 == ne11);
10412
+ assert(nb00 == ggml_type_size(type));
10413
+ assert(ggml_nrows(dst) == nr);
10202
10414
 
10203
- for (int i = 0; i < nr; ++i) {
10204
- const int r = ((int32_t *) src1->data)[i];
10415
+ // TODO: multi-thread
10416
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10417
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10418
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10419
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10205
10420
 
10206
- dequantize_row_q(
10207
- (const void *) ((char *) src0->data + r*src0->nb[1]),
10208
- (float *) ((char *) dst->data + i*dst->nb[1]), nc);
10421
+ dequantize_row_q(
10422
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10423
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10424
+ }
10425
+ }
10209
10426
  }
10210
10427
  }
10211
10428
 
@@ -10220,19 +10437,26 @@ static void ggml_compute_forward_get_rows_f16(
10220
10437
  return;
10221
10438
  }
10222
10439
 
10223
- const int nc = src0->ne[0];
10224
- const int nr = ggml_nelements(src1);
10440
+ GGML_TENSOR_BINARY_OP_LOCALS
10225
10441
 
10226
- assert( dst->ne[0] == nc);
10227
- assert( dst->ne[1] == nr);
10228
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
10442
+ const int64_t nc = ne00;
10443
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10229
10444
 
10230
- for (int i = 0; i < nr; ++i) {
10231
- const int r = ((int32_t *) src1->data)[i];
10445
+ assert(ne0 == nc);
10446
+ assert(ne02 == ne11);
10447
+ assert(nb00 == sizeof(ggml_fp16_t));
10448
+ assert(ggml_nrows(dst) == nr);
10232
10449
 
10233
- for (int j = 0; j < nc; ++j) {
10234
- ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
10235
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
10450
+ // TODO: multi-thread
10451
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10452
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10453
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10454
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10455
+
10456
+ ggml_fp16_to_fp32_row(
10457
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
10458
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
10459
+ }
10236
10460
  }
10237
10461
  }
10238
10462
  }
@@ -10248,19 +10472,27 @@ static void ggml_compute_forward_get_rows_f32(
10248
10472
  return;
10249
10473
  }
10250
10474
 
10251
- const int nc = src0->ne[0];
10252
- const int nr = ggml_nelements(src1);
10475
+ GGML_TENSOR_BINARY_OP_LOCALS
10253
10476
 
10254
- assert( dst->ne[0] == nc);
10255
- assert( dst->ne[1] == nr);
10256
- assert(src0->nb[0] == sizeof(float));
10477
+ const int64_t nc = ne00;
10478
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
10257
10479
 
10258
- for (int i = 0; i < nr; ++i) {
10259
- const int r = ((int32_t *) src1->data)[i];
10480
+ assert(ne0 == nc);
10481
+ assert(ne02 == ne11);
10482
+ assert(nb00 == sizeof(float));
10483
+ assert(ggml_nrows(dst) == nr);
10260
10484
 
10261
- ggml_vec_cpy_f32(nc,
10262
- (float *) ((char *) dst->data + i*dst->nb[1]),
10263
- (float *) ((char *) src0->data + r*src0->nb[1]));
10485
+ // TODO: multi-thread
10486
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10487
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10488
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
10489
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
10490
+
10491
+ ggml_vec_cpy_f32(nc,
10492
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
10493
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
10494
+ }
10495
+ }
10264
10496
  }
10265
10497
  }
10266
10498
 
@@ -11980,6 +12212,7 @@ static void ggml_compute_forward_upscale_f32(
11980
12212
  GGML_ASSERT(src0->nb[0] == sizeof(float));
11981
12213
 
11982
12214
  const int ith = params->ith;
12215
+ const int nth = params->nth;
11983
12216
 
11984
12217
  GGML_TENSOR_UNARY_OP_LOCALS
11985
12218
 
@@ -11987,16 +12220,17 @@ static void ggml_compute_forward_upscale_f32(
11987
12220
 
11988
12221
  // TODO: optimize
11989
12222
 
11990
- for (int i03 = 0; i03 < ne03; i03++) {
11991
- for (int i02 = ith; i02 < ne02; i02++) {
11992
- for (int m = 0; m < dst->ne[1]; m++) {
11993
- int i01 = m / scale_factor;
11994
- for (int n = 0; n < dst->ne[0]; n++) {
11995
- int i00 = n / scale_factor;
11996
-
11997
- const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
12223
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
12224
+ const int64_t i03 = i3;
12225
+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
12226
+ const int64_t i02 = i2;
12227
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
12228
+ const int64_t i01 = i1 / scale_factor;
12229
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
12230
+ const int64_t i00 = i0 / scale_factor;
11998
12231
 
11999
- float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
12232
+ const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
12233
+ float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
12000
12234
 
12001
12235
  *y = *x;
12002
12236
  }
@@ -12021,6 +12255,125 @@ static void ggml_compute_forward_upscale(
12021
12255
  }
12022
12256
  }
12023
12257
 
12258
+ // ggml_compute_forward_pad
12259
+
12260
+ static void ggml_compute_forward_pad_f32(
12261
+ const struct ggml_compute_params * params,
12262
+ const struct ggml_tensor * src0,
12263
+ struct ggml_tensor * dst) {
12264
+
12265
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12266
+ return;
12267
+ }
12268
+
12269
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
12270
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
12271
+
12272
+ const int ith = params->ith;
12273
+ const int nth = params->nth;
12274
+
12275
+ GGML_TENSOR_UNARY_OP_LOCALS
12276
+
12277
+ float * dst_ptr = (float *) dst->data;
12278
+
12279
+ // TODO: optimize
12280
+
12281
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
12282
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
12283
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
12284
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
12285
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
12286
+
12287
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12288
+
12289
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
12290
+ dst_ptr[dst_idx] = *src_ptr;
12291
+ } else {
12292
+ dst_ptr[dst_idx] = 0;
12293
+ }
12294
+ }
12295
+ }
12296
+ }
12297
+ }
12298
+ }
12299
+
12300
+ static void ggml_compute_forward_pad(
12301
+ const struct ggml_compute_params * params,
12302
+ const struct ggml_tensor * src0,
12303
+ struct ggml_tensor * dst) {
12304
+ switch (src0->type) {
12305
+ case GGML_TYPE_F32:
12306
+ {
12307
+ ggml_compute_forward_pad_f32(params, src0, dst);
12308
+ } break;
12309
+ default:
12310
+ {
12311
+ GGML_ASSERT(false);
12312
+ } break;
12313
+ }
12314
+ }
12315
+
12316
+ // ggml_compute_forward_argsort
12317
+
12318
+ static void ggml_compute_forward_argsort_f32(
12319
+ const struct ggml_compute_params * params,
12320
+ const struct ggml_tensor * src0,
12321
+ struct ggml_tensor * dst) {
12322
+
12323
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12324
+ return;
12325
+ }
12326
+
12327
+ GGML_TENSOR_UNARY_OP_LOCALS
12328
+
12329
+ GGML_ASSERT(nb0 == sizeof(float));
12330
+
12331
+ const int ith = params->ith;
12332
+ const int nth = params->nth;
12333
+
12334
+ const int64_t nr = ggml_nrows(src0);
12335
+
12336
+ enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
12337
+
12338
+ for (int64_t i = ith; i < nr; i += nth) {
12339
+ int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
12340
+ const float * src_data = (float *)((char *) src0->data + i*nb01);
12341
+
12342
+ for (int64_t j = 0; j < ne0; j++) {
12343
+ dst_data[j] = j;
12344
+ }
12345
+
12346
+ // C doesn't have a functional sort, so we do a bubble sort instead
12347
+ for (int64_t j = 0; j < ne0; j++) {
12348
+ for (int64_t k = j + 1; k < ne0; k++) {
12349
+ if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
12350
+ (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
12351
+ int32_t tmp = dst_data[j];
12352
+ dst_data[j] = dst_data[k];
12353
+ dst_data[k] = tmp;
12354
+ }
12355
+ }
12356
+ }
12357
+ }
12358
+ }
12359
+
12360
+ static void ggml_compute_forward_argsort(
12361
+ const struct ggml_compute_params * params,
12362
+ const struct ggml_tensor * src0,
12363
+ struct ggml_tensor * dst) {
12364
+
12365
+ switch (src0->type) {
12366
+ case GGML_TYPE_F32:
12367
+ {
12368
+ ggml_compute_forward_argsort_f32(params, src0, dst);
12369
+ } break;
12370
+ default:
12371
+ {
12372
+ GGML_ASSERT(false);
12373
+ } break;
12374
+ }
12375
+ }
12376
+
12024
12377
  // ggml_compute_forward_flash_attn
12025
12378
 
12026
12379
  static void ggml_compute_forward_flash_attn_f32(
@@ -13167,10 +13520,6 @@ static void ggml_compute_forward_unary(
13167
13520
  {
13168
13521
  ggml_compute_forward_silu(params, src0, dst);
13169
13522
  } break;
13170
- case GGML_UNARY_OP_LEAKY:
13171
- {
13172
- ggml_compute_forward_leaky(params, src0, dst);
13173
- } break;
13174
13523
  default:
13175
13524
  {
13176
13525
  GGML_ASSERT(false);
@@ -13842,7 +14191,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  } break;
  case GGML_OP_MUL_MAT:
  {
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+ } break;
+ case GGML_OP_MUL_MAT_ID:
+ {
+ ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
  } break;
  case GGML_OP_OUT_PROD:
  {
@@ -13948,6 +14301,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  {
  ggml_compute_forward_upscale(params, tensor->src[0], tensor);
  } break;
+ case GGML_OP_PAD:
+ {
+ ggml_compute_forward_pad(params, tensor->src[0], tensor);
+ } break;
+ case GGML_OP_ARGSORT:
+ {
+ ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+ } break;
+ case GGML_OP_LEAKY_RELU:
+ {
+ ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
+ } break;
  case GGML_OP_FLASH_ATTN:
  {
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14202,7 +14567,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14202
14567
  return replacements->vals[i];
14203
14568
  }
14204
14569
 
14205
- struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14570
+ struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
14206
14571
 
14207
14572
  // insert clone into replacements
14208
14573
  GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@@ -14272,7 +14637,7 @@ void ggml_build_backward_gradient_checkpointing(
14272
14637
  // insert new tensors recomputing src, reusing already made replacements,
14273
14638
  // remember replacements: remember new tensors with mapping from corresponding gf nodes
14274
14639
  // recurse for input tensors,
14275
- // unless (i.e. terminating when) input tensors are replacments (like checkpoints)
14640
+ // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
14276
14641
  node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
14277
14642
  }
14278
14643
  // insert rewritten backward node with replacements made into resulting backward graph gb
@@ -14598,6 +14963,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14598
14963
  zero_table);
14599
14964
  }
14600
14965
  } break;
14966
+ case GGML_OP_MUL_MAT_ID:
14967
+ {
14968
+ GGML_ASSERT(false); // TODO: not implemented
14969
+ } break;
14601
14970
  case GGML_OP_OUT_PROD:
14602
14971
  {
14603
14972
  GGML_ASSERT(false); // TODO: not implemented
@@ -14936,6 +15305,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14936
15305
  {
14937
15306
  GGML_ASSERT(false); // TODO: not implemented
14938
15307
  } break;
15308
+ case GGML_OP_PAD:
15309
+ {
15310
+ GGML_ASSERT(false); // TODO: not implemented
15311
+ } break;
15312
+ case GGML_OP_ARGSORT:
15313
+ {
15314
+ GGML_ASSERT(false); // TODO: not implemented
15315
+ } break;
15316
+ case GGML_OP_LEAKY_RELU:
15317
+ {
15318
+ GGML_ASSERT(false); // TODO: not implemented
15319
+ } break;
14939
15320
  case GGML_OP_FLASH_ATTN:
14940
15321
  {
14941
15322
  struct ggml_tensor * flash_grad = NULL;
@@ -15296,12 +15677,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15296
15677
  return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15297
15678
  }
15298
15679
 
15299
- struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15300
- const size_t obj_size = sizeof(struct ggml_cgraph);
15301
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15302
- struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15303
-
15304
- *cgraph = (struct ggml_cgraph) {
15680
+ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
15681
+ struct ggml_cgraph cgraph = {
15305
15682
  /*.size =*/ 0,
15306
15683
  /*.n_nodes =*/ i1 - i0,
15307
15684
  /*.n_leafs =*/ 0,
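Note that ggml_graph_view now returns a ggml_cgraph by value rather than allocating a view object inside the context, and drops the ggml_context parameter. Based only on the signatures visible in this hunk, call sites change along these lines:

    /* 0.9.x: view allocated inside the context */
    struct ggml_cgraph * gv = ggml_graph_view(ctx, gf, i0, i1);

    /* 0.10.x: plain value, no context needed */
    struct ggml_cgraph gv = ggml_graph_view(gf, i0, i1);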
@@ -15536,7 +15913,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15536
15913
  n_tasks = n_threads;
15537
15914
  } break;
15538
15915
  case GGML_OP_SUB:
15539
- case GGML_OP_DIV:
15540
15916
  case GGML_OP_SQR:
15541
15917
  case GGML_OP_SQRT:
15542
15918
  case GGML_OP_LOG:
@@ -15546,6 +15922,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15546
15922
  case GGML_OP_ARGMAX:
15547
15923
  case GGML_OP_REPEAT:
15548
15924
  case GGML_OP_REPEAT_BACK:
15925
+ case GGML_OP_LEAKY_RELU:
15549
15926
  {
15550
15927
  n_tasks = 1;
15551
15928
  } break;
@@ -15558,7 +15935,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15558
15935
  case GGML_UNARY_OP_TANH:
15559
15936
  case GGML_UNARY_OP_ELU:
15560
15937
  case GGML_UNARY_OP_RELU:
15561
- case GGML_UNARY_OP_LEAKY:
15562
15938
  {
15563
15939
  n_tasks = 1;
15564
15940
  } break;
@@ -15569,10 +15945,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15569
15945
  {
15570
15946
  n_tasks = n_threads;
15571
15947
  } break;
15948
+ default:
15949
+ GGML_ASSERT(false);
15572
15950
  }
15573
15951
  break;
15574
15952
  case GGML_OP_SILU_BACK:
15575
15953
  case GGML_OP_MUL:
15954
+ case GGML_OP_DIV:
15576
15955
  case GGML_OP_NORM:
15577
15956
  case GGML_OP_RMS_NORM:
15578
15957
  case GGML_OP_RMS_NORM_BACK:
@@ -15610,6 +15989,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15610
15989
  }
15611
15990
  #endif
15612
15991
  } break;
15992
+ case GGML_OP_MUL_MAT_ID:
15993
+ {
15994
+ // FIXME: blas
15995
+ n_tasks = n_threads;
15996
+ } break;
15613
15997
  case GGML_OP_OUT_PROD:
15614
15998
  {
15615
15999
  n_tasks = n_threads;
@@ -15629,7 +16013,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15629
16013
  } break;
15630
16014
  case GGML_OP_DIAG_MASK_ZERO:
15631
16015
  case GGML_OP_DIAG_MASK_INF:
15632
- case GGML_OP_SOFT_MAX:
15633
16016
  case GGML_OP_SOFT_MAX_BACK:
15634
16017
  case GGML_OP_ROPE:
15635
16018
  case GGML_OP_ROPE_BACK:
@@ -15645,6 +16028,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15645
16028
  {
15646
16029
  n_tasks = 1; //TODO
15647
16030
  } break;
16031
+ case GGML_OP_SOFT_MAX:
16032
+ {
16033
+ n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
16034
+ } break;
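The SOFT_MAX task count moves out of ggml_graph_plan (see the deletion further down) and into ggml_get_n_tasks, capped at 4 threads and at the row count of src0: with n_threads = 8 and 2 rows this yields MIN(MIN(4, 8), 2) = 2 tasks, while with 1000 rows it yields 4.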
15648
16035
  case GGML_OP_CONV_TRANSPOSE_1D:
15649
16036
  {
15650
16037
  n_tasks = n_threads;
@@ -15666,6 +16053,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15666
16053
  {
15667
16054
  n_tasks = n_threads;
15668
16055
  } break;
16056
+ case GGML_OP_PAD:
16057
+ {
16058
+ n_tasks = n_threads;
16059
+ } break;
16060
+ case GGML_OP_ARGSORT:
16061
+ {
16062
+ n_tasks = n_threads;
16063
+ } break;
15669
16064
  case GGML_OP_FLASH_ATTN:
15670
16065
  {
15671
16066
  n_tasks = n_threads;
@@ -15728,6 +16123,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15728
16123
  {
15729
16124
  n_tasks = 1;
15730
16125
  } break;
16126
+ case GGML_OP_COUNT:
16127
+ {
16128
+ GGML_ASSERT(false);
16129
+ } break;
15731
16130
  default:
15732
16131
  {
15733
16132
  fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -15876,18 +16275,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15876
16275
 
15877
16276
  // thread scheduling for the different operations + work buffer size estimation
15878
16277
  for (int i = 0; i < cgraph->n_nodes; i++) {
15879
- int n_tasks = 1;
15880
-
15881
16278
  struct ggml_tensor * node = cgraph->nodes[i];
15882
16279
 
16280
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16281
+
15883
16282
  size_t cur = 0;
15884
16283
 
15885
16284
  switch (node->op) {
15886
16285
  case GGML_OP_CPY:
15887
16286
  case GGML_OP_DUP:
15888
16287
  {
15889
- n_tasks = n_threads;
15890
-
15891
16288
  if (ggml_is_quantized(node->type)) {
15892
16289
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
15893
16290
  }
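From here on, ggml_graph_plan no longer sets n_tasks inside each op case; the per-op assignments deleted in the following hunks are replaced by a single call to ggml_get_n_tasks, leaving the switch responsible only for work-buffer sizing. The resulting loop shape, sketched from the lines shown here (the final MAX accumulation is assumed, not visible in this diff):

    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];

        // task count decided in one place
        const int n_tasks = ggml_get_n_tasks(node, n_threads);

        size_t cur = 0;                  // per-node scratch requirement
        switch (node->op) {
            /* ... op-specific work-size estimates only ... */
        }

        work_size = MAX(work_size, cur); // assumed accumulation into the plan
    }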
@@ -15895,16 +16292,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15895
16292
  case GGML_OP_ADD:
15896
16293
  case GGML_OP_ADD1:
15897
16294
  {
15898
- n_tasks = n_threads;
15899
-
15900
16295
  if (ggml_is_quantized(node->src[0]->type)) {
15901
16296
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
15902
16297
  }
15903
16298
  } break;
15904
16299
  case GGML_OP_ACC:
15905
16300
  {
15906
- n_tasks = n_threads;
15907
-
15908
16301
  if (ggml_is_quantized(node->src[0]->type)) {
15909
16302
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
15910
16303
  }
@@ -15927,21 +16320,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15927
16320
  } else
15928
16321
  #endif
15929
16322
  if (node->src[1]->type != vec_dot_type) {
15930
- cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16323
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
16324
+ }
16325
+ } break;
16326
+ case GGML_OP_MUL_MAT_ID:
16327
+ {
16328
+ const struct ggml_tensor * a = node->src[2];
16329
+ const struct ggml_tensor * b = node->src[1];
16330
+ const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
16331
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16332
+ if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
16333
+ if (a->type != GGML_TYPE_F32) {
16334
+ // here we need memory just for single 2D matrix from src0
16335
+ cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
16336
+ }
16337
+ } else
16338
+ #endif
16339
+ if (b->type != vec_dot_type) {
16340
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
15931
16341
  }
15932
16342
  } break;
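The scratch-size estimates for MUL_MAT and the new MUL_MAT_ID case now go through ggml_row_size, replacing the open-coded ggml_type_size(type)*n/ggml_blck_size(type) expression (for block-quantized types, n elements occupy n/block_size blocks). A sketch of the assumed equivalence:

    // Bytes needed for n elements of the given type, accounting for block quantization.
    // Assumed to match the expression this hunk replaces; n should be a multiple of the block size.
    static size_t row_size(enum ggml_type type, int64_t n) {
        return ggml_type_size(type) * n / ggml_blck_size(type);
    }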
15933
16343
  case GGML_OP_OUT_PROD:
15934
16344
  {
15935
- n_tasks = n_threads;
15936
-
15937
16345
  if (ggml_is_quantized(node->src[0]->type)) {
15938
16346
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
15939
16347
  }
15940
16348
  } break;
15941
16349
  case GGML_OP_SOFT_MAX:
15942
16350
  {
15943
- n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
15944
-
15945
16351
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
15946
16352
  } break;
15947
16353
  case GGML_OP_CONV_TRANSPOSE_1D:
@@ -15969,10 +16375,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15969
16375
  GGML_ASSERT(false);
15970
16376
  }
15971
16377
  } break;
15972
- case GGML_OP_IM2COL:
15973
- {
15974
- n_tasks = n_threads;
15975
- } break;
15976
16378
  case GGML_OP_CONV_TRANSPOSE_2D:
15977
16379
  {
15978
16380
  const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -15989,8 +16391,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
15989
16391
  } break;
15990
16392
  case GGML_OP_FLASH_ATTN:
15991
16393
  {
15992
- n_tasks = n_threads;
15993
-
15994
16394
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
15995
16395
 
15996
16396
  if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16003,8 +16403,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16003
16403
  } break;
16004
16404
  case GGML_OP_FLASH_FF:
16005
16405
  {
16006
- n_tasks = n_threads;
16007
-
16008
16406
  if (node->src[1]->type == GGML_TYPE_F32) {
16009
16407
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16010
16408
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16015,8 +16413,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16015
16413
  } break;
16016
16414
  case GGML_OP_FLASH_ATTN_BACK:
16017
16415
  {
16018
- n_tasks = n_threads;
16019
-
16020
16416
  const int64_t D = node->src[0]->ne[0];
16021
16417
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16022
16418
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16031,8 +16427,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16031
16427
 
16032
16428
  case GGML_OP_CROSS_ENTROPY_LOSS:
16033
16429
  {
16034
- n_tasks = n_threads;
16035
-
16036
16430
  cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16037
16431
  } break;
16038
16432
  case GGML_OP_COUNT:
@@ -16174,7 +16568,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
16174
16568
  fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
16175
16569
  ggml_type_name(tensor->type),
16176
16570
  ggml_op_name (tensor->op),
16177
- tensor->n_dims,
16571
+ ggml_n_dims(tensor),
16178
16572
  ne[0], ne[1], ne[2], ne[3],
16179
16573
  nb[0], nb[1], nb[2], nb[3],
16180
16574
  tensor->data,
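tensor->n_dims is gone in this release, so the exporter (and the Adam optimizer further down) asks ggml_n_dims(tensor) instead, which derives the rank from the ne[] array. A sketch of the assumed behaviour, where trailing axes of extent 1 do not count and the minimum is 1:

    static int n_dims(const struct ggml_tensor * t) {
        for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
            if (t->ne[i] > 1) {
                return i + 1;   // highest axis with more than one element
            }
        }
        return 1;               // scalars and vectors report one dimension
    }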
@@ -16189,7 +16583,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16189
16583
  arg,
16190
16584
  ggml_type_name(tensor->type),
16191
16585
  ggml_op_name (tensor->op),
16192
- tensor->n_dims,
16586
+ ggml_n_dims(tensor),
16193
16587
  ne[0], ne[1], ne[2], ne[3],
16194
16588
  nb[0], nb[1], nb[2], nb[3],
16195
16589
  tensor->data,
@@ -16279,11 +16673,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16279
16673
 
16280
16674
  const uint32_t type = tensor->type;
16281
16675
  const uint32_t op = tensor->op;
16282
- const uint32_t n_dims = tensor->n_dims;
16283
16676
 
16284
16677
  fwrite(&type, sizeof(uint32_t), 1, fout);
16285
16678
  fwrite(&op, sizeof(uint32_t), 1, fout);
16286
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16287
16679
 
16288
16680
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16289
16681
  const uint64_t ne = tensor->ne[j];
@@ -16313,11 +16705,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16313
16705
 
16314
16706
  const uint32_t type = tensor->type;
16315
16707
  const uint32_t op = tensor->op;
16316
- const uint32_t n_dims = tensor->n_dims;
16317
16708
 
16318
16709
  fwrite(&type, sizeof(uint32_t), 1, fout);
16319
16710
  fwrite(&op, sizeof(uint32_t), 1, fout);
16320
- fwrite(&n_dims, sizeof(uint32_t), 1, fout);
16321
16711
 
16322
16712
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
16323
16713
  const uint64_t ne = tensor->ne[j];
@@ -16489,12 +16879,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16489
16879
  {
16490
16880
  uint32_t type;
16491
16881
  uint32_t op;
16492
- uint32_t n_dims;
16493
16882
 
16494
16883
  for (uint32_t i = 0; i < n_leafs; ++i) {
16495
16884
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16496
16885
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16497
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16498
16886
 
16499
16887
  int64_t ne[GGML_MAX_DIMS];
16500
16888
  size_t nb[GGML_MAX_DIMS];
@@ -16510,7 +16898,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16510
16898
  nb[j] = nb_cur;
16511
16899
  }
16512
16900
 
16513
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
16901
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16514
16902
 
16515
16903
  tensor->op = (enum ggml_op) op;
16516
16904
 
@@ -16527,7 +16915,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16527
16915
 
16528
16916
  ptr += ggml_nbytes(tensor);
16529
16917
 
16530
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
16918
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16531
16919
  }
16532
16920
  }
16533
16921
 
@@ -16537,12 +16925,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16537
16925
  {
16538
16926
  uint32_t type;
16539
16927
  uint32_t op;
16540
- uint32_t n_dims;
16541
16928
 
16542
16929
  for (uint32_t i = 0; i < n_nodes; ++i) {
16543
16930
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
16544
16931
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
16545
- n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
16546
16932
 
16547
16933
  enum ggml_op eop = (enum ggml_op) op;
16548
16934
 
@@ -16613,7 +16999,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16613
16999
  } break;
16614
17000
  default:
16615
17001
  {
16616
- tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
17002
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
16617
17003
 
16618
17004
  tensor->op = eop;
16619
17005
  } break;
@@ -16632,7 +17018,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
16632
17018
 
16633
17019
  result->nodes[i] = tensor;
16634
17020
 
16635
- fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17021
+ fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
16636
17022
  }
16637
17023
  }
16638
17024
  }
@@ -16770,7 +17156,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16770
17156
  fprintf(fp, "(%s)|", ggml_type_name(node->type));
16771
17157
  }
16772
17158
 
16773
- if (node->n_dims == 2) {
17159
+ if (ggml_is_matrix(node)) {
16774
17160
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
16775
17161
  } else {
16776
17162
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@@ -17037,7 +17423,7 @@ static enum ggml_opt_result ggml_opt_adam(
17037
17423
  int64_t i = 0;
17038
17424
  for (int p = 0; p < np; ++p) {
17039
17425
  const int64_t ne = ggml_nelements(ps[p]);
17040
- const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
17426
+ const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
17041
17427
  for (int64_t j = 0; j < ne; ++j) {
17042
17428
  float x = ggml_get_f32_1d(ps[p], j);
17043
17429
  float g_ = g[i]*gnorm;
@@ -17819,8 +18205,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
17819
18205
  memcpy(&qh, &y[i].qh, sizeof(qh));
17820
18206
 
17821
18207
  for (int j = 0; j < QK5_0; j += 2) {
17822
- const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
17823
- const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
18208
+ const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
18209
+ const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
17824
18210
 
17825
18211
  // cast to 16 bins
17826
18212
  const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17849,8 +18235,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
17849
18235
  memcpy(&qh, &y[i].qh, sizeof(qh));
17850
18236
 
17851
18237
  for (int j = 0; j < QK5_1; j += 2) {
17852
- const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
17853
- const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
18238
+ const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
18239
+ const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
17854
18240
 
17855
18241
  // cast to 16 bins
17856
18242
  const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
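The two hunks above fix the high-bit indexing in the q5_0/q5_1 histogram paths. Assuming the usual layout (byte qs[b] packs quants b and b+16 as its low and high nibble, and qh bit i holds the fifth bit of quant i), the loop steps j by 2 over the 32 quants, so each iteration touches byte b = j/2 and must read qh bits b and b+16. For j = 6 that means qs[3], quants 3 and 19, and qh bits 3 and 19; the old code read bits j and j+16, i.e. 6 and 22, and for j >= 16 it shifted past bit 31 entirely. An equivalent spelling of the corrected extraction:

    const int b = j/2;
    const uint8_t vh0 = ((qh >> (b     )) & 1) << 4;  // fifth bit of the low-nibble quant
    const uint8_t vh1 = ((qh >> (b + 16)) & 1) << 4;  // fifth bit of the high-nibble quant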
@@ -18040,6 +18426,7 @@ struct gguf_kv {
18040
18426
 
18041
18427
  struct gguf_header {
18042
18428
  char magic[4];
18429
+
18043
18430
  uint32_t version;
18044
18431
  uint64_t n_tensors; // GGUFv2
18045
18432
  uint64_t n_kv; // GGUFv2
@@ -18129,7 +18516,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18129
18516
 
18130
18517
  for (uint32_t i = 0; i < sizeof(magic); i++) {
18131
18518
  if (magic[i] != GGUF_MAGIC[i]) {
18132
- fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
18519
+ fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
18133
18520
  fclose(file);
18134
18521
  return NULL;
18135
18522
  }
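The magic buffer here is a four-byte array with no terminating NUL, so passing it to a bare %s (as the removed line did) could read past the buffer; the replacement prints the four bytes explicitly. A precision-limited format would also be safe, shown only as an alternative sketch (the release keeps the %c form above):

    fprintf(stderr, "%s: invalid magic characters '%.4s'\n", __func__, magic);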
@@ -18144,7 +18531,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18144
18531
  {
18145
18532
  strncpy(ctx->header.magic, magic, 4);
18146
18533
 
18147
-
18148
18534
  ctx->kv = NULL;
18149
18535
  ctx->infos = NULL;
18150
18536
  ctx->data = NULL;
@@ -18311,7 +18697,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18311
18697
  return NULL;
18312
18698
  }
18313
18699
 
18314
- const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
18700
+ const size_t size_cur = ggml_row_size(info->type, ne);
18315
18701
 
18316
18702
  ctx->size += GGML_PAD(size_cur, ctx->alignment);
18317
18703
  }
@@ -18815,8 +19201,8 @@ void gguf_add_tensor(
18815
19201
  ctx->infos[idx].ne[i] = 1;
18816
19202
  }
18817
19203
 
18818
- ctx->infos[idx].n_dims = tensor->n_dims;
18819
- for (int i = 0; i < tensor->n_dims; i++) {
19204
+ ctx->infos[idx].n_dims = ggml_n_dims(tensor);
19205
+ for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
18820
19206
  ctx->infos[idx].ne[i] = tensor->ne[i];
18821
19207
  }
18822
19208