llama_cpp 0.3.1 → 0.3.2

@@ -220,9 +220,27 @@ inline static void* ggml_aligned_malloc(size_t size) {
220
220
  #define GGML_ALIGNED_FREE(ptr) free(ptr)
221
221
  #endif
222
222
 
223
- #define UNUSED(x) (void)(x)
223
+ #define UNUSED GGML_UNUSED
224
224
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
225
225
 
226
+ //
227
+ // tensor access macros
228
+ //
229
+
230
+ #define GGML_TENSOR_UNARY_OP_LOCALS \
231
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
232
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \
233
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \
234
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
235
+
236
+ #define GGML_TENSOR_BINARY_OP_LOCALS \
237
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
238
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \
239
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \
240
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \
241
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \
242
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
243
+
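The new GGML_TENSOR_UNARY_OP_LOCALS / GGML_TENSOR_BINARY_OP_LOCALS macros are built on a GGML_TENSOR_LOCALS helper that is not shown in this hunk. A minimal sketch of the pattern, assuming GGML_TENSOR_LOCALS(type, prefix, pointer, array) simply declares one local per dimension (the real helper likely also marks each local unused, e.g. via GGML_UNUSED, to silence warnings):

    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
        const type prefix##0 = (pointer)->array[0];          \
        const type prefix##1 = (pointer)->array[1];          \
        const type prefix##2 = (pointer)->array[2];          \
        const type prefix##3 = (pointer)->array[3];

    // GGML_TENSOR_UNARY_OP_LOCALS then gives a (src0, dst) kernel the locals
    // ne00..ne03 / nb00..nb03 for src0 and ne0..ne3 / nb0..nb3 for dst in one line.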
226
244
  #if defined(GGML_USE_ACCELERATE)
227
245
  #include <Accelerate/Accelerate.h>
228
246
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -463,14 +481,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
463
481
  return GGML_FP32_TO_FP16(x);
464
482
  }
465
483
 
466
- void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
467
- for (size_t i = 0; i < n; i++) {
484
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
485
+ for (int i = 0; i < n; i++) {
468
486
  y[i] = GGML_FP16_TO_FP32(x[i]);
469
487
  }
470
488
  }
471
489
 
472
- void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
473
- size_t i = 0;
490
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
491
+ int i = 0;
474
492
  #if defined(__F16C__)
475
493
  for (; i + 7 < n; i += 8) {
476
494
  __m256 x_vec = _mm256_loadu_ps(x + i);
@@ -1609,109 +1627,112 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
1609
1627
  }
1610
1628
  }
1611
1629
 
1630
+ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
1631
+ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
1612
1632
  static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1613
1633
  static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1614
1634
  static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1615
1635
  static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1616
1636
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1617
1637
 
1618
- static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1638
+ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1639
+ [GGML_TYPE_F32] = {
1640
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
1641
+ .vec_dot_type = GGML_TYPE_F32,
1642
+ },
1643
+ [GGML_TYPE_F16] = {
1644
+ .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
1645
+ .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
1646
+ .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
1647
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
1648
+ .vec_dot_type = GGML_TYPE_F16,
1649
+ },
1619
1650
  [GGML_TYPE_Q4_0] = {
1620
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0,
1621
- .quantize_row_q = quantize_row_q4_0,
1622
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
1623
- .quantize_row_q_dot = quantize_row_q8_0,
1624
- .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
1651
+ .to_float = (ggml_to_float_t) dequantize_row_q4_0,
1652
+ .from_float = quantize_row_q4_0,
1653
+ .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
1654
+ .vec_dot = ggml_vec_dot_q4_0_q8_0,
1625
1655
  .vec_dot_type = GGML_TYPE_Q8_0,
1626
1656
  },
1627
1657
  [GGML_TYPE_Q4_1] = {
1628
- .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1,
1629
- .quantize_row_q = quantize_row_q4_1,
1630
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
1631
- .quantize_row_q_dot = quantize_row_q8_1,
1632
- .vec_dot_q = ggml_vec_dot_q4_1_q8_1,
1658
+ .to_float = (ggml_to_float_t) dequantize_row_q4_1,
1659
+ .from_float = quantize_row_q4_1,
1660
+ .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
1661
+ .vec_dot = ggml_vec_dot_q4_1_q8_1,
1633
1662
  .vec_dot_type = GGML_TYPE_Q8_1,
1634
1663
  },
1635
1664
  [GGML_TYPE_Q5_0] = {
1636
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0,
1637
- .quantize_row_q = quantize_row_q5_0,
1638
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference,
1639
- .quantize_row_q_dot = quantize_row_q8_0,
1640
- .vec_dot_q = ggml_vec_dot_q5_0_q8_0,
1665
+ .to_float = (ggml_to_float_t) dequantize_row_q5_0,
1666
+ .from_float = quantize_row_q5_0,
1667
+ .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
1668
+ .vec_dot = ggml_vec_dot_q5_0_q8_0,
1641
1669
  .vec_dot_type = GGML_TYPE_Q8_0,
1642
1670
  },
1643
1671
  [GGML_TYPE_Q5_1] = {
1644
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1,
1645
- .quantize_row_q = quantize_row_q5_1,
1646
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference,
1647
- .quantize_row_q_dot = quantize_row_q8_1,
1648
- .vec_dot_q = ggml_vec_dot_q5_1_q8_1,
1672
+ .to_float = (ggml_to_float_t) dequantize_row_q5_1,
1673
+ .from_float = quantize_row_q5_1,
1674
+ .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
1675
+ .vec_dot = ggml_vec_dot_q5_1_q8_1,
1649
1676
  .vec_dot_type = GGML_TYPE_Q8_1,
1650
1677
  },
1651
1678
  [GGML_TYPE_Q8_0] = {
1652
- .dequantize_row_q = dequantize_row_q8_0,
1653
- .quantize_row_q = quantize_row_q8_0,
1654
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference,
1655
- .quantize_row_q_dot = quantize_row_q8_0,
1656
- .vec_dot_q = ggml_vec_dot_q8_0_q8_0,
1679
+ .to_float = dequantize_row_q8_0,
1680
+ .from_float = quantize_row_q8_0,
1681
+ .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
1682
+ .vec_dot = ggml_vec_dot_q8_0_q8_0,
1657
1683
  .vec_dot_type = GGML_TYPE_Q8_0,
1658
1684
  },
1659
1685
  [GGML_TYPE_Q8_1] = {
1660
- .dequantize_row_q = NULL, // TODO
1661
- .quantize_row_q = quantize_row_q8_1,
1662
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference,
1663
- .quantize_row_q_dot = quantize_row_q8_1,
1664
- .vec_dot_q = NULL, // TODO
1686
+ .from_float = quantize_row_q8_1,
1687
+ .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
1665
1688
  .vec_dot_type = GGML_TYPE_Q8_1,
1666
1689
  },
1667
1690
  #ifdef GGML_USE_K_QUANTS
1668
1691
  [GGML_TYPE_Q2_K] = {
1669
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
1670
- .quantize_row_q = quantize_row_q2_K,
1671
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
1672
- .quantize_row_q_dot = quantize_row_q8_K,
1673
- .vec_dot_q = ggml_vec_dot_q2_K_q8_K,
1692
+ .to_float = (ggml_to_float_t) dequantize_row_q2_K,
1693
+ .from_float = quantize_row_q2_K,
1694
+ .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
1695
+ .vec_dot = ggml_vec_dot_q2_K_q8_K,
1674
1696
  .vec_dot_type = GGML_TYPE_Q8_K,
1675
1697
  },
1676
1698
  [GGML_TYPE_Q3_K] = {
1677
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
1678
- .quantize_row_q = quantize_row_q3_K,
1679
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
1680
- .quantize_row_q_dot = quantize_row_q8_K,
1681
- .vec_dot_q = ggml_vec_dot_q3_K_q8_K,
1699
+ .to_float = (ggml_to_float_t) dequantize_row_q3_K,
1700
+ .from_float = quantize_row_q3_K,
1701
+ .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
1702
+ .vec_dot = ggml_vec_dot_q3_K_q8_K,
1682
1703
  .vec_dot_type = GGML_TYPE_Q8_K,
1683
1704
  },
1684
1705
  [GGML_TYPE_Q4_K] = {
1685
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
1686
- .quantize_row_q = quantize_row_q4_K,
1687
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
1688
- .quantize_row_q_dot = quantize_row_q8_K,
1689
- .vec_dot_q = ggml_vec_dot_q4_K_q8_K,
1706
+ .to_float = (ggml_to_float_t) dequantize_row_q4_K,
1707
+ .from_float = quantize_row_q4_K,
1708
+ .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
1709
+ .vec_dot = ggml_vec_dot_q4_K_q8_K,
1690
1710
  .vec_dot_type = GGML_TYPE_Q8_K,
1691
1711
  },
1692
1712
  [GGML_TYPE_Q5_K] = {
1693
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
1694
- .quantize_row_q = quantize_row_q5_K,
1695
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
1696
- .quantize_row_q_dot = quantize_row_q8_K,
1697
- .vec_dot_q = ggml_vec_dot_q5_K_q8_K,
1713
+ .to_float = (ggml_to_float_t) dequantize_row_q5_K,
1714
+ .from_float = quantize_row_q5_K,
1715
+ .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
1716
+ .vec_dot = ggml_vec_dot_q5_K_q8_K,
1698
1717
  .vec_dot_type = GGML_TYPE_Q8_K,
1699
1718
  },
1700
1719
  [GGML_TYPE_Q6_K] = {
1701
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
1702
- .quantize_row_q = quantize_row_q6_K,
1703
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
1704
- .quantize_row_q_dot = quantize_row_q8_K,
1705
- .vec_dot_q = ggml_vec_dot_q6_K_q8_K,
1720
+ .to_float = (ggml_to_float_t) dequantize_row_q6_K,
1721
+ .from_float = quantize_row_q6_K,
1722
+ .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
1723
+ .vec_dot = ggml_vec_dot_q6_K_q8_K,
1706
1724
  .vec_dot_type = GGML_TYPE_Q8_K,
1707
1725
  },
1726
+ [GGML_TYPE_Q8_K] = {
1727
+ .from_float = quantize_row_q8_K,
1728
+ }
1708
1729
  #endif
1709
1730
  };
1710
1731
 
1711
1732
  // For internal test use
1712
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1733
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
1713
1734
  GGML_ASSERT(i < GGML_TYPE_COUNT);
1714
- return quantize_fns[i];
1735
+ return type_traits[i];
1715
1736
  }
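With the table above, F32, F16 and the quantized types all share one to_float / from_float / vec_dot interface, and ggml_internal_get_type_traits exposes it for tests. A hedged sketch of how a caller could round-trip one row through a type's traits (assumes the usual (src, dst, n) row signatures and the internal GGML_TYPE_SIZE / GGML_BLCK_SIZE tables; n must be a multiple of the block size):

    static void roundtrip_row(enum ggml_type type, const float * src, float * dst, int n) {
        const ggml_type_traits_t tt = ggml_internal_get_type_traits(type);
        if (!tt.from_float || !tt.to_float) {
            return; // e.g. GGML_TYPE_Q8_1 defines no to_float in the table above
        }
        void * tmp = malloc(GGML_TYPE_SIZE[type]*(n/GGML_BLCK_SIZE[type])); // one row, stored as 'type'
        tt.from_float(src, tmp, n); // quantize (or fp32 -> fp16)
        tt.to_float(tmp, dst, n);   // and back to fp32
        free(tmp);
    }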
1716
1737
 
1717
1738
 
@@ -2257,7 +2278,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
2257
2278
  inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
2258
2279
  inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
2259
2280
 
2260
- inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
2281
+ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
2261
2282
  #ifdef GGML_SIMD
2262
2283
  float sumf = 0.0f;
2263
2284
  const int np = (n & ~(GGML_F32_STEP - 1));
@@ -2294,7 +2315,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
2294
2315
  *s = sumf;
2295
2316
  }
2296
2317
 
2297
- inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
2318
+ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
2298
2319
  ggml_float sumf = 0.0;
2299
2320
 
2300
2321
  #if defined(GGML_SIMD)
@@ -3447,6 +3468,8 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) {
3447
3468
  inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
3448
3469
  inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
3449
3470
  inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
3471
+ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
3472
+ inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
3450
3473
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
3451
3474
 
3452
3475
  static const float GELU_COEF_A = 0.044715f;
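The two new element-wise helpers are plain tanhf and ELU with alpha fixed at 1, i.e. elu(x) = x for x > 0 and expf(x) - 1 otherwise. A tiny standalone check of the ELU branch (illustrative, not part of the diff):

    #include <assert.h>
    #include <math.h>

    static float elu1(float x) { return (x > 0.f) ? x : expf(x) - 1.f; }

    int main(void) {
        assert(elu1(2.0f) == 2.0f);           // positive inputs pass through unchanged
        assert(fabsf(elu1(0.0f)) < 1e-6f);    // expf(0) - 1 == 0
        assert(elu1(-5.0f) > -1.0f);          // negative inputs saturate toward -1
        return 0;
    }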
@@ -3598,6 +3621,16 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
3598
3621
  *s = 1.f/(*s);
3599
3622
  }
3600
3623
 
3624
+ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
3625
+ float max = -INFINITY;
3626
+ int idx = 0;
3627
+ for (int i = 0; i < n; ++i) {
3628
+ max = MAX(max, x[i]);
3629
+ if (max == x[i]) { idx = i; }
3630
+ }
3631
+ *s = idx;
3632
+ }
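Note that ggml_vec_argmax_f32 updates idx whenever x[i] equals the running maximum, so on ties it reports the last index holding the maximum value. A standalone replica of the loop makes this visible (illustrative only):

    #include <math.h>   // INFINITY

    static int argmax_f32(int n, const float * x) {
        float max = -INFINITY; int idx = 0;
        for (int i = 0; i < n; ++i) {
            max = (x[i] > max) ? x[i] : max;
            if (max == x[i]) { idx = i; }  // equal values keep moving idx forward
        }
        return idx;
    }
    // {1.0f, 3.0f, 2.0f, 3.0f} -> returns 3, the last occurrence of the maximum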
3633
+
3601
3634
  //
3602
3635
  // data types
3603
3636
  //
@@ -3707,12 +3740,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3707
3740
  "SUM",
3708
3741
  "SUM_ROWS",
3709
3742
  "MEAN",
3743
+ "ARGMAX",
3710
3744
  "REPEAT",
3711
3745
  "REPEAT_BACK",
3712
3746
  "ABS",
3713
3747
  "SGN",
3714
3748
  "NEG",
3715
3749
  "STEP",
3750
+ "TANH",
3751
+ "ELU",
3716
3752
  "RELU",
3717
3753
  "GELU",
3718
3754
  "GELU_QUICK",
@@ -3744,9 +3780,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3744
3780
  "ROPE_BACK",
3745
3781
  "ALIBI",
3746
3782
  "CLAMP",
3747
- "CONV_1D_S1_PH",
3748
- "CONV_1D_S2_PH",
3749
- "CONV_2D_SK_P0",
3783
+ "CONV_1D",
3784
+ "CONV_2D",
3750
3785
 
3751
3786
  "FLASH_ATTN",
3752
3787
  "FLASH_FF",
@@ -3765,7 +3800,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3765
3800
  "CROSS_ENTROPY_LOSS_BACK",
3766
3801
  };
3767
3802
 
3768
- static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3803
+ static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
3769
3804
 
3770
3805
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3771
3806
  "none",
@@ -3783,12 +3818,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3783
3818
  "Σx",
3784
3819
  "Σx_k",
3785
3820
  "Σx/n",
3821
+ "argmax(x)",
3786
3822
  "repeat(x)",
3787
3823
  "repeat_back(x)",
3788
3824
  "abs(x)",
3789
3825
  "sgn(x)",
3790
3826
  "-x",
3791
3827
  "step(x)",
3828
+ "tanh(x)",
3829
+ "elu(x)",
3792
3830
  "relu(x)",
3793
3831
  "gelu(x)",
3794
3832
  "gelu_quick(x)",
@@ -3820,9 +3858,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3820
3858
  "rope_back(x)",
3821
3859
  "alibi(x)",
3822
3860
  "clamp(x)",
3823
- "conv_1d_s1_ph(x)",
3824
- "conv_1d_s2_ph(x)",
3825
- "conv_2d_sk_p0(x)",
3861
+ "conv_1d(x)",
3862
+ "conv_2d(x)",
3826
3863
 
3827
3864
  "flash_attn(x)",
3828
3865
  "flash_ff(x)",
@@ -3841,11 +3878,45 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3841
3878
  "cross_entropy_loss_back(x,y)",
3842
3879
  };
3843
3880
 
3844
- static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3881
+ static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
3845
3882
 
3846
3883
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3847
3884
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
3848
3885
 
3886
+ // WARN:
3887
+ // Misconfiguration can lead to problems that are hard to reason about:
3888
+ // * At best it crashes or produces nonsense.
3889
+ // * At worst the output is subtly wrong and hard to perceive.
3890
+ //
3891
+ // An op has to enable INIT or FINALIZE when any of its branches needs that pass.
3892
+ // Take care with compile options (e.g., GGML_USE_xxx).
3893
+ static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
3894
+ static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
3895
+
3896
+ static void ggml_setup_op_has_task_pass(void) {
3897
+ { // INIT
3898
+ bool * p = GGML_OP_HAS_INIT;
3899
+
3900
+ p[GGML_OP_ACC ] = true;
3901
+ p[GGML_OP_MUL_MAT ] = true;
3902
+ p[GGML_OP_OUT_PROD ] = true;
3903
+ p[GGML_OP_SET ] = true;
3904
+ p[GGML_OP_GET_ROWS_BACK ] = true;
3905
+ p[GGML_OP_DIAG_MASK_INF ] = true;
3906
+ p[GGML_OP_DIAG_MASK_ZERO ] = true;
3907
+ p[GGML_OP_CONV_1D ] = true;
3908
+ p[GGML_OP_CONV_2D ] = true;
3909
+ p[GGML_OP_FLASH_ATTN_BACK ] = true;
3910
+ p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
3911
+ }
3912
+
3913
+ { // FINALIZE
3914
+ bool * p = GGML_OP_HAS_FINALIZE;
3915
+
3916
+ p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
3917
+ }
3918
+ }
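A hedged sketch of the intended consumer: with these tables the graph executor can skip the INIT and FINALIZE passes for ops that do not need them. The dispatcher call and the params.type field are assumptions here; only the flag tables above are introduced by this diff:

    if (GGML_OP_HAS_INIT[node->op]) {
        params.type = GGML_TASK_INIT;
        ggml_compute_forward(&params, node);   // run INIT only when the op asked for it
    }

    params.type = GGML_TASK_COMPUTE;
    ggml_compute_forward(&params, node);

    if (GGML_OP_HAS_FINALIZE[node->op]) {
        params.type = GGML_TASK_FINALIZE;
        ggml_compute_forward(&params, node);
    }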
3919
+
3849
3920
  //
3850
3921
  // ggml context
3851
3922
  //
@@ -4267,6 +4338,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4267
4338
  ggml_cl_init();
4268
4339
  #endif
4269
4340
 
4341
+ ggml_setup_op_has_task_pass();
4342
+
4270
4343
  is_first_call = false;
4271
4344
  }
4272
4345
 
@@ -5403,6 +5476,30 @@ struct ggml_tensor * ggml_mean(
5403
5476
  return result;
5404
5477
  }
5405
5478
 
5479
+ // ggml_argmax
5480
+
5481
+ struct ggml_tensor * ggml_argmax(
5482
+ struct ggml_context * ctx,
5483
+ struct ggml_tensor * a) {
5484
+ GGML_ASSERT(ggml_is_matrix(a));
5485
+ bool is_node = false;
5486
+
5487
+ if (a->grad) {
5488
+ GGML_ASSERT(false);
5489
+ is_node = true;
5490
+ }
5491
+
5492
+ int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
5493
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
5494
+
5495
+ result->op = GGML_OP_ARGMAX;
5496
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5497
+ result->src0 = a;
5498
+ result->src1 = NULL;
5499
+
5500
+ return result;
5501
+ }
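Usage is one call per 2-D F32 tensor; the result is an I32 tensor with one index per row. A hedged sketch (ctx, n_vocab and n_rows are placeholders; the graph still has to be built and computed as usual):

    struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_rows);
    struct ggml_tensor * best   = ggml_argmax(ctx, logits); // I32, ne[0] == n_rows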
5502
+
5406
5503
  // ggml_repeat
5407
5504
 
5408
5505
  struct ggml_tensor * ggml_repeat(
@@ -5596,6 +5693,74 @@ struct ggml_tensor * ggml_step_inplace(
5596
5693
  return ggml_step_impl(ctx, a, true);
5597
5694
  }
5598
5695
 
5696
+ // ggml_tanh
5697
+
5698
+ struct ggml_tensor * ggml_tanh_impl(
5699
+ struct ggml_context * ctx,
5700
+ struct ggml_tensor * a,
5701
+ bool inplace) {
5702
+ bool is_node = false;
5703
+
5704
+ if (!inplace && (a->grad)) {
5705
+ is_node = true;
5706
+ }
5707
+
5708
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5709
+
5710
+ result->op = GGML_OP_TANH;
5711
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5712
+ result->src0 = a;
5713
+ result->src1 = NULL;
5714
+
5715
+ return result;
5716
+ }
5717
+
5718
+ struct ggml_tensor * ggml_tanh(
5719
+ struct ggml_context * ctx,
5720
+ struct ggml_tensor * a) {
5721
+ return ggml_tanh_impl(ctx, a, false);
5722
+ }
5723
+
5724
+ struct ggml_tensor * ggml_tanh_inplace(
5725
+ struct ggml_context * ctx,
5726
+ struct ggml_tensor * a) {
5727
+ return ggml_tanh_impl(ctx, a, true);
5728
+ }
5729
+
5730
+ // ggml_elu
5731
+
5732
+ struct ggml_tensor * ggml_elu_impl(
5733
+ struct ggml_context * ctx,
5734
+ struct ggml_tensor * a,
5735
+ bool inplace) {
5736
+ bool is_node = false;
5737
+
5738
+ if (!inplace && (a->grad)) {
5739
+ is_node = true;
5740
+ }
5741
+
5742
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5743
+
5744
+ result->op = GGML_OP_ELU;
5745
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5746
+ result->src0 = a;
5747
+ result->src1 = NULL;
5748
+
5749
+ return result;
5750
+ }
5751
+
5752
+ struct ggml_tensor * ggml_elu(
5753
+ struct ggml_context * ctx,
5754
+ struct ggml_tensor * a) {
5755
+ return ggml_elu_impl(ctx, a, false);
5756
+ }
5757
+
5758
+ struct ggml_tensor * ggml_elu_inplace(
5759
+ struct ggml_context * ctx,
5760
+ struct ggml_tensor * a) {
5761
+ return ggml_elu_impl(ctx, a, true);
5762
+ }
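Both new ops follow the usual impl/inplace pattern: the plain call duplicates the input, while the _inplace variant returns a view of it. A short sketch (ctx and a are assumed to exist):

    struct ggml_tensor * t = ggml_tanh(ctx, a);         // new tensor, same shape as a
    struct ggml_tensor * e = ggml_elu_inplace(ctx, a);  // view that writes back into a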
5763
+
5599
5764
  // ggml_relu
5600
5765
 
5601
5766
  struct ggml_tensor * ggml_relu_impl(
@@ -6837,6 +7002,8 @@ struct ggml_tensor * ggml_rope_back(
6837
7002
  int n_dims,
6838
7003
  int mode) {
6839
7004
  GGML_ASSERT(n_past >= 0);
7005
+ GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
7006
+
6840
7007
  bool is_node = false;
6841
7008
 
6842
7009
  if (a->grad) {
@@ -6937,15 +7104,21 @@ struct ggml_tensor * ggml_clamp(
6937
7104
  return result;
6938
7105
  }
6939
7106
 
6940
- // ggml_conv_1d_s1_ph
7107
+ // ggml_conv_1d
7108
+
7109
+ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
7110
+ return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
7111
+ }
6941
7112
 
6942
- struct ggml_tensor * ggml_conv_1d_s1_ph(
7113
+ GGML_API struct ggml_tensor * ggml_conv_1d(
6943
7114
  struct ggml_context * ctx,
6944
7115
  struct ggml_tensor * a,
6945
- struct ggml_tensor * b) {
7116
+ struct ggml_tensor * b,
7117
+ int s0,
7118
+ int p0,
7119
+ int d0) {
6946
7120
  GGML_ASSERT(ggml_is_matrix(b));
6947
7121
  GGML_ASSERT(a->ne[1] == b->ne[1]);
6948
- GGML_ASSERT(a->ne[3] == 1);
6949
7122
  bool is_node = false;
6950
7123
 
6951
7124
  if (a->grad || b->grad) {
@@ -6953,26 +7126,43 @@ struct ggml_tensor * ggml_conv_1d_s1_ph(
6953
7126
  is_node = true;
6954
7127
  }
6955
7128
 
6956
- const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
6957
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7129
+ const int64_t ne[4] = {
7130
+ ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
7131
+ a->ne[2], 1, 1,
7132
+ };
7133
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7134
+
7135
+ ggml_scratch_save(ctx);
7136
+ struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7137
+ ((int32_t*)c->data)[0] = s0;
7138
+ ((int32_t*)c->data)[1] = p0;
7139
+ ((int32_t*)c->data)[2] = d0;
7140
+ ggml_scratch_load(ctx);
6958
7141
 
6959
- result->op = GGML_OP_CONV_1D_S1_PH;
7142
+ result->op = GGML_OP_CONV_1D;
6960
7143
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6961
7144
  result->src0 = a;
6962
7145
  result->src1 = b;
7146
+ result->opt[0] = c;
6963
7147
 
6964
7148
  return result;
6965
7149
  }
6966
7150
 
6967
- // ggml_conv_1d_s2_ph
7151
+ // ggml_conv_2d
6968
7152
 
6969
- struct ggml_tensor * ggml_conv_1d_s2_ph(
6970
- struct ggml_context * ctx,
6971
- struct ggml_tensor * a,
6972
- struct ggml_tensor * b) {
6973
- GGML_ASSERT(ggml_is_matrix(b));
6974
- GGML_ASSERT(a->ne[1] == b->ne[1]);
6975
- GGML_ASSERT(a->ne[3] == 1);
7153
+ struct ggml_tensor* ggml_conv_2d(
7154
+ struct ggml_context* ctx,
7155
+ struct ggml_tensor * a,
7156
+ struct ggml_tensor * b,
7157
+ int s0,
7158
+ int s1,
7159
+ int p0,
7160
+ int p1,
7161
+ int d0,
7162
+ int d1) {
7163
+
7164
+ GGML_ASSERT(b->ne[3] == 1);
7165
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
6976
7166
  bool is_node = false;
6977
7167
 
6978
7168
  if (a->grad || b->grad) {
@@ -6980,43 +7170,42 @@ struct ggml_tensor * ggml_conv_1d_s2_ph(
6980
7170
  is_node = true;
6981
7171
  }
6982
7172
 
6983
- const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
6984
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7173
+ const int64_t ne[4] = {
7174
+ ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
7175
+ ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
7176
+ a->ne[3], 1,
7177
+ };
7178
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7179
+
7180
+ ggml_scratch_save(ctx);
7181
+ struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
7182
+ ((int32_t*)c->data)[0] = s0;
7183
+ ((int32_t*)c->data)[1] = s1;
7184
+ ((int32_t*)c->data)[2] = p0;
7185
+ ((int32_t*)c->data)[3] = p1;
7186
+ ((int32_t*)c->data)[4] = d0;
7187
+ ((int32_t*)c->data)[5] = d1;
7188
+ ggml_scratch_load(ctx);
6985
7189
 
6986
- result->op = GGML_OP_CONV_1D_S2_PH;
7190
+ result->op = GGML_OP_CONV_2D;
6987
7191
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6988
7192
  result->src0 = a;
6989
7193
  result->src1 = b;
7194
+ result->opt[0] = c;
6990
7195
 
6991
7196
  return result;
7197
+
6992
7198
  }
6993
7199
 
6994
- // ggml_conv_2d_sk_p0
7200
+ // ggml_conv_1d_ph
6995
7201
 
6996
- struct ggml_tensor * ggml_conv_2d_sk_p0(
7202
+ struct ggml_tensor* ggml_conv_1d_ph(
6997
7203
  struct ggml_context * ctx,
6998
7204
  struct ggml_tensor * a,
6999
- struct ggml_tensor * b) {
7000
- GGML_ASSERT(b->ne[3] == 1);
7001
- GGML_ASSERT(a->ne[2] == b->ne[2]);
7002
- GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
7003
- GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
7004
- bool is_node = false;
7005
-
7006
- if (a->grad || b->grad) {
7007
- GGML_ASSERT(false); // TODO: implement backward
7008
- is_node = true;
7009
- }
7010
-
7011
- const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
7012
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7013
-
7014
- result->op = GGML_OP_CONV_2D_SK_P0;
7015
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7016
- result->src0 = a;
7017
- result->src1 = b;
7018
-
7019
- return result;
7205
+ struct ggml_tensor * b,
7206
+ int s,
7207
+ int d) {
7208
+ return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
7020
7209
  }
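ggml_calc_conv_output_size is the standard convolution output-length formula, (ins + 2*p - d*(ks - 1) - 1)/s + 1, and ggml_conv_1d_ph keeps the old half-padding behaviour by passing p0 = a->ne[0]/2. Two quick checks with a width-3 kernel on a length-8 input (arithmetic only):

    // s0 = 1, p0 = 1, d0 = 1:  (8 + 2 - 2 - 1)/1 + 1 = 8   -> "same" length
    // s0 = 2, p0 = 0, d0 = 1:  (8 + 0 - 2 - 1)/2 + 1 = 3   -> strided, no padding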
7021
7210
 
7022
7211
  // ggml_flash_attn
@@ -7566,25 +7755,7 @@ static void ggml_compute_forward_dup_f16(
7566
7755
  return;
7567
7756
  }
7568
7757
 
7569
- const int64_t ne00 = src0->ne[0];
7570
- const int64_t ne01 = src0->ne[1];
7571
- const int64_t ne02 = src0->ne[2];
7572
- const int64_t ne03 = src0->ne[3];
7573
-
7574
- const int64_t ne0 = dst->ne[0];
7575
- const int64_t ne1 = dst->ne[1];
7576
- const int64_t ne2 = dst->ne[2];
7577
- const int64_t ne3 = dst->ne[3];
7578
-
7579
- const size_t nb00 = src0->nb[0];
7580
- const size_t nb01 = src0->nb[1];
7581
- const size_t nb02 = src0->nb[2];
7582
- const size_t nb03 = src0->nb[3];
7583
-
7584
- const size_t nb0 = dst->nb[0];
7585
- const size_t nb1 = dst->nb[1];
7586
- const size_t nb2 = dst->nb[2];
7587
- const size_t nb3 = dst->nb[3];
7758
+ GGML_TENSOR_UNARY_OP_LOCALS;
7588
7759
 
7589
7760
  const int ith = params->ith; // thread index
7590
7761
  const int nth = params->nth; // number of threads
@@ -7657,8 +7828,8 @@ static void ggml_compute_forward_dup_f16(
7657
7828
  id += ne00 * (ne01 - ir1);
7658
7829
  }
7659
7830
  }
7660
- } else if (ggml_is_quantized(dst->type)) {
7661
- quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
7831
+ } else if (type_traits[dst->type].from_float) {
7832
+ ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
7662
7833
  float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
7663
7834
 
7664
7835
  size_t id = 0;
@@ -7855,25 +8026,7 @@ static void ggml_compute_forward_dup_f32(
7855
8026
  return;
7856
8027
  }
7857
8028
 
7858
- const int64_t ne00 = src0->ne[0];
7859
- const int64_t ne01 = src0->ne[1];
7860
- const int64_t ne02 = src0->ne[2];
7861
- const int64_t ne03 = src0->ne[3];
7862
-
7863
- const int64_t ne0 = dst->ne[0];
7864
- const int64_t ne1 = dst->ne[1];
7865
- const int64_t ne2 = dst->ne[2];
7866
- const int64_t ne3 = dst->ne[3];
7867
-
7868
- const size_t nb00 = src0->nb[0];
7869
- const size_t nb01 = src0->nb[1];
7870
- const size_t nb02 = src0->nb[2];
7871
- const size_t nb03 = src0->nb[3];
7872
-
7873
- const size_t nb0 = dst->nb[0];
7874
- const size_t nb1 = dst->nb[1];
7875
- const size_t nb2 = dst->nb[2];
7876
- const size_t nb3 = dst->nb[3];
8029
+ GGML_TENSOR_UNARY_OP_LOCALS;
7877
8030
 
7878
8031
  const int ith = params->ith; // thread index
7879
8032
  const int nth = params->nth; // number of threads
@@ -7928,26 +8081,8 @@ static void ggml_compute_forward_dup_f32(
7928
8081
  id += rs * (ne01 - ir1);
7929
8082
  }
7930
8083
  }
7931
- } else if (dst->type == GGML_TYPE_F16) {
7932
- size_t id = 0;
7933
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
7934
-
7935
- for (int i03 = 0; i03 < ne03; i03++) {
7936
- for (int i02 = 0; i02 < ne02; i02++) {
7937
- id += ne00 * ir0;
7938
- for (int i01 = ir0; i01 < ir1; i01++) {
7939
- for (int i00 = 0; i00 < ne00; i00++) {
7940
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
7941
-
7942
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
7943
- id++;
7944
- }
7945
- }
7946
- id += ne00 * (ne01 - ir1);
7947
- }
7948
- }
7949
- } else if (ggml_is_quantized(dst->type)) {
7950
- quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
8084
+ } else if (type_traits[dst->type].from_float) {
8085
+ ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
7951
8086
 
7952
8087
  size_t id = 0;
7953
8088
  size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
@@ -8171,24 +8306,8 @@ static void ggml_compute_forward_add_f32(
8171
8306
  const int nth = params->nth;
8172
8307
 
8173
8308
  const int nr = ggml_nrows(src0);
8174
- const int64_t ne0 = src0->ne[0];
8175
- const int64_t ne1 = src0->ne[1];
8176
- const int64_t ne2 = src0->ne[2];
8177
-
8178
- const size_t nb00 = src0->nb[0];
8179
- const size_t nb01 = src0->nb[1];
8180
- const size_t nb02 = src0->nb[2];
8181
- const size_t nb03 = src0->nb[3];
8182
-
8183
- const size_t nb10 = src1->nb[0];
8184
- const size_t nb11 = src1->nb[1];
8185
- const size_t nb12 = src1->nb[2];
8186
- const size_t nb13 = src1->nb[3];
8187
8309
 
8188
- const size_t nb0 = dst->nb[0];
8189
- const size_t nb1 = dst->nb[1];
8190
- const size_t nb2 = dst->nb[2];
8191
- const size_t nb3 = dst->nb[3];
8310
+ GGML_TENSOR_BINARY_OP_LOCALS;
8192
8311
 
8193
8312
  GGML_ASSERT( nb0 == sizeof(float));
8194
8313
  GGML_ASSERT(nb00 == sizeof(float));
@@ -8257,28 +8376,12 @@ static void ggml_compute_forward_add_f16_f32(
8257
8376
  const int nth = params->nth;
8258
8377
 
8259
8378
  const int nr = ggml_nrows(src0);
8260
- const int64_t ne0 = src0->ne[0];
8261
- const int64_t ne1 = src0->ne[1];
8262
- const int64_t ne2 = src0->ne[2];
8263
8379
 
8264
- const size_t nb00 = src0->nb[0];
8265
- const size_t nb01 = src0->nb[1];
8266
- const size_t nb02 = src0->nb[2];
8267
- const size_t nb03 = src0->nb[3];
8268
-
8269
- const size_t nb10 = src1->nb[0];
8270
- const size_t nb11 = src1->nb[1];
8271
- const size_t nb12 = src1->nb[2];
8272
- const size_t nb13 = src1->nb[3];
8273
-
8274
- const size_t nb0 = dst->nb[0];
8275
- const size_t nb1 = dst->nb[1];
8276
- const size_t nb2 = dst->nb[2];
8277
- const size_t nb3 = dst->nb[3];
8380
+ GGML_TENSOR_BINARY_OP_LOCALS;
8278
8381
 
8279
8382
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
8280
8383
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
8281
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
8384
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
8282
8385
 
8283
8386
  GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
8284
8387
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8327,24 +8430,8 @@ static void ggml_compute_forward_add_f16_f16(
8327
8430
  const int nth = params->nth;
8328
8431
 
8329
8432
  const int nr = ggml_nrows(src0);
8330
- const int64_t ne0 = src0->ne[0];
8331
- const int64_t ne1 = src0->ne[1];
8332
- const int64_t ne2 = src0->ne[2];
8333
-
8334
- const size_t nb00 = src0->nb[0];
8335
- const size_t nb01 = src0->nb[1];
8336
- const size_t nb02 = src0->nb[2];
8337
- const size_t nb03 = src0->nb[3];
8338
8433
 
8339
- const size_t nb10 = src1->nb[0];
8340
- const size_t nb11 = src1->nb[1];
8341
- const size_t nb12 = src1->nb[2];
8342
- const size_t nb13 = src1->nb[3];
8343
-
8344
- const size_t nb0 = dst->nb[0];
8345
- const size_t nb1 = dst->nb[1];
8346
- const size_t nb2 = dst->nb[2];
8347
- const size_t nb3 = dst->nb[3];
8434
+ GGML_TENSOR_BINARY_OP_LOCALS;
8348
8435
 
8349
8436
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
8350
8437
  GGML_ASSERT(src1->type == GGML_TYPE_F16);
@@ -8394,32 +8481,15 @@ static void ggml_compute_forward_add_q_f32(
8394
8481
  }
8395
8482
 
8396
8483
  const int nr = ggml_nrows(src0);
8397
- const int64_t ne00 = src0->ne[0];
8398
- const int64_t ne01 = src0->ne[1];
8399
- const int64_t ne02 = src0->ne[2];
8400
- //const int64_t ne03 = src0->ne[3];
8401
-
8402
- const size_t nb00 = src0->nb[0];
8403
- const size_t nb01 = src0->nb[1];
8404
- const size_t nb02 = src0->nb[2];
8405
- const size_t nb03 = src0->nb[3];
8406
8484
 
8407
- const size_t nb10 = src1->nb[0];
8408
- const size_t nb11 = src1->nb[1];
8409
- const size_t nb12 = src1->nb[2];
8410
- const size_t nb13 = src1->nb[3];
8411
-
8412
- const size_t nb0 = dst->nb[0];
8413
- const size_t nb1 = dst->nb[1];
8414
- const size_t nb2 = dst->nb[2];
8415
- const size_t nb3 = dst->nb[3];
8485
+ GGML_TENSOR_BINARY_OP_LOCALS;
8416
8486
 
8417
8487
  const int ith = params->ith;
8418
8488
  const int nth = params->nth;
8419
8489
 
8420
8490
  const enum ggml_type type = src0->type;
8421
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
8422
- quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
8491
+ ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
8492
+ ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8423
8493
 
8424
8494
  // we don't support permuted src0 or src1
8425
8495
  GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
@@ -8533,19 +8603,8 @@ static void ggml_compute_forward_add1_f32(
8533
8603
  const int nth = params->nth;
8534
8604
 
8535
8605
  const int nr = ggml_nrows(src0);
8536
- const int64_t ne0 = src0->ne[0];
8537
- const int64_t ne1 = src0->ne[1];
8538
- const int64_t ne2 = src0->ne[2];
8539
-
8540
- const size_t nb00 = src0->nb[0];
8541
- const size_t nb01 = src0->nb[1];
8542
- const size_t nb02 = src0->nb[2];
8543
- const size_t nb03 = src0->nb[3];
8544
8606
 
8545
- const size_t nb0 = dst->nb[0];
8546
- const size_t nb1 = dst->nb[1];
8547
- const size_t nb2 = dst->nb[2];
8548
- const size_t nb3 = dst->nb[3];
8607
+ GGML_TENSOR_UNARY_OP_LOCALS;
8549
8608
 
8550
8609
  GGML_ASSERT( nb0 == sizeof(float));
8551
8610
  GGML_ASSERT(nb00 == sizeof(float));
@@ -8599,23 +8658,12 @@ static void ggml_compute_forward_add1_f16_f32(
8599
8658
  const int nth = params->nth;
8600
8659
 
8601
8660
  const int nr = ggml_nrows(src0);
8602
- const int64_t ne0 = src0->ne[0];
8603
- const int64_t ne1 = src0->ne[1];
8604
- const int64_t ne2 = src0->ne[2];
8605
8661
 
8606
- const size_t nb00 = src0->nb[0];
8607
- const size_t nb01 = src0->nb[1];
8608
- const size_t nb02 = src0->nb[2];
8609
- const size_t nb03 = src0->nb[3];
8610
-
8611
- const size_t nb0 = dst->nb[0];
8612
- const size_t nb1 = dst->nb[1];
8613
- const size_t nb2 = dst->nb[2];
8614
- const size_t nb3 = dst->nb[3];
8662
+ GGML_TENSOR_UNARY_OP_LOCALS;
8615
8663
 
8616
8664
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
8617
8665
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
8618
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
8666
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
8619
8667
 
8620
8668
  GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
8621
8669
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8660,23 +8708,12 @@ static void ggml_compute_forward_add1_f16_f16(
8660
8708
  const int nth = params->nth;
8661
8709
 
8662
8710
  const int nr = ggml_nrows(src0);
8663
- const int64_t ne0 = src0->ne[0];
8664
- const int64_t ne1 = src0->ne[1];
8665
- const int64_t ne2 = src0->ne[2];
8666
-
8667
- const size_t nb00 = src0->nb[0];
8668
- const size_t nb01 = src0->nb[1];
8669
- const size_t nb02 = src0->nb[2];
8670
- const size_t nb03 = src0->nb[3];
8671
8711
 
8672
- const size_t nb0 = dst->nb[0];
8673
- const size_t nb1 = dst->nb[1];
8674
- const size_t nb2 = dst->nb[2];
8675
- const size_t nb3 = dst->nb[3];
8712
+ GGML_TENSOR_UNARY_OP_LOCALS;
8676
8713
 
8677
8714
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
8678
8715
  GGML_ASSERT(src1->type == GGML_TYPE_F16);
8679
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
8716
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
8680
8717
 
8681
8718
  GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
8682
8719
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8721,23 +8758,12 @@ static void ggml_compute_forward_add1_q_f32(
8721
8758
  const int nth = params->nth;
8722
8759
 
8723
8760
  const int nr = ggml_nrows(src0);
8724
- const int64_t ne0 = src0->ne[0];
8725
- const int64_t ne1 = src0->ne[1];
8726
- const int64_t ne2 = src0->ne[2];
8727
8761
 
8728
- const size_t nb00 = src0->nb[0];
8729
- const size_t nb01 = src0->nb[1];
8730
- const size_t nb02 = src0->nb[2];
8731
- const size_t nb03 = src0->nb[3];
8732
-
8733
- const size_t nb0 = dst->nb[0];
8734
- const size_t nb1 = dst->nb[1];
8735
- const size_t nb2 = dst->nb[2];
8736
- const size_t nb3 = dst->nb[3];
8762
+ GGML_TENSOR_UNARY_OP_LOCALS;
8737
8763
 
8738
8764
  const enum ggml_type type = src0->type;
8739
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
8740
- quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
8765
+ ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
8766
+ ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8741
8767
 
8742
8768
  // we don't support permuted src0
8743
8769
  GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
@@ -8865,15 +8891,8 @@ static void ggml_compute_forward_acc_f32(
8865
8891
  const int nr = ggml_nrows(src1);
8866
8892
  const int nc = src1->ne[0];
8867
8893
 
8868
- const int64_t ne10 = src1->ne[0];
8869
- const int64_t ne11 = src1->ne[1];
8870
- const int64_t ne12 = src1->ne[2];
8871
- const int64_t ne13 = src1->ne[3];
8872
-
8873
- const size_t nb10 = src1->nb[0];
8874
- const size_t nb11 = src1->nb[1];
8875
- const size_t nb12 = src1->nb[2];
8876
- const size_t nb13 = src1->nb[3];
8894
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
8895
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
8877
8896
 
8878
8897
  // src0 and dst as viewed during acc
8879
8898
  const size_t nb0 = ggml_element_size(src0);
@@ -8962,24 +8981,8 @@ static void ggml_compute_forward_sub_f32(
8962
8981
  }
8963
8982
 
8964
8983
  const int nr = ggml_nrows(src0);
8965
- const int64_t ne0 = src0->ne[0];
8966
- const int64_t ne1 = src0->ne[1];
8967
- const int64_t ne2 = src0->ne[2];
8968
-
8969
- const size_t nb00 = src0->nb[0];
8970
- const size_t nb01 = src0->nb[1];
8971
- const size_t nb02 = src0->nb[2];
8972
- const size_t nb03 = src0->nb[3];
8973
8984
 
8974
- const size_t nb10 = src1->nb[0];
8975
- const size_t nb11 = src1->nb[1];
8976
- const size_t nb12 = src1->nb[2];
8977
- const size_t nb13 = src1->nb[3];
8978
-
8979
- const size_t nb0 = dst->nb[0];
8980
- const size_t nb1 = dst->nb[1];
8981
- const size_t nb2 = dst->nb[2];
8982
- const size_t nb3 = dst->nb[3];
8985
+ GGML_TENSOR_BINARY_OP_LOCALS;
8983
8986
 
8984
8987
  GGML_ASSERT( nb0 == sizeof(float));
8985
8988
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9069,29 +9072,7 @@ static void ggml_compute_forward_mul_f32(
9069
9072
 
9070
9073
  const int64_t nr = ggml_nrows(src0);
9071
9074
 
9072
- const int64_t ne00 = src0->ne[0];
9073
- const int64_t ne01 = src0->ne[1];
9074
- const int64_t ne02 = src0->ne[2];
9075
-
9076
- const int64_t ne10 = src1->ne[0];
9077
- const int64_t ne11 = src1->ne[1];
9078
- const int64_t ne12 = src1->ne[2];
9079
- const int64_t ne13 = src1->ne[3];
9080
-
9081
- const size_t nb00 = src0->nb[0];
9082
- const size_t nb01 = src0->nb[1];
9083
- const size_t nb02 = src0->nb[2];
9084
- const size_t nb03 = src0->nb[3];
9085
-
9086
- const size_t nb10 = src1->nb[0];
9087
- const size_t nb11 = src1->nb[1];
9088
- const size_t nb12 = src1->nb[2];
9089
- const size_t nb13 = src1->nb[3];
9090
-
9091
- const size_t nb0 = dst->nb[0];
9092
- const size_t nb1 = dst->nb[1];
9093
- const size_t nb2 = dst->nb[2];
9094
- const size_t nb3 = dst->nb[3];
9075
+ GGML_TENSOR_BINARY_OP_LOCALS;
9095
9076
 
9096
9077
  GGML_ASSERT( nb0 == sizeof(float));
9097
9078
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9179,24 +9160,8 @@ static void ggml_compute_forward_div_f32(
9179
9160
  }
9180
9161
 
9181
9162
  const int nr = ggml_nrows(src0);
9182
- const int64_t ne0 = src0->ne[0];
9183
- const int64_t ne1 = src0->ne[1];
9184
- const int64_t ne2 = src0->ne[2];
9185
-
9186
- const size_t nb00 = src0->nb[0];
9187
- const size_t nb01 = src0->nb[1];
9188
- const size_t nb02 = src0->nb[2];
9189
- const size_t nb03 = src0->nb[3];
9190
-
9191
- const size_t nb10 = src1->nb[0];
9192
- const size_t nb11 = src1->nb[1];
9193
- const size_t nb12 = src1->nb[2];
9194
- const size_t nb13 = src1->nb[3];
9195
9163
 
9196
- const size_t nb0 = dst->nb[0];
9197
- const size_t nb1 = dst->nb[1];
9198
- const size_t nb2 = dst->nb[2];
9199
- const size_t nb3 = dst->nb[3];
9164
+ GGML_TENSOR_BINARY_OP_LOCALS;
9200
9165
 
9201
9166
  GGML_ASSERT( nb0 == sizeof(float));
9202
9167
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9403,14 +9368,8 @@ static void ggml_compute_forward_sum_f32(
9403
9368
  assert(ggml_is_scalar(dst));
9404
9369
  assert(src0->nb[0] == sizeof(float));
9405
9370
 
9406
- const int64_t ne00 = src0->ne[0];
9407
- const int64_t ne01 = src0->ne[1];
9408
- const int64_t ne02 = src0->ne[2];
9409
- const int64_t ne03 = src0->ne[3];
9410
-
9411
- const size_t nb01 = src0->nb[1];
9412
- const size_t nb02 = src0->nb[2];
9413
- const size_t nb03 = src0->nb[3];
9371
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
9372
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
9414
9373
 
9415
9374
  ggml_float sum = 0;
9416
9375
  ggml_float row_sum = 0;
@@ -9459,29 +9418,13 @@ static void ggml_compute_forward_sum_rows_f32(
9459
9418
  GGML_ASSERT(src0->nb[0] == sizeof(float));
9460
9419
  GGML_ASSERT(dst->nb[0] == sizeof(float));
9461
9420
 
9462
- const int64_t ne00 = src0->ne[0];
9463
- const int64_t ne01 = src0->ne[1];
9464
- const int64_t ne02 = src0->ne[2];
9465
- const int64_t ne03 = src0->ne[3];
9466
-
9467
- const int64_t ne0 = dst->ne[0];
9468
- const int64_t ne1 = dst->ne[1];
9469
- const int64_t ne2 = dst->ne[2];
9470
- const int64_t ne3 = dst->ne[3];
9421
+ GGML_TENSOR_UNARY_OP_LOCALS;
9471
9422
 
9472
9423
  GGML_ASSERT(ne0 == 1);
9473
9424
  GGML_ASSERT(ne1 == ne01);
9474
9425
  GGML_ASSERT(ne2 == ne02);
9475
9426
  GGML_ASSERT(ne3 == ne03);
9476
9427
 
9477
- const size_t nb01 = src0->nb[1];
9478
- const size_t nb02 = src0->nb[2];
9479
- const size_t nb03 = src0->nb[3];
9480
-
9481
- const size_t nb1 = dst->nb[1];
9482
- const size_t nb2 = dst->nb[2];
9483
- const size_t nb3 = dst->nb[3];
9484
-
9485
9428
  for (int64_t i3 = 0; i3 < ne03; i3++) {
9486
9429
  for (int64_t i2 = 0; i2 < ne02; i2++) {
9487
9430
  for (int64_t i1 = 0; i1 < ne01; i1++) {
@@ -9525,19 +9468,7 @@ static void ggml_compute_forward_mean_f32(
9525
9468
 
9526
9469
  assert(src0->nb[0] == sizeof(float));
9527
9470
 
9528
- const int64_t ne00 = src0->ne[0];
9529
- const int64_t ne01 = src0->ne[1];
9530
- const int64_t ne02 = src0->ne[2];
9531
- const int64_t ne03 = src0->ne[3];
9532
-
9533
- const size_t nb01 = src0->nb[1];
9534
- const size_t nb02 = src0->nb[2];
9535
- const size_t nb03 = src0->nb[3];
9536
-
9537
- const int64_t ne0 = dst->ne[0];
9538
- const int64_t ne1 = dst->ne[1];
9539
- const int64_t ne2 = dst->ne[2];
9540
- const int64_t ne3 = dst->ne[3];
9471
+ GGML_TENSOR_UNARY_OP_LOCALS;
9541
9472
 
9542
9473
  assert(ne0 == 1);
9543
9474
  assert(ne1 == ne01);
@@ -9549,10 +9480,6 @@ static void ggml_compute_forward_mean_f32(
9549
9480
  UNUSED(ne2);
9550
9481
  UNUSED(ne3);
9551
9482
 
9552
- const size_t nb1 = dst->nb[1];
9553
- const size_t nb2 = dst->nb[2];
9554
- const size_t nb3 = dst->nb[3];
9555
-
9556
9483
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9557
9484
  for (int64_t i02 = 0; i02 < ne02; i02++) {
9558
9485
  for (int64_t i01 = 0; i01 < ne01; i01++) {
@@ -9582,38 +9509,66 @@ static void ggml_compute_forward_mean(
9582
9509
  }
9583
9510
  }
9584
9511
 
9585
- // ggml_compute_forward_repeat
9512
+ // ggml_compute_forward_argmax
9586
9513
 
9587
- static void ggml_compute_forward_repeat_f32(
9514
+ static void ggml_compute_forward_argmax_f32(
9588
9515
  const struct ggml_compute_params * params,
9589
9516
  const struct ggml_tensor * src0,
9590
9517
  struct ggml_tensor * dst) {
9591
- GGML_ASSERT(params->ith == 0);
9592
- GGML_ASSERT(ggml_can_repeat(src0, dst));
9518
+ assert(params->ith == 0);
9593
9519
 
9594
9520
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9595
9521
  return;
9596
9522
  }
9597
9523
 
9598
- const int64_t ne0 = dst->ne[0];
9599
- const int64_t ne1 = dst->ne[1];
9600
- const int64_t ne2 = dst->ne[2];
9601
- const int64_t ne3 = dst->ne[3];
9524
+ assert(src0->nb[0] == sizeof(float));
9525
+ assert(dst->nb[0] == sizeof(float));
9602
9526
 
9603
9527
  const int64_t ne00 = src0->ne[0];
9604
9528
  const int64_t ne01 = src0->ne[1];
9605
- const int64_t ne02 = src0->ne[2];
9606
- const int64_t ne03 = src0->ne[3];
9607
-
9608
- const size_t nb0 = dst->nb[0];
9609
- const size_t nb1 = dst->nb[1];
9610
- const size_t nb2 = dst->nb[2];
9611
- const size_t nb3 = dst->nb[3];
9612
9529
 
9613
- const size_t nb00 = src0->nb[0];
9614
9530
  const size_t nb01 = src0->nb[1];
9615
- const size_t nb02 = src0->nb[2];
9616
- const size_t nb03 = src0->nb[3];
9531
+ const size_t nb0 = dst->nb[0];
9532
+
9533
+ for (int64_t i1 = 0; i1 < ne01; i1++) {
9534
+ float * src = (float *) ((char *) src0->data + i1*nb01);
9535
+ int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0);
9536
+ int v = 0;
9537
+ ggml_vec_argmax_f32(ne00, &v, src);
9538
+ dst_[0] = v;
9539
+ }
9540
+ }
9541
+
9542
+ static void ggml_compute_forward_argmax(
9543
+ const struct ggml_compute_params * params,
9544
+ const struct ggml_tensor * src0,
9545
+ struct ggml_tensor * dst) {
9546
+ switch (src0->type) {
9547
+ case GGML_TYPE_F32:
9548
+ {
9549
+ ggml_compute_forward_argmax_f32(params, src0, dst);
9550
+ } break;
9551
+ default:
9552
+ {
9553
+ GGML_ASSERT(false);
9554
+ } break;
9555
+ }
9556
+ }
9557
+
9558
+ // ggml_compute_forward_repeat
9559
+
9560
+ static void ggml_compute_forward_repeat_f32(
9561
+ const struct ggml_compute_params * params,
9562
+ const struct ggml_tensor * src0,
9563
+ struct ggml_tensor * dst) {
9564
+ GGML_ASSERT(params->ith == 0);
9565
+ GGML_ASSERT(ggml_can_repeat(src0, dst));
9566
+
9567
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9568
+ return;
9569
+ }
9570
+
9571
+ GGML_TENSOR_UNARY_OP_LOCALS;
9617
9572
 
9618
9573
  // guaranteed to be an integer due to the check in ggml_can_repeat
9619
9574
  const int nr0 = (int)(ne0/ne00);
@@ -9674,25 +9629,7 @@ static void ggml_compute_forward_repeat_back_f32(
9674
9629
  return;
9675
9630
  }
9676
9631
 
9677
- const int64_t ne0 = dst->ne[0];
9678
- const int64_t ne1 = dst->ne[1];
9679
- const int64_t ne2 = dst->ne[2];
9680
- const int64_t ne3 = dst->ne[3];
9681
-
9682
- const int64_t ne00 = src0->ne[0];
9683
- const int64_t ne01 = src0->ne[1];
9684
- const int64_t ne02 = src0->ne[2];
9685
- const int64_t ne03 = src0->ne[3];
9686
-
9687
- const size_t nb0 = dst->nb[0];
9688
- const size_t nb1 = dst->nb[1];
9689
- const size_t nb2 = dst->nb[2];
9690
- const size_t nb3 = dst->nb[3];
9691
-
9692
- const size_t nb00 = src0->nb[0];
9693
- const size_t nb01 = src0->nb[1];
9694
- const size_t nb02 = src0->nb[2];
9695
- const size_t nb03 = src0->nb[3];
9632
+ GGML_TENSOR_UNARY_OP_LOCALS;
9696
9633
 
9697
9634
  // guaranteed to be an integer due to the check in ggml_can_repeat
9698
9635
  const int nr0 = (int)(ne00/ne0);
@@ -9922,6 +9859,90 @@ static void ggml_compute_forward_step(
9922
9859
  }
9923
9860
  }
9924
9861
 
9862
+ // ggml_compute_forward_tanh
9863
+
9864
+ static void ggml_compute_forward_tanh_f32(
9865
+ const struct ggml_compute_params * params,
9866
+ const struct ggml_tensor * src0,
9867
+ struct ggml_tensor * dst) {
9868
+ assert(params->ith == 0);
9869
+ assert(ggml_are_same_shape(src0, dst));
9870
+
9871
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9872
+ return;
9873
+ }
9874
+
9875
+ const int n = ggml_nrows(src0);
9876
+ const int nc = src0->ne[0];
9877
+
9878
+ assert(dst->nb[0] == sizeof(float));
9879
+ assert(src0->nb[0] == sizeof(float));
9880
+
9881
+ for (int i = 0; i < n; i++) {
9882
+ ggml_vec_tanh_f32(nc,
9883
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
9884
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
9885
+ }
9886
+ }
9887
+
9888
+ static void ggml_compute_forward_tanh(
9889
+ const struct ggml_compute_params * params,
9890
+ const struct ggml_tensor * src0,
9891
+ struct ggml_tensor * dst) {
9892
+ switch (src0->type) {
9893
+ case GGML_TYPE_F32:
9894
+ {
9895
+ ggml_compute_forward_tanh_f32(params, src0, dst);
9896
+ } break;
9897
+ default:
9898
+ {
9899
+ GGML_ASSERT(false);
9900
+ } break;
9901
+ }
9902
+ }
9903
+
9904
+ // ggml_compute_forward_elu
9905
+
9906
+ static void ggml_compute_forward_elu_f32(
9907
+ const struct ggml_compute_params * params,
9908
+ const struct ggml_tensor * src0,
9909
+ struct ggml_tensor * dst) {
9910
+ assert(params->ith == 0);
9911
+ assert(ggml_are_same_shape(src0, dst));
9912
+
9913
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9914
+ return;
9915
+ }
9916
+
9917
+ const int n = ggml_nrows(src0);
9918
+ const int nc = src0->ne[0];
9919
+
9920
+ assert(dst->nb[0] == sizeof(float));
9921
+ assert(src0->nb[0] == sizeof(float));
9922
+
9923
+ for (int i = 0; i < n; i++) {
9924
+ ggml_vec_elu_f32(nc,
9925
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
9926
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
9927
+ }
9928
+ }
9929
+
9930
+ static void ggml_compute_forward_elu(
9931
+ const struct ggml_compute_params * params,
9932
+ const struct ggml_tensor * src0,
9933
+ struct ggml_tensor * dst) {
9934
+ switch (src0->type) {
9935
+ case GGML_TYPE_F32:
9936
+ {
9937
+ ggml_compute_forward_elu_f32(params, src0, dst);
9938
+ } break;
9939
+ default:
9940
+ {
9941
+ GGML_ASSERT(false);
9942
+ } break;
9943
+ }
9944
+ }
9945
+
9925
9946
  // ggml_compute_forward_relu
9926
9947
 
9927
9948
  static void ggml_compute_forward_relu_f32(
@@ -10223,18 +10244,7 @@ static void ggml_compute_forward_norm_f32(
10223
10244
  const int ith = params->ith;
10224
10245
  const int nth = params->nth;
10225
10246
 
10226
- const int64_t ne00 = src0->ne[0];
10227
- const int64_t ne01 = src0->ne[1];
10228
- const int64_t ne02 = src0->ne[2];
10229
- const int64_t ne03 = src0->ne[3];
10230
-
10231
- const size_t nb01 = src0->nb[1];
10232
- const size_t nb02 = src0->nb[2];
10233
- const size_t nb03 = src0->nb[3];
10234
-
10235
- const size_t nb1 = dst->nb[1];
10236
- const size_t nb2 = dst->nb[2];
10237
- const size_t nb3 = dst->nb[3];
10247
+ GGML_TENSOR_UNARY_OP_LOCALS;
10238
10248
 
10239
10249
  const float eps = 1e-5f; // TODO: make this a parameter
10240
10250
 
@@ -10300,18 +10310,7 @@ static void ggml_compute_forward_rms_norm_f32(
10300
10310
  const int ith = params->ith;
10301
10311
  const int nth = params->nth;
10302
10312
 
10303
- const int64_t ne00 = src0->ne[0];
10304
- const int64_t ne01 = src0->ne[1];
10305
- const int64_t ne02 = src0->ne[2];
10306
- const int64_t ne03 = src0->ne[3];
10307
-
10308
- const size_t nb01 = src0->nb[1];
10309
- const size_t nb02 = src0->nb[2];
10310
- const size_t nb03 = src0->nb[3];
10311
-
10312
- const size_t nb1 = dst->nb[1];
10313
- const size_t nb2 = dst->nb[2];
10314
- const size_t nb3 = dst->nb[3];
10313
+ GGML_TENSOR_UNARY_OP_LOCALS;
10315
10314
 
10316
10315
  const float eps = 1e-6f; // TODO: make this a parameter
10317
10316
 
@@ -10376,22 +10375,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
10376
10375
  const int ith = params->ith;
10377
10376
  const int nth = params->nth;
10378
10377
 
10379
- const int64_t ne00 = src0->ne[0];
10380
- const int64_t ne01 = src0->ne[1];
10381
- const int64_t ne02 = src0->ne[2];
10382
- const int64_t ne03 = src0->ne[3];
10383
-
10384
- const size_t nb01 = src0->nb[1];
10385
- const size_t nb02 = src0->nb[2];
10386
- const size_t nb03 = src0->nb[3];
10387
-
10388
- const size_t nb11 = src1->nb[1];
10389
- const size_t nb12 = src1->nb[2];
10390
- const size_t nb13 = src1->nb[3];
10391
-
10392
- const size_t nb1 = dst->nb[1];
10393
- const size_t nb2 = dst->nb[2];
10394
- const size_t nb3 = dst->nb[3];
10378
+ GGML_TENSOR_BINARY_OP_LOCALS;
10395
10379
 
10396
10380
  const float eps = 1e-6f; // TODO: make this a parameter
10397
10381
 
@@ -10541,416 +10525,45 @@ static void ggml_compute_forward_rms_norm_back(
10541
10525
  {
10542
10526
  ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
10543
10527
  } break;
10544
- default:
10545
- {
10546
- GGML_ASSERT(false);
10547
- } break;
10548
- }
10549
- }
10550
-
10551
-
10552
- // ggml_compute_forward_mul_mat
10553
-
10554
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10555
- // helper function to determine if it is better to use BLAS or not
10556
- // for large matrices, BLAS is faster
10557
- static bool ggml_compute_forward_mul_mat_use_blas(
10558
- const struct ggml_tensor * src0,
10559
- const struct ggml_tensor * src1,
10560
- struct ggml_tensor * dst) {
10561
- //const int64_t ne00 = src0->ne[0];
10562
- //const int64_t ne01 = src0->ne[1];
10563
-
10564
- const int64_t ne10 = src1->ne[0];
10565
-
10566
- const int64_t ne0 = dst->ne[0];
10567
- const int64_t ne1 = dst->ne[1];
10568
-
10569
- // TODO: find the optimal values for these
10570
- if (ggml_is_contiguous(src0) &&
10571
- ggml_is_contiguous(src1) &&
10572
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
10573
-
10574
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
10575
- return true;
10576
- }
10577
-
10578
- return false;
10579
- }
10580
- #endif
10581
-
10582
- static void ggml_compute_forward_mul_mat_f32(
10583
- const struct ggml_compute_params * params,
10584
- const struct ggml_tensor * src0,
10585
- const struct ggml_tensor * src1,
10586
- struct ggml_tensor * dst) {
10587
- int64_t t0 = ggml_perf_time_us();
10588
- UNUSED(t0);
10589
-
10590
- const int64_t ne00 = src0->ne[0];
10591
- const int64_t ne01 = src0->ne[1];
10592
- const int64_t ne02 = src0->ne[2];
10593
- const int64_t ne03 = src0->ne[3];
10594
-
10595
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10596
- const int64_t ne10 = src1->ne[0];
10597
- #endif
10598
- const int64_t ne11 = src1->ne[1];
10599
- #ifndef NDEBUG
10600
- const int64_t ne12 = src1->ne[2];
10601
- const int64_t ne13 = src1->ne[3];
10602
-
10603
- const int64_t ne0 = dst->ne[0];
10604
- const int64_t ne1 = dst->ne[1];
10605
- const int64_t ne2 = dst->ne[2];
10606
- const int64_t ne3 = dst->ne[3];
10607
-
10608
- const int nb00 = src0->nb[0];
10609
- #endif
10610
- const int nb01 = src0->nb[1];
10611
- const int nb02 = src0->nb[2];
10612
- const int nb03 = src0->nb[3];
10613
-
10614
- #ifndef NDEBUG
10615
- const int nb10 = src1->nb[0];
10616
- #endif
10617
- const int nb11 = src1->nb[1];
10618
- const int nb12 = src1->nb[2];
10619
- const int nb13 = src1->nb[3];
10620
-
10621
- const int nb0 = dst->nb[0];
10622
- const int nb1 = dst->nb[1];
10623
- const int nb2 = dst->nb[2];
10624
- const int nb3 = dst->nb[3];
10625
-
10626
- const int ith = params->ith;
10627
- const int nth = params->nth;
10628
-
10629
- assert(ne02 == ne12);
10630
- assert(ne03 == ne13);
10631
- assert(ne2 == ne12);
10632
- assert(ne3 == ne13);
10633
-
10634
- // we don't support permuted src0 or src1
10635
- assert(nb00 == sizeof(float));
10636
- assert(nb10 == sizeof(float));
10637
-
10638
- // dst cannot be transposed or permuted
10639
- assert(nb0 == sizeof(float));
10640
- assert(nb0 <= nb1);
10641
- assert(nb1 <= nb2);
10642
- assert(nb2 <= nb3);
10643
-
10644
- assert(ne0 == ne01);
10645
- assert(ne1 == ne11);
10646
- assert(ne2 == ne02);
10647
- assert(ne3 == ne03);
10648
-
10649
- // nb01 >= nb00 - src0 is not transposed
10650
- // compute by src0 rows
10651
-
10652
- #if defined(GGML_USE_CLBLAST)
10653
- if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10654
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10655
- ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10656
- }
10657
- return;
10658
- }
10659
- #endif
10660
-
10661
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10662
- if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
10663
- if (params->ith != 0) {
10664
- return;
10665
- }
10666
-
10667
- if (params->type == GGML_TASK_INIT) {
10668
- return;
10669
- }
10670
-
10671
- if (params->type == GGML_TASK_FINALIZE) {
10672
- return;
10673
- }
10674
-
10675
- for (int64_t i03 = 0; i03 < ne03; i03++) {
10676
- for (int64_t i02 = 0; i02 < ne02; i02++) {
10677
- const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
10678
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
10679
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
10680
-
10681
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
10682
- ne11, ne01, ne10,
10683
- 1.0f, y, ne10,
10684
- x, ne00,
10685
- 0.0f, d, ne01);
10686
- }
10687
- }
10688
- //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
10689
-
10690
- return;
10691
- }
10692
- #endif
10693
-
10694
- if (params->type == GGML_TASK_INIT) {
10695
- return;
10696
- }
10697
-
10698
- if (params->type == GGML_TASK_FINALIZE) {
10699
- return;
10700
- }
10701
-
10702
- // parallelize by src0 rows using ggml_vec_dot_f32
10703
-
10704
- // total rows in src0
10705
- const int nr = ne01*ne02*ne03;
10706
-
10707
- // rows per thread
10708
- const int dr = (nr + nth - 1)/nth;
10709
-
10710
- // row range for this thread
10711
- const int ir0 = dr*ith;
10712
- const int ir1 = MIN(ir0 + dr, nr);
10713
-
10714
- for (int ir = ir0; ir < ir1; ++ir) {
10715
- // src0 indices
10716
- const int i03 = ir/(ne02*ne01);
10717
- const int i02 = (ir - i03*ne02*ne01)/ne01;
10718
- const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
10719
-
10720
- for (int64_t ic = 0; ic < ne11; ++ic) {
10721
- // src1 indices
10722
- const int i13 = i03;
10723
- const int i12 = i02;
10724
- const int i11 = ic;
10725
-
10726
- // dst indices
10727
- const int i0 = i01;
10728
- const int i1 = i11;
10729
- const int i2 = i02;
10730
- const int i3 = i03;
10731
-
10732
- ggml_vec_dot_f32(ne00,
10733
- (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
10734
- (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
10735
- (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
10736
- }
10737
- }
10738
-
10739
- //int64_t t1 = ggml_perf_time_us();
10740
- //static int64_t acc = 0;
10741
- //acc += t1 - t0;
10742
- //if (t1 - t0 > 10) {
10743
- // printf("\n");
10744
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
10745
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
10746
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
10747
- // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
10748
-
10749
- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
10750
- //}
10751
- }
10752
-
10753
- static void ggml_compute_forward_mul_mat_f16_f32(
10754
- const struct ggml_compute_params * params,
10755
- const struct ggml_tensor * src0,
10756
- const struct ggml_tensor * src1,
10757
- struct ggml_tensor * dst) {
10758
- int64_t t0 = ggml_perf_time_us();
10759
- UNUSED(t0);
10760
-
10761
- const int64_t ne00 = src0->ne[0];
10762
- const int64_t ne01 = src0->ne[1];
10763
- const int64_t ne02 = src0->ne[2];
10764
- const int64_t ne03 = src0->ne[3];
10765
-
10766
- const int64_t ne10 = src1->ne[0];
10767
- const int64_t ne11 = src1->ne[1];
10768
- const int64_t ne12 = src1->ne[2];
10769
- const int64_t ne13 = src1->ne[3];
10770
-
10771
- const int64_t ne0 = dst->ne[0];
10772
- const int64_t ne1 = dst->ne[1];
10773
- const int64_t ne2 = dst->ne[2];
10774
- const int64_t ne3 = dst->ne[3];
10775
- //const int64_t ne = ne0*ne1*ne2*ne3;
10776
-
10777
- const int nb00 = src0->nb[0];
10778
- const int nb01 = src0->nb[1];
10779
- const int nb02 = src0->nb[2];
10780
- const int nb03 = src0->nb[3];
10781
-
10782
- const int nb10 = src1->nb[0];
10783
- const int nb11 = src1->nb[1];
10784
- const int nb12 = src1->nb[2];
10785
- const int nb13 = src1->nb[3];
10786
-
10787
- const int nb0 = dst->nb[0];
10788
- const int nb1 = dst->nb[1];
10789
- const int nb2 = dst->nb[2];
10790
- const int nb3 = dst->nb[3];
10791
-
10792
- const int ith = params->ith;
10793
- const int nth = params->nth;
10794
-
10795
- GGML_ASSERT(ne02 == ne12);
10796
- GGML_ASSERT(ne03 == ne13);
10797
- GGML_ASSERT(ne2 == ne12);
10798
- GGML_ASSERT(ne3 == ne13);
10799
-
10800
- // TODO: we don't support permuted src0
10801
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
10802
-
10803
- // dst cannot be transposed or permuted
10804
- GGML_ASSERT(nb0 == sizeof(float));
10805
- GGML_ASSERT(nb0 <= nb1);
10806
- GGML_ASSERT(nb1 <= nb2);
10807
- GGML_ASSERT(nb2 <= nb3);
10808
-
10809
- GGML_ASSERT(ne0 == ne01);
10810
- GGML_ASSERT(ne1 == ne11);
10811
- GGML_ASSERT(ne2 == ne02);
10812
- GGML_ASSERT(ne3 == ne03);
10813
-
10814
- // nb01 >= nb00 - src0 is not transposed
10815
- // compute by src0 rows
10816
-
10817
- #if defined(GGML_USE_CLBLAST)
10818
- if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10819
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10820
- ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10821
- }
10822
- return;
10823
- }
10824
- #endif
10825
-
10826
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10827
- if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
10828
- GGML_ASSERT(nb10 == sizeof(float));
10829
-
10830
- if (params->ith != 0) {
10831
- return;
10832
- }
10833
-
10834
- if (params->type == GGML_TASK_INIT) {
10835
- return;
10836
- }
10837
-
10838
- if (params->type == GGML_TASK_FINALIZE) {
10839
- return;
10840
- }
10841
-
10842
- for (int64_t i03 = 0; i03 < ne03; i03++) {
10843
- for (int64_t i02 = 0; i02 < ne02; i02++) {
10844
- float * const wdata = params->wdata;
10845
- {
10846
- size_t id = 0;
10847
- for (int64_t i01 = 0; i01 < ne01; ++i01) {
10848
- for (int64_t i00 = 0; i00 < ne00; ++i00) {
10849
- wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
10850
- }
10851
- }
10852
-
10853
- assert(id*sizeof(float) <= params->wsize);
10854
- }
10855
-
10856
- const float * x = wdata;
10857
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
10858
-
10859
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
10860
-
10861
- // zT = y * xT
10862
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
10863
- ne11, ne01, ne10,
10864
- 1.0f, y, ne10,
10865
- x, ne00,
10866
- 0.0f, d, ne01);
10867
- }
10868
- }
10869
-
10870
- /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
10871
-
10872
- return;
10873
- }
10874
- #endif
10875
-
10876
- if (params->type == GGML_TASK_INIT) {
10877
- ggml_fp16_t * const wdata = params->wdata;
10878
-
10879
- size_t id = 0;
10880
- for (int64_t i13 = 0; i13 < ne13; ++i13) {
10881
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
10882
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
10883
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
10884
- wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
10885
- }
10886
- }
10887
- }
10888
- }
10889
-
10890
- GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize);
10891
-
10892
- return;
10893
- }
10894
-
10895
- if (params->type == GGML_TASK_FINALIZE) {
10896
- return;
10528
+ default:
10529
+ {
10530
+ GGML_ASSERT(false);
10531
+ } break;
10897
10532
  }
10533
+ }
10898
10534
 
10899
- // fp16 -> half the size, so divide by 2
10900
- // TODO: do not support transposed src1
10901
- assert(nb10/2 == sizeof(ggml_fp16_t));
10902
-
10903
- // parallelize by src0 rows using ggml_vec_dot_f16
10904
-
10905
- // total rows in src0
10906
- const int nr = ne01*ne02*ne03;
10907
-
10908
- // rows per thread
10909
- const int dr = (nr + nth - 1)/nth;
10910
-
10911
- // row range for this thread
10912
- const int ir0 = dr*ith;
10913
- const int ir1 = MIN(ir0 + dr, nr);
10914
-
10915
- ggml_fp16_t * wdata = params->wdata;
10916
10535
 
10917
- for (int ir = ir0; ir < ir1; ++ir) {
10918
- // src0 indices
10919
- const int i03 = ir/(ne02*ne01);
10920
- const int i02 = (ir - i03*ne02*ne01)/ne01;
10921
- const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
10536
+ // ggml_compute_forward_mul_mat
10922
10537
 
10923
- const int i13 = i03;
10924
- const int i12 = i02;
10538
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10539
+ // helper function to determine if it is better to use BLAS or not
10540
+ // for large matrices, BLAS is faster
10541
+ static bool ggml_compute_forward_mul_mat_use_blas(
10542
+ const struct ggml_tensor * src0,
10543
+ const struct ggml_tensor * src1,
10544
+ struct ggml_tensor * dst) {
10545
+ //const int64_t ne00 = src0->ne[0];
10546
+ //const int64_t ne01 = src0->ne[1];
10925
10547
 
10926
- const int i0 = i01;
10927
- const int i2 = i02;
10928
- const int i3 = i03;
10548
+ const int64_t ne10 = src1->ne[0];
10929
10549
 
10930
- ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
10931
- ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00;
10550
+ const int64_t ne0 = dst->ne[0];
10551
+ const int64_t ne1 = dst->ne[1];
10932
10552
 
10933
- float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
10553
+ // TODO: find the optimal values for these
10554
+ if (ggml_is_contiguous(src0) &&
10555
+ ggml_is_contiguous(src1) &&
10556
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
10934
10557
 
10935
- for (int64_t ic = 0; ic < ne11; ++ic) {
10936
- ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
10937
- }
10558
+ /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
10559
+ return true;
10938
10560
  }
10939
10561
 
10940
- //int64_t t1 = ggml_time_us();
10941
- //static int64_t acc = 0;
10942
- //acc += t1 - t0;
10943
- //if (t1 - t0 > 10) {
10944
- // printf("\n");
10945
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
10946
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
10947
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
10948
-
10949
- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
10950
- //}
10562
+ return false;
10951
10563
  }
10564
+ #endif
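Note on the relocated BLAS heuristic above: the rule is unchanged from the removed copy earlier in this diff. cblas_sgemm is only worth dispatching when both operands are contiguous and every dimension is reasonably large (ne0, ne1 and ne10 all at least 32). As a concrete example, single-token decoding produces a dst with ne1 = 1, so it stays on the hand-written vec_dot kernels, while a 512-token prompt against a 4096x4096 weight (ne0 = 4096, ne1 = 512, ne10 = 4096) clears the threshold and is routed to BLAS.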
10952
10565
 
10953
- static void ggml_compute_forward_mul_mat_q_f32(
10566
+ static void ggml_compute_forward_mul_mat(
10954
10567
  const struct ggml_compute_params * params,
10955
10568
  const struct ggml_tensor * src0,
10956
10569
  const struct ggml_tensor * src1,
@@ -10958,35 +10571,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
10958
10571
  int64_t t0 = ggml_perf_time_us();
10959
10572
  UNUSED(t0);
10960
10573
 
10961
- const int64_t ne00 = src0->ne[0];
10962
- const int64_t ne01 = src0->ne[1];
10963
- const int64_t ne02 = src0->ne[2];
10964
- const int64_t ne03 = src0->ne[3];
10965
-
10966
- const int64_t ne10 = src1->ne[0];
10967
- const int64_t ne11 = src1->ne[1];
10968
- const int64_t ne12 = src1->ne[2];
10969
- const int64_t ne13 = src1->ne[3];
10970
-
10971
- const int64_t ne0 = dst->ne[0];
10972
- const int64_t ne1 = dst->ne[1];
10973
- const int64_t ne2 = dst->ne[2];
10974
- const int64_t ne3 = dst->ne[3];
10975
-
10976
- const int nb00 = src0->nb[0];
10977
- const int nb01 = src0->nb[1];
10978
- const int nb02 = src0->nb[2];
10979
- const int nb03 = src0->nb[3];
10980
-
10981
- const int nb10 = src1->nb[0];
10982
- const int nb11 = src1->nb[1];
10983
- const int nb12 = src1->nb[2];
10984
- const int nb13 = src1->nb[3];
10985
-
10986
- const int nb0 = dst->nb[0];
10987
- const int nb1 = dst->nb[1];
10988
- const int nb2 = dst->nb[2];
10989
- const int nb3 = dst->nb[3];
10574
+ GGML_TENSOR_BINARY_OP_LOCALS;
10990
10575
 
10991
10576
  const int ith = params->ith;
10992
10577
  const int nth = params->nth;
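Everywhere in this diff the hand-written blocks of ne*/nb* locals collapse into a single GGML_TENSOR_*_OP_LOCALS line. The building block they rely on, GGML_TENSOR_LOCALS, is defined outside the hunks shown here; a plausible sketch, assuming it token-pastes the dimension index onto the given prefix, is:

    // Hypothetical definition (the real one is not part of this diff):
    // GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) -> const int64_t ne00..ne03 read from src0->ne
    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
        const type prefix##0 = (pointer)->array[0]; \
        const type prefix##1 = (pointer)->array[1]; \
        const type prefix##2 = (pointer)->array[2]; \
        const type prefix##3 = (pointer)->array[3];

A real definition would likely also need to silence unused-variable warnings, since not every kernel touches every dimension the macro declares.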
@@ -10997,12 +10582,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
10997
10582
  GGML_ASSERT(ne3 == ne13);
10998
10583
 
10999
10584
  const enum ggml_type type = src0->type;
11000
- quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot;
11001
- vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q;
11002
- enum ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type;
10585
+
10586
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
10587
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
10588
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
11003
10589
 
11004
10590
  // we don't support permuted src0 or src1
11005
- GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
10591
+ GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
11006
10592
  GGML_ASSERT(nb10 == sizeof(float));
11007
10593
 
11008
10594
  // dst cannot be transposed or permuted
@@ -11042,27 +10628,27 @@ static void ggml_compute_forward_mul_mat_q_f32(
11042
10628
  return;
11043
10629
  }
11044
10630
 
11045
- float * const wdata = params->wdata;
11046
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
11047
-
11048
10631
  for (int64_t i03 = 0; i03 < ne03; i03++) {
11049
10632
  for (int64_t i02 = 0; i02 < ne02; i02++) {
10633
+ const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
11050
10634
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
11051
10635
 
11052
10636
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
11053
10637
 
11054
- {
10638
+ if (type != GGML_TYPE_F32) {
10639
+ float * const wdata = params->wdata;
10640
+ ggml_to_float_t const to_float = type_traits[type].to_float;
10641
+
11055
10642
  size_t id = 0;
11056
10643
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
11057
- dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
10644
+ to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
11058
10645
  id += ne00;
11059
10646
  }
11060
10647
 
11061
10648
  assert(id*sizeof(float) <= params->wsize);
10649
+ x = wdata;
11062
10650
  }
11063
10651
 
11064
- const float * x = wdata;
11065
-
11066
10652
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
11067
10653
  ne11, ne01, ne10,
11068
10654
  1.0f, y, ne10,
@@ -11078,14 +10664,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
11078
10664
  #endif
11079
10665
 
11080
10666
  if (params->type == GGML_TASK_INIT) {
11081
- char * wdata = params->wdata;
11082
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11083
-
11084
- for (int64_t i13 = 0; i13 < ne13; ++i13) {
11085
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
11086
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
11087
- quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
11088
- wdata += row_size;
10667
+ if (src1->type != vec_dot_type) {
10668
+ char * wdata = params->wdata;
10669
+ const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
10670
+
10671
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
10672
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10673
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10674
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
10675
+ wdata += row_size;
10676
+ }
11089
10677
  }
11090
10678
  }
11091
10679
  }
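With this change the INIT pass only converts src1 when its type differs from the vec_dot_type of src0, and the scratch requirement follows directly from the block layout. A worked example, assuming a Q4_0 src0 whose vec_dot_type is Q8_0 and assuming 34-byte Q8_0 blocks (a 2-byte scale plus 32 int8 quants): with ne10 = 4096 the converted row size is 4096/32*34 = 4352 bytes per src1 row. When src1 already arrives in the vec_dot_type, the conversion is skipped here and, as the next hunk shows, the dot-product rows are read straight out of src1->data.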
@@ -11109,7 +10697,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
11109
10697
  const int ir0 = dr*ith;
11110
10698
  const int ir1 = MIN(ir0 + dr, nr);
11111
10699
 
11112
- void * wdata = params->wdata;
10700
+ void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
11113
10701
  const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11114
10702
 
11115
10703
  for (int ir = ir0; ir < ir1; ++ir) {
@@ -11133,7 +10721,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
11133
10721
  assert(ne00 % 32 == 0);
11134
10722
 
11135
10723
  for (int64_t ic = 0; ic < ne11; ++ic) {
11136
- vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
10724
+ vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
11137
10725
  }
11138
10726
  }
11139
10727
 
@@ -11150,40 +10738,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
11150
10738
  //}
11151
10739
  }
11152
10740
 
11153
- static void ggml_compute_forward_mul_mat(
11154
- const struct ggml_compute_params * params,
11155
- const struct ggml_tensor * src0,
11156
- const struct ggml_tensor * src1,
11157
- struct ggml_tensor * dst) {
11158
- switch (src0->type) {
11159
- case GGML_TYPE_Q4_0:
11160
- case GGML_TYPE_Q4_1:
11161
- case GGML_TYPE_Q5_0:
11162
- case GGML_TYPE_Q5_1:
11163
- case GGML_TYPE_Q8_0:
11164
- case GGML_TYPE_Q8_1:
11165
- case GGML_TYPE_Q2_K:
11166
- case GGML_TYPE_Q3_K:
11167
- case GGML_TYPE_Q4_K:
11168
- case GGML_TYPE_Q5_K:
11169
- case GGML_TYPE_Q6_K:
11170
- {
11171
- ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
11172
- } break;
11173
- case GGML_TYPE_F16:
11174
- {
11175
- ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst);
11176
- } break;
11177
- case GGML_TYPE_F32:
11178
- {
11179
- ggml_compute_forward_mul_mat_f32(params, src0, src1, dst);
11180
- } break;
11181
- default:
11182
- {
11183
- GGML_ASSERT(false);
11184
- } break;
11185
- }
11186
- }
11187
10741
 
11188
10742
  // ggml_compute_forward_out_prod
11189
10743
 
@@ -11196,35 +10750,7 @@ static void ggml_compute_forward_out_prod_f32(
11196
10750
  int64_t t0 = ggml_perf_time_us();
11197
10751
  UNUSED(t0);
11198
10752
 
11199
- const int64_t ne00 = src0->ne[0];
11200
- const int64_t ne01 = src0->ne[1];
11201
- const int64_t ne02 = src0->ne[2];
11202
- const int64_t ne03 = src0->ne[3];
11203
-
11204
- const int64_t ne10 = src1->ne[0];
11205
- //const int64_t ne11 = src1->ne[1];
11206
- const int64_t ne12 = src1->ne[2];
11207
- const int64_t ne13 = src1->ne[3];
11208
-
11209
- const int64_t ne0 = dst->ne[0];
11210
- const int64_t ne1 = dst->ne[1];
11211
- const int64_t ne2 = dst->ne[2];
11212
- const int64_t ne3 = dst->ne[3];
11213
-
11214
- const int nb00 = src0->nb[0];
11215
- const int nb01 = src0->nb[1];
11216
- const int nb02 = src0->nb[2];
11217
- const int nb03 = src0->nb[3];
11218
-
11219
- const int nb10 = src1->nb[0];
11220
- const int nb11 = src1->nb[1];
11221
- const int nb12 = src1->nb[2];
11222
- const int nb13 = src1->nb[3];
11223
-
11224
- const int nb0 = dst->nb[0];
11225
- const int nb1 = dst->nb[1];
11226
- const int nb2 = dst->nb[2];
11227
- const int nb3 = dst->nb[3];
10753
+ GGML_TENSOR_BINARY_OP_LOCALS;
11228
10754
 
11229
10755
  const int ith = params->ith;
11230
10756
  const int nth = params->nth;
@@ -11459,15 +10985,8 @@ static void ggml_compute_forward_set_f32(
11459
10985
  const int nr = ggml_nrows(src1);
11460
10986
  const int nc = src1->ne[0];
11461
10987
 
11462
- const int64_t ne10 = src1->ne[0];
11463
- const int64_t ne11 = src1->ne[1];
11464
- const int64_t ne12 = src1->ne[2];
11465
- const int64_t ne13 = src1->ne[3];
11466
-
11467
- const size_t nb10 = src1->nb[0];
11468
- const size_t nb11 = src1->nb[1];
11469
- const size_t nb12 = src1->nb[2];
11470
- const size_t nb13 = src1->nb[3];
10988
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
10989
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
11471
10990
 
11472
10991
  // src0 and dst as viewed during set
11473
10992
  const size_t nb0 = ggml_element_size(src0);
@@ -11608,7 +11127,7 @@ static void ggml_compute_forward_get_rows_q(
11608
11127
  const int nc = src0->ne[0];
11609
11128
  const int nr = ggml_nelements(src1);
11610
11129
  const enum ggml_type type = src0->type;
11611
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
11130
+ ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
11612
11131
 
11613
11132
  assert( dst->ne[0] == nc);
11614
11133
  assert( dst->ne[1] == nr);
@@ -11858,29 +11377,14 @@ static void ggml_compute_forward_diag_f32(
11858
11377
 
11859
11378
  // TODO: handle transposed/permuted matrices
11860
11379
 
11861
- const int ne00 = src0->ne[0];
11862
- const int ne01 = src0->ne[1];
11863
- const int ne02 = src0->ne[2];
11864
- const int ne03 = src0->ne[3];
11865
- const int ne0 = dst->ne[0];
11866
- const int ne1 = dst->ne[1];
11867
- const int ne2 = dst->ne[2];
11868
- const int ne3 = dst->ne[3];
11380
+ GGML_TENSOR_UNARY_OP_LOCALS;
11381
+
11869
11382
  GGML_ASSERT(ne00 == ne0);
11870
11383
  GGML_ASSERT(ne00 == ne1);
11871
11384
  GGML_ASSERT(ne01 == 1);
11872
11385
  GGML_ASSERT(ne02 == ne2);
11873
11386
  GGML_ASSERT(ne03 == ne3);
11874
11387
 
11875
- const int nb00 = src0->nb[0];
11876
- //const int nb01 = src0->nb[1];
11877
- const int nb02 = src0->nb[2];
11878
- const int nb03 = src0->nb[3];
11879
- const int nb0 = dst->nb[0];
11880
- const int nb1 = dst->nb[1];
11881
- const int nb2 = dst->nb[2];
11882
- const int nb3 = dst->nb[3];
11883
-
11884
11388
  GGML_ASSERT(nb00 == sizeof(float));
11885
11389
  GGML_ASSERT(nb0 == sizeof(float));
11886
11390
 
@@ -12457,20 +11961,7 @@ static void ggml_compute_forward_rope_f32(
12457
11961
 
12458
11962
  assert(n_past >= 0);
12459
11963
 
12460
- const size_t nb00 = src0->nb[0];
12461
- const size_t nb01 = src0->nb[1];
12462
- const size_t nb02 = src0->nb[2];
12463
- const size_t nb03 = src0->nb[3];
12464
-
12465
- const int64_t ne0 = dst->ne[0];
12466
- const int64_t ne1 = dst->ne[1];
12467
- const int64_t ne2 = dst->ne[2];
12468
- const int64_t ne3 = dst->ne[3];
12469
-
12470
- const size_t nb0 = dst->nb[0];
12471
- const size_t nb1 = dst->nb[1];
12472
- const size_t nb2 = dst->nb[2];
12473
- const size_t nb3 = dst->nb[3];
11964
+ GGML_TENSOR_UNARY_OP_LOCALS;
12474
11965
 
12475
11966
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12476
11967
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12597,20 +12088,7 @@ static void ggml_compute_forward_rope_f16(
12597
12088
 
12598
12089
  assert(n_past >= 0);
12599
12090
 
12600
- const size_t nb00 = src0->nb[0];
12601
- const size_t nb01 = src0->nb[1];
12602
- const size_t nb02 = src0->nb[2];
12603
- const size_t nb03 = src0->nb[3];
12604
-
12605
- const int64_t ne0 = dst->ne[0];
12606
- const int64_t ne1 = dst->ne[1];
12607
- const int64_t ne2 = dst->ne[2];
12608
- const int64_t ne3 = dst->ne[3];
12609
-
12610
- const size_t nb0 = dst->nb[0];
12611
- const size_t nb1 = dst->nb[1];
12612
- const size_t nb2 = dst->nb[2];
12613
- const size_t nb3 = dst->nb[3];
12091
+ GGML_TENSOR_UNARY_OP_LOCALS;
12614
12092
 
12615
12093
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12616
12094
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12763,21 +12241,7 @@ static void ggml_compute_forward_rope_back_f32(
12763
12241
 
12764
12242
  assert(n_past >= 0);
12765
12243
 
12766
- const size_t nb00 = src0->nb[0];
12767
- const size_t nb01 = src0->nb[1];
12768
- const size_t nb02 = src0->nb[2];
12769
- const size_t nb03 = src0->nb[3];
12770
-
12771
- const int64_t ne0 = dst->ne[0];
12772
- const int64_t ne1 = dst->ne[1];
12773
- const int64_t ne2 = dst->ne[2];
12774
- const int64_t ne3 = dst->ne[3];
12775
-
12776
- const size_t nb0 = dst->nb[0];
12777
- const size_t nb1 = dst->nb[1];
12778
- const size_t nb2 = dst->nb[2];
12779
- const size_t nb3 = dst->nb[3];
12780
-
12244
+ GGML_TENSOR_UNARY_OP_LOCALS;
12781
12245
 
12782
12246
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12783
12247
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12876,21 +12340,7 @@ static void ggml_compute_forward_rope_back_f16(
12876
12340
 
12877
12341
  assert(n_past >= 0);
12878
12342
 
12879
- const size_t nb00 = src0->nb[0];
12880
- const size_t nb01 = src0->nb[1];
12881
- const size_t nb02 = src0->nb[2];
12882
- const size_t nb03 = src0->nb[3];
12883
-
12884
- const int64_t ne0 = dst->ne[0];
12885
- const int64_t ne1 = dst->ne[1];
12886
- const int64_t ne2 = dst->ne[2];
12887
- const int64_t ne3 = dst->ne[3];
12888
-
12889
- const size_t nb0 = dst->nb[0];
12890
- const size_t nb1 = dst->nb[1];
12891
- const size_t nb2 = dst->nb[2];
12892
- const size_t nb3 = dst->nb[3];
12893
-
12343
+ GGML_TENSOR_UNARY_OP_LOCALS;
12894
12344
 
12895
12345
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12896
12346
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12988,7 +12438,7 @@ static void ggml_compute_forward_rope_back(
12988
12438
  }
12989
12439
  }
12990
12440
 
12991
- // ggml_compute_forward_conv_1d_s1_ph
12441
+ // ggml_compute_forward_conv_1d
12992
12442
 
12993
12443
  static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
12994
12444
  const struct ggml_compute_params * params,
@@ -13002,36 +12452,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
13002
12452
  int64_t t0 = ggml_perf_time_us();
13003
12453
  UNUSED(t0);
13004
12454
 
13005
- const int64_t ne00 = src0->ne[0];
13006
- const int64_t ne01 = src0->ne[1];
13007
- const int64_t ne02 = src0->ne[2];
13008
- //const int64_t ne03 = src0->ne[3];
13009
-
13010
- const int64_t ne10 = src1->ne[0];
13011
- const int64_t ne11 = src1->ne[1];
13012
- //const int64_t ne12 = src1->ne[2];
13013
- //const int64_t ne13 = src1->ne[3];
13014
-
13015
- //const int64_t ne0 = dst->ne[0];
13016
- //const int64_t ne1 = dst->ne[1];
13017
- //const int64_t ne2 = dst->ne[2];
13018
- //const int64_t ne3 = dst->ne[3];
13019
- //const int64_t ne = ne0*ne1*ne2*ne3;
13020
-
13021
- const int nb00 = src0->nb[0];
13022
- const int nb01 = src0->nb[1];
13023
- const int nb02 = src0->nb[2];
13024
- //const int nb03 = src0->nb[3];
13025
-
13026
- const int nb10 = src1->nb[0];
13027
- const int nb11 = src1->nb[1];
13028
- //const int nb12 = src1->nb[2];
13029
- //const int nb13 = src1->nb[3];
13030
-
13031
- //const int nb0 = dst->nb[0];
13032
- const int nb1 = dst->nb[1];
13033
- //const int nb2 = dst->nb[2];
13034
- //const int nb3 = dst->nb[3];
12455
+ GGML_TENSOR_BINARY_OP_LOCALS;
13035
12456
 
13036
12457
  const int ith = params->ith;
13037
12458
  const int nth = params->nth;
@@ -13122,36 +12543,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32(
13122
12543
  int64_t t0 = ggml_perf_time_us();
13123
12544
  UNUSED(t0);
13124
12545
 
13125
- const int64_t ne00 = src0->ne[0];
13126
- const int64_t ne01 = src0->ne[1];
13127
- const int64_t ne02 = src0->ne[2];
13128
- //const int64_t ne03 = src0->ne[3];
13129
-
13130
- const int64_t ne10 = src1->ne[0];
13131
- const int64_t ne11 = src1->ne[1];
13132
- //const int64_t ne12 = src1->ne[2];
13133
- //const int64_t ne13 = src1->ne[3];
13134
-
13135
- //const int64_t ne0 = dst->ne[0];
13136
- //const int64_t ne1 = dst->ne[1];
13137
- //const int64_t ne2 = dst->ne[2];
13138
- //const int64_t ne3 = dst->ne[3];
13139
- //const int64_t ne = ne0*ne1*ne2*ne3;
13140
-
13141
- const int nb00 = src0->nb[0];
13142
- const int nb01 = src0->nb[1];
13143
- const int nb02 = src0->nb[2];
13144
- //const int nb03 = src0->nb[3];
13145
-
13146
- const int nb10 = src1->nb[0];
13147
- const int nb11 = src1->nb[1];
13148
- //const int nb12 = src1->nb[2];
13149
- //const int nb13 = src1->nb[3];
13150
-
13151
- //const int nb0 = dst->nb[0];
13152
- const int nb1 = dst->nb[1];
13153
- //const int nb2 = dst->nb[2];
13154
- //const int nb3 = dst->nb[3];
12546
+ GGML_TENSOR_BINARY_OP_LOCALS;
13155
12547
 
13156
12548
  const int ith = params->ith;
13157
12549
  const int nth = params->nth;
@@ -13251,8 +12643,6 @@ static void ggml_compute_forward_conv_1d_s1_ph(
13251
12643
  }
13252
12644
  }
13253
12645
 
13254
- // ggml_compute_forward_conv_1d_s2_ph
13255
-
13256
12646
  static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
13257
12647
  const struct ggml_compute_params * params,
13258
12648
  const struct ggml_tensor * src0,
@@ -13265,36 +12655,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
13265
12655
  int64_t t0 = ggml_perf_time_us();
13266
12656
  UNUSED(t0);
13267
12657
 
13268
- const int64_t ne00 = src0->ne[0];
13269
- const int64_t ne01 = src0->ne[1];
13270
- const int64_t ne02 = src0->ne[2];
13271
- //const int64_t ne03 = src0->ne[3];
13272
-
13273
- const int64_t ne10 = src1->ne[0];
13274
- const int64_t ne11 = src1->ne[1];
13275
- //const int64_t ne12 = src1->ne[2];
13276
- //const int64_t ne13 = src1->ne[3];
13277
-
13278
- //const int64_t ne0 = dst->ne[0];
13279
- //const int64_t ne1 = dst->ne[1];
13280
- //const int64_t ne2 = dst->ne[2];
13281
- //const int64_t ne3 = dst->ne[3];
13282
- //const int64_t ne = ne0*ne1*ne2*ne3;
13283
-
13284
- const int nb00 = src0->nb[0];
13285
- const int nb01 = src0->nb[1];
13286
- const int nb02 = src0->nb[2];
13287
- //const int nb03 = src0->nb[3];
13288
-
13289
- const int nb10 = src1->nb[0];
13290
- const int nb11 = src1->nb[1];
13291
- //const int nb12 = src1->nb[2];
13292
- //const int nb13 = src1->nb[3];
13293
-
13294
- //const int nb0 = dst->nb[0];
13295
- const int nb1 = dst->nb[1];
13296
- //const int nb2 = dst->nb[2];
13297
- //const int nb3 = dst->nb[3];
12658
+ GGML_TENSOR_BINARY_OP_LOCALS;
13298
12659
 
13299
12660
  const int ith = params->ith;
13300
12661
  const int nth = params->nth;
@@ -13385,36 +12746,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32(
13385
12746
  int64_t t0 = ggml_perf_time_us();
13386
12747
  UNUSED(t0);
13387
12748
 
13388
- const int64_t ne00 = src0->ne[0];
13389
- const int64_t ne01 = src0->ne[1];
13390
- const int64_t ne02 = src0->ne[2];
13391
- //const int64_t ne03 = src0->ne[3];
13392
-
13393
- const int64_t ne10 = src1->ne[0];
13394
- const int64_t ne11 = src1->ne[1];
13395
- //const int64_t ne12 = src1->ne[2];
13396
- //const int64_t ne13 = src1->ne[3];
13397
-
13398
- //const int64_t ne0 = dst->ne[0];
13399
- //const int64_t ne1 = dst->ne[1];
13400
- //const int64_t ne2 = dst->ne[2];
13401
- //const int64_t ne3 = dst->ne[3];
13402
- //const int64_t ne = ne0*ne1*ne2*ne3;
13403
-
13404
- const int nb00 = src0->nb[0];
13405
- const int nb01 = src0->nb[1];
13406
- const int nb02 = src0->nb[2];
13407
- //const int nb03 = src0->nb[3];
13408
-
13409
- const int nb10 = src1->nb[0];
13410
- const int nb11 = src1->nb[1];
13411
- //const int nb12 = src1->nb[2];
13412
- //const int nb13 = src1->nb[3];
13413
-
13414
- //const int nb0 = dst->nb[0];
13415
- const int nb1 = dst->nb[1];
13416
- //const int nb2 = dst->nb[2];
13417
- //const int nb3 = dst->nb[3];
12749
+ GGML_TENSOR_BINARY_OP_LOCALS;
13418
12750
 
13419
12751
  const int ith = params->ith;
13420
12752
  const int nth = params->nth;
@@ -13514,6 +12846,28 @@ static void ggml_compute_forward_conv_1d_s2_ph(
13514
12846
  }
13515
12847
  }
13516
12848
 
12849
+ // ggml_compute_forward_conv_1d
12850
+
12851
+ static void ggml_compute_forward_conv_1d(
12852
+ const struct ggml_compute_params * params,
12853
+ const struct ggml_tensor * src0,
12854
+ const struct ggml_tensor * src1,
12855
+ const struct ggml_tensor * opt0,
12856
+ struct ggml_tensor * dst) {
12857
+ const int32_t s0 = ((const int32_t*)(opt0->data))[0];
12858
+ const int32_t p0 = ((const int32_t*)(opt0->data))[1];
12859
+ const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12860
+ GGML_ASSERT(d0 == 1); // dilation not supported
12861
+ GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
12862
+ if (s0 == 1) {
12863
+ ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst);
12864
+ } else if (s0 == 2) {
12865
+ ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst);
12866
+ } else {
12867
+ GGML_ASSERT(false); // only stride 1 and 2 supported
12868
+ };
12869
+ }
12870
+
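The new dispatcher above reads the 1-d convolution parameters from an extra int32 tensor, opt0, instead of baking them into separate ops. A minimal sketch of how a caller could pack those parameters, assuming a ggml_context * ctx and a kernel tensor playing the role of src0 (the actual graph-building helper is not part of this diff):

    // Hypothetical packing of the parameters that ggml_compute_forward_conv_1d reads back out.
    struct ggml_tensor * opt0 = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
    ((int32_t *) opt0->data)[0] = 1;               // s0: stride, only 1 or 2 are handled
    ((int32_t *) opt0->data)[1] = kernel->ne[0]/2; // p0: half padding, as the assert requires
    ((int32_t *) opt0->data)[2] = 1;               // d0: dilation, must be 1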
13517
12871
  // ggml_compute_forward_conv_2d_sk_p0
13518
12872
 
13519
12873
  static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
@@ -13528,36 +12882,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13528
12882
  int64_t t0 = ggml_perf_time_us();
13529
12883
  UNUSED(t0);
13530
12884
 
13531
- const int ne00 = src0->ne[0];
13532
- const int ne01 = src0->ne[1];
13533
- const int ne02 = src0->ne[2];
13534
- //const int ne03 = src0->ne[3];
13535
-
13536
- const int ne10 = src1->ne[0];
13537
- //const int ne11 = src1->ne[1];
13538
- const int ne12 = src1->ne[2];
13539
- //const int ne13 = src1->ne[3];
13540
-
13541
- const int ne0 = dst->ne[0];
13542
- const int ne1 = dst->ne[1];
13543
- const int ne2 = dst->ne[2];
13544
- //const int ne3 = dst->ne[3];
13545
- //const int ne = ne0*ne1*ne2*ne3;
13546
-
13547
- const int nb00 = src0->nb[0];
13548
- //const int nb01 = src0->nb[1];
13549
- //const int nb02 = src0->nb[2];
13550
- const int nb03 = src0->nb[3];
13551
-
13552
- const int nb10 = src1->nb[0];
13553
- //const int nb11 = src1->nb[1];
13554
- const int nb12 = src1->nb[2];
13555
- //const int nb13 = src1->nb[3];
13556
-
13557
- //const int nb0 = dst->nb[0];
13558
- //const int nb1 = dst->nb[1];
13559
- const int nb2 = dst->nb[2];
13560
- //const int nb3 = dst->nb[3];
12885
+ GGML_TENSOR_BINARY_OP_LOCALS;
13561
12886
 
13562
12887
  const int ith = params->ith;
13563
12888
  const int nth = params->nth;
@@ -13650,6 +12975,34 @@ static void ggml_compute_forward_conv_2d_sk_p0(
13650
12975
  }
13651
12976
  }
13652
12977
 
12978
+ // ggml_compute_forward_conv_2d
12979
+
12980
+ static void ggml_compute_forward_conv_2d(
12981
+ const struct ggml_compute_params* params,
12982
+ const struct ggml_tensor* src0,
12983
+ const struct ggml_tensor* src1,
12984
+ const struct ggml_tensor* opt0,
12985
+ struct ggml_tensor* dst) {
12986
+ const int32_t s0 = ((const int32_t*)(opt0->data))[0];
12987
+ const int32_t s1 = ((const int32_t*)(opt0->data))[1];
12988
+ const int32_t p0 = ((const int32_t*)(opt0->data))[2];
12989
+ const int32_t p1 = ((const int32_t*)(opt0->data))[3];
12990
+ const int32_t d0 = ((const int32_t*)(opt0->data))[4];
12991
+ const int32_t d1 = ((const int32_t*)(opt0->data))[5];
12992
+ GGML_ASSERT(d0 == 1); // dilation not supported
12993
+ GGML_ASSERT(d1 == 1);
12994
+ GGML_ASSERT(p0 == 0); // padding not supported
12995
+ GGML_ASSERT(p1 == 0);
12996
+
12997
+ if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
12998
+ ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
12999
+ }
13000
+ else {
13001
+ GGML_ASSERT(false); // only stride equal to kernel size is supported
13002
+ };
13003
+ }
13004
+
13005
+
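The 2-d dispatcher follows the same opt0 convention with six int32 parameters (s0, s1, p0, p1, d0, d1) and currently accepts only the stride-equals-kernel-size case, i.e. non-overlapping patches with no padding or dilation. For example, a 16x16 kernel with s0 = s1 = 16 over a 224x224 input yields a 224/16 x 224/16 = 14x14 grid of output positions; any other stride trips the GGML_ASSERT(false) branch.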
13653
13006
  // ggml_compute_forward_flash_attn
13654
13007
 
13655
13008
  static void ggml_compute_forward_flash_attn_f32(
@@ -13662,45 +13015,14 @@ static void ggml_compute_forward_flash_attn_f32(
13662
13015
  int64_t t0 = ggml_perf_time_us();
13663
13016
  UNUSED(t0);
13664
13017
 
13665
- const int64_t neq0 = q->ne[0];
13666
- const int64_t neq1 = q->ne[1];
13667
- const int64_t neq2 = q->ne[2];
13668
- const int64_t neq3 = q->ne[3];
13669
-
13670
- const int64_t nek0 = k->ne[0];
13671
- const int64_t nek1 = k->ne[1];
13672
- //const int64_t nek2 = k->ne[2];
13673
- //const int64_t nek3 = k->ne[3];
13674
-
13675
- //const int64_t nev0 = v->ne[0];
13676
- const int64_t nev1 = v->ne[1];
13677
- //const int64_t nev2 = v->ne[2];
13678
- //const int64_t nev3 = v->ne[3];
13679
-
13680
- const int64_t ne0 = dst->ne[0];
13681
- const int64_t ne1 = dst->ne[1];
13682
- //const int64_t ne2 = dst->ne[2];
13683
- //const int64_t ne3 = dst->ne[3];
13684
-
13685
- const int nbk0 = k->nb[0];
13686
- const int nbk1 = k->nb[1];
13687
- const int nbk2 = k->nb[2];
13688
- const int nbk3 = k->nb[3];
13689
-
13690
- const int nbq0 = q->nb[0];
13691
- const int nbq1 = q->nb[1];
13692
- const int nbq2 = q->nb[2];
13693
- const int nbq3 = q->nb[3];
13694
-
13695
- const int nbv0 = v->nb[0];
13696
- const int nbv1 = v->nb[1];
13697
- const int nbv2 = v->nb[2];
13698
- const int nbv3 = v->nb[3];
13699
-
13700
- const int nb0 = dst->nb[0];
13701
- const int nb1 = dst->nb[1];
13702
- const int nb2 = dst->nb[2];
13703
- const int nb3 = dst->nb[3];
13018
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
13019
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
13020
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
13021
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
13022
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
13023
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
13024
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13025
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
13704
13026
 
13705
13027
  const int ith = params->ith;
13706
13028
  const int nth = params->nth;
@@ -13871,45 +13193,14 @@ static void ggml_compute_forward_flash_attn_f16(
13871
13193
  int64_t t0 = ggml_perf_time_us();
13872
13194
  UNUSED(t0);
13873
13195
 
13874
- const int64_t neq0 = q->ne[0];
13875
- const int64_t neq1 = q->ne[1];
13876
- const int64_t neq2 = q->ne[2];
13877
- const int64_t neq3 = q->ne[3];
13878
-
13879
- const int64_t nek0 = k->ne[0];
13880
- const int64_t nek1 = k->ne[1];
13881
- //const int64_t nek2 = k->ne[2];
13882
- //const int64_t nek3 = k->ne[3];
13883
-
13884
- //const int64_t nev0 = v->ne[0];
13885
- const int64_t nev1 = v->ne[1];
13886
- //const int64_t nev2 = v->ne[2];
13887
- //const int64_t nev3 = v->ne[3];
13888
-
13889
- const int64_t ne0 = dst->ne[0];
13890
- const int64_t ne1 = dst->ne[1];
13891
- //const int64_t ne2 = dst->ne[2];
13892
- //const int64_t ne3 = dst->ne[3];
13893
-
13894
- const int nbk0 = k->nb[0];
13895
- const int nbk1 = k->nb[1];
13896
- const int nbk2 = k->nb[2];
13897
- const int nbk3 = k->nb[3];
13898
-
13899
- const int nbq0 = q->nb[0];
13900
- const int nbq1 = q->nb[1];
13901
- const int nbq2 = q->nb[2];
13902
- const int nbq3 = q->nb[3];
13903
-
13904
- const int nbv0 = v->nb[0];
13905
- const int nbv1 = v->nb[1];
13906
- const int nbv2 = v->nb[2];
13907
- const int nbv3 = v->nb[3];
13908
-
13909
- const int nb0 = dst->nb[0];
13910
- const int nb1 = dst->nb[1];
13911
- const int nb2 = dst->nb[2];
13912
- const int nb3 = dst->nb[3];
13196
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
13197
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
13198
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
13199
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
13200
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
13201
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
13202
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13203
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
13913
13204
 
13914
13205
  const int ith = params->ith;
13915
13206
  const int nth = params->nth;
@@ -14143,65 +13434,18 @@ static void ggml_compute_forward_flash_ff_f16(
14143
13434
  int64_t t0 = ggml_perf_time_us();
14144
13435
  UNUSED(t0);
14145
13436
 
14146
- const int64_t nea0 = a->ne[0];
14147
- const int64_t nea1 = a->ne[1];
14148
- const int64_t nea2 = a->ne[2];
14149
- const int64_t nea3 = a->ne[3];
14150
-
14151
- const int64_t neb00 = b0->ne[0];
14152
- const int64_t neb01 = b0->ne[1];
14153
- //const int64_t neb02 = b0->ne[2];
14154
- //const int64_t neb03 = b0->ne[3];
14155
-
14156
- const int64_t neb10 = b1->ne[0];
14157
- const int64_t neb11 = b1->ne[1];
14158
- //const int64_t neb12 = b1->ne[2];
14159
- //const int64_t neb13 = b1->ne[3];
14160
-
14161
- const int64_t nec00 = c0->ne[0];
14162
- const int64_t nec01 = c0->ne[1];
14163
- //const int64_t nec02 = c0->ne[2];
14164
- //const int64_t nec03 = c0->ne[3];
14165
-
14166
- const int64_t nec10 = c1->ne[0];
14167
- const int64_t nec11 = c1->ne[1];
14168
- //const int64_t nec12 = c1->ne[2];
14169
- //const int64_t nec13 = c1->ne[3];
14170
-
14171
- const int64_t ne0 = dst->ne[0];
14172
- const int64_t ne1 = dst->ne[1];
14173
- const int64_t ne2 = dst->ne[2];
14174
- //const int64_t ne3 = dst->ne[3];
14175
-
14176
- const int nba0 = a->nb[0];
14177
- const int nba1 = a->nb[1];
14178
- const int nba2 = a->nb[2];
14179
- const int nba3 = a->nb[3];
14180
-
14181
- const int nbb00 = b0->nb[0];
14182
- const int nbb01 = b0->nb[1];
14183
- const int nbb02 = b0->nb[2];
14184
- const int nbb03 = b0->nb[3];
14185
-
14186
- const int nbb10 = b1->nb[0];
14187
- //const int nbb11 = b1->nb[1];
14188
- //const int nbb12 = b1->nb[2];
14189
- //const int nbb13 = b1->nb[3];
14190
-
14191
- const int nbc00 = c0->nb[0];
14192
- const int nbc01 = c0->nb[1];
14193
- const int nbc02 = c0->nb[2];
14194
- const int nbc03 = c0->nb[3];
14195
-
14196
- const int nbc10 = c1->nb[0];
14197
- //const int nbc11 = c1->nb[1];
14198
- //const int nbc12 = c1->nb[2];
14199
- //const int nbc13 = c1->nb[3];
14200
-
14201
- const int nb0 = dst->nb[0];
14202
- const int nb1 = dst->nb[1];
14203
- const int nb2 = dst->nb[2];
14204
- const int nb3 = dst->nb[3];
13437
+ GGML_TENSOR_LOCALS(int64_t, nea, a, ne);
13438
+ GGML_TENSOR_LOCALS(size_t, nba, a, nb);
13439
+ GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne);
13440
+ GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb);
13441
+ GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne);
13442
+ GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb);
13443
+ GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne);
13444
+ GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb);
13445
+ GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne);
13446
+ GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb);
13447
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13448
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
14205
13449
 
14206
13450
  const int ith = params->ith;
14207
13451
  const int nth = params->nth;
@@ -14349,55 +13593,16 @@ static void ggml_compute_forward_flash_attn_back_f32(
14349
13593
  int64_t t0 = ggml_perf_time_us();
14350
13594
  UNUSED(t0);
14351
13595
 
14352
- const int64_t neq0 = q->ne[0];
14353
- const int64_t neq1 = q->ne[1];
14354
- const int64_t neq2 = q->ne[2];
14355
- const int64_t neq3 = q->ne[3];
14356
-
14357
- const int64_t nek0 = k->ne[0];
14358
- const int64_t nek1 = k->ne[1];
14359
- //const int64_t nek2 = k->ne[2];
14360
- //const int64_t nek3 = k->ne[3];
14361
-
14362
- const int64_t nev0 = v->ne[0];
14363
- const int64_t nev1 = v->ne[1];
14364
- //const int64_t nev2 = v->ne[2];
14365
- //const int64_t nev3 = v->ne[3];
14366
-
14367
- const int64_t ned0 = d->ne[0];
14368
- const int64_t ned1 = d->ne[1];
14369
- //const int64_t ned2 = d->ne[2];
14370
- //const int64_t ned3 = d->ne[3];
14371
-
14372
- const int64_t ne0 = dst->ne[0];
14373
- const int64_t ne1 = dst->ne[1];
14374
- const int64_t ne2 = dst->ne[2];
14375
- const int64_t ne3 = dst->ne[3];
14376
-
14377
- const int nbk0 = k->nb[0];
14378
- const int nbk1 = k->nb[1];
14379
- const int nbk2 = k->nb[2];
14380
- const int nbk3 = k->nb[3];
14381
-
14382
- const int nbq0 = q->nb[0];
14383
- const int nbq1 = q->nb[1];
14384
- const int nbq2 = q->nb[2];
14385
- const int nbq3 = q->nb[3];
14386
-
14387
- const int nbv0 = v->nb[0];
14388
- const int nbv1 = v->nb[1];
14389
- const int nbv2 = v->nb[2];
14390
- const int nbv3 = v->nb[3];
14391
-
14392
- const int nbd0 = d->nb[0];
14393
- const int nbd1 = d->nb[1];
14394
- const int nbd2 = d->nb[2];
14395
- const int nbd3 = d->nb[3];
14396
-
14397
- const int nb0 = dst->nb[0];
14398
- const int nb1 = dst->nb[1];
14399
- const int nb2 = dst->nb[2];
14400
- const int nb3 = dst->nb[3];
13596
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
13597
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
13598
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
13599
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
13600
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
13601
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
13602
+ GGML_TENSOR_LOCALS(int64_t, ned, d, ne);
13603
+ GGML_TENSOR_LOCALS(size_t, nbd, d, nb);
13604
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13605
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
14401
13606
 
14402
13607
  const int ith = params->ith;
14403
13608
  const int nth = params->nth;
@@ -14755,15 +13960,8 @@ static void ggml_compute_forward_win_part_f32(
14755
13960
  return;
14756
13961
  }
14757
13962
 
14758
- const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
14759
- const int64_t ne01 = src0->ne[1];
14760
- const int64_t ne02 = src0->ne[2];
14761
- const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
14762
-
14763
- const int64_t ne0 = dst->ne[0];
14764
- const int64_t ne1 = dst->ne[1];
14765
- const int64_t ne2 = dst->ne[2];
14766
- const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
13963
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
13964
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14767
13965
 
14768
13966
  const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14769
13967
  const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
@@ -14826,14 +14024,8 @@ static void ggml_compute_forward_win_unpart_f32(
14826
14024
  return;
14827
14025
  }
14828
14026
 
14829
- const int64_t ne00 = src0->ne[0];
14830
- const int64_t ne01 = src0->ne[1];
14831
- const int64_t ne02 = src0->ne[2];
14832
- //const int64_t ne03 = src0->ne[3];
14833
-
14834
- const int64_t ne0 = dst->ne[0];
14835
- const int64_t ne1 = dst->ne[1];
14836
- const int64_t ne2 = dst->ne[2];
14027
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14028
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14837
14029
 
14838
14030
  const int32_t w = ((const int32_t *)(opt0->data))[0];
14839
14031
 
@@ -15431,6 +14623,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15431
14623
  {
15432
14624
  ggml_compute_forward_mean(params, tensor->src0, tensor);
15433
14625
  } break;
14626
+ case GGML_OP_ARGMAX:
14627
+ {
14628
+ ggml_compute_forward_argmax(params, tensor->src0, tensor);
14629
+ } break;
15434
14630
  case GGML_OP_REPEAT:
15435
14631
  {
15436
14632
  ggml_compute_forward_repeat(params, tensor->src0, tensor);
@@ -15455,6 +14651,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15455
14651
  {
15456
14652
  ggml_compute_forward_step(params, tensor->src0, tensor);
15457
14653
  } break;
14654
+ case GGML_OP_TANH:
14655
+ {
14656
+ ggml_compute_forward_tanh(params, tensor->src0, tensor);
14657
+ } break;
14658
+ case GGML_OP_ELU:
14659
+ {
14660
+ ggml_compute_forward_elu(params, tensor->src0, tensor);
14661
+ } break;
15458
14662
  case GGML_OP_RELU:
15459
14663
  {
15460
14664
  ggml_compute_forward_relu(params, tensor->src0, tensor);
@@ -15571,17 +14775,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15571
14775
  {
15572
14776
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
15573
14777
  } break;
15574
- case GGML_OP_CONV_1D_S1_PH:
15575
- {
15576
- ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
15577
- } break;
15578
- case GGML_OP_CONV_1D_S2_PH:
14778
+ case GGML_OP_CONV_1D:
15579
14779
  {
15580
- ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
14780
+ ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
15581
14781
  } break;
15582
- case GGML_OP_CONV_2D_SK_P0:
14782
+ case GGML_OP_CONV_2D:
15583
14783
  {
15584
- ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
14784
+ ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
15585
14785
  } break;
15586
14786
  case GGML_OP_FLASH_ATTN:
15587
14787
  {
@@ -15830,6 +15030,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15830
15030
  }
15831
15031
  } break;
15832
15032
  case GGML_OP_MEAN:
15033
+ case GGML_OP_ARGMAX:
15833
15034
  {
15834
15035
  GGML_ASSERT(false); // TODO: implement
15835
15036
  } break;
@@ -15883,6 +15084,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15883
15084
  // noop
15884
15085
  }
15885
15086
  } break;
15087
+ case GGML_OP_TANH:
15088
+ {
15089
+ GGML_ASSERT(false); // TODO: not implemented
15090
+ } break;
15091
+ case GGML_OP_ELU:
15092
+ {
15093
+ GGML_ASSERT(false); // TODO: not implemented
15094
+ } break;
15886
15095
  case GGML_OP_RELU:
15887
15096
  {
15888
15097
  if (src0->grad) {
@@ -15902,14 +15111,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15902
15111
  {
15903
15112
  GGML_ASSERT(false); // TODO: not implemented
15904
15113
  } break;
15905
- case GGML_OP_ALIBI:
15906
- {
15907
- GGML_ASSERT(false); // TODO: not implemented
15908
- } break;
15909
- case GGML_OP_CLAMP:
15910
- {
15911
- GGML_ASSERT(false); // TODO: not implemented
15912
- } break;
15913
15114
  case GGML_OP_SILU:
15914
15115
  {
15915
15116
  // necessary for llama
@@ -16226,7 +15427,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16226
15427
  // necessary for llama
16227
15428
  if (src0->grad) {
16228
15429
  assert(src1->type == GGML_TYPE_I32);
16229
- assert(ggml_nelements(src1) == 3);
15430
+ assert(ggml_nelements(src1) == 4);
16230
15431
  const int n_past = ((int32_t *) src1->data)[0];
16231
15432
  const int n_dims = ((int32_t *) src1->data)[1];
16232
15433
  const int mode = ((int32_t *) src1->data)[2];
@@ -16266,15 +15467,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16266
15467
  // noop
16267
15468
  }
16268
15469
  } break;
16269
- case GGML_OP_CONV_1D_S1_PH:
15470
+ case GGML_OP_ALIBI:
15471
+ {
15472
+ GGML_ASSERT(false); // TODO: not implemented
15473
+ } break;
15474
+ case GGML_OP_CLAMP:
16270
15475
  {
16271
15476
  GGML_ASSERT(false); // TODO: not implemented
16272
15477
  } break;
16273
- case GGML_OP_CONV_1D_S2_PH:
15478
+ case GGML_OP_CONV_1D:
16274
15479
  {
16275
15480
  GGML_ASSERT(false); // TODO: not implemented
16276
15481
  } break;
16277
- case GGML_OP_CONV_2D_SK_P0:
15482
+ case GGML_OP_CONV_2D:
16278
15483
  {
16279
15484
  GGML_ASSERT(false); // TODO: not implemented
16280
15485
  } break;
@@ -16791,9 +15996,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16791
15996
  if (node_n != -1) {
16792
15997
  /* FINALIZE */
16793
15998
  struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16794
- params.nth = node->n_tasks;
16795
- ggml_compute_forward(&params, node);
16796
- ggml_graph_compute_perf_stats_node(node, state->shared);
15999
+ if (GGML_OP_HAS_FINALIZE[node->op]) {
16000
+ params.nth = node->n_tasks;
16001
+ ggml_compute_forward(&params, node);
16002
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16003
+ }
16797
16004
  }
16798
16005
 
16799
16006
  // distribute new work or execute it direct if 1T
@@ -16805,10 +16012,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16805
16012
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16806
16013
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
16807
16014
 
16015
+ params.nth = node->n_tasks;
16016
+
16808
16017
  /* INIT */
16809
- params.type = GGML_TASK_INIT;
16810
- params.nth = node->n_tasks;
16811
- ggml_compute_forward(&params, node);
16018
+ if (GGML_OP_HAS_INIT[node->op]) {
16019
+ params.type = GGML_TASK_INIT;
16020
+ ggml_compute_forward(&params, node);
16021
+ }
16812
16022
 
16813
16023
  if (node->n_tasks == 1) {
16814
16024
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
@@ -16816,9 +16026,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16816
16026
  params.type = GGML_TASK_COMPUTE;
16817
16027
  ggml_compute_forward(&params, node);
16818
16028
 
16819
- params.type = GGML_TASK_FINALIZE;
16820
- ggml_compute_forward(&params, node);
16821
- ggml_graph_compute_perf_stats_node(node, state->shared);
16029
+ if (GGML_OP_HAS_FINALIZE[node->op]) {
16030
+ params.type = GGML_TASK_FINALIZE;
16031
+ ggml_compute_forward(&params, node);
16032
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16033
+ }
16822
16034
  } else {
16823
16035
  break;
16824
16036
  }
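Both guards in this function consult new per-op lookup tables so that threads skip the INIT and FINALIZE passes for ops that have nothing to do in them, instead of calling ggml_compute_forward just to fall through. The tables themselves are defined outside the hunks shown here; a rough sketch of their likely shape:

    // Hypothetical shape of the per-op flags (actual contents live elsewhere in ggml.c):
    static const bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { /* true for ops with an INIT pass     */ };
    static const bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { /* true for ops with a FINALIZE pass  */ };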
@@ -16924,12 +16136,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16924
16136
  case GGML_OP_SUM:
16925
16137
  case GGML_OP_SUM_ROWS:
16926
16138
  case GGML_OP_MEAN:
16139
+ case GGML_OP_ARGMAX:
16927
16140
  case GGML_OP_REPEAT:
16928
16141
  case GGML_OP_REPEAT_BACK:
16929
16142
  case GGML_OP_ABS:
16930
16143
  case GGML_OP_SGN:
16931
16144
  case GGML_OP_NEG:
16932
16145
  case GGML_OP_STEP:
16146
+ case GGML_OP_TANH:
16147
+ case GGML_OP_ELU:
16933
16148
  case GGML_OP_RELU:
16934
16149
  {
16935
16150
  node->n_tasks = 1;
@@ -16958,6 +16173,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16958
16173
  //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
16959
16174
 
16960
16175
  size_t cur = 0;
16176
+ const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
16961
16177
 
16962
16178
  #if defined(GGML_USE_CUBLAS)
16963
16179
  if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
@@ -16973,39 +16189,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16973
16189
  }
16974
16190
  else
16975
16191
  #endif
16976
- if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
16977
16192
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16978
- if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16979
- node->n_tasks = 1; // TODO: this actually is doing nothing
16980
- // the threads are still spinning
16193
+ if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16194
+ node->n_tasks = 1; // TODO: this actually is doing nothing
16195
+ // the threads are still spinning
16196
+ if (node->src0->type != GGML_TYPE_F32) {
16981
16197
  // here we need memory just for single 2D matrix from src0
16982
16198
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
16983
- } else {
16984
- cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
16985
- }
16986
- #else
16987
- cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
16988
- #endif
16989
- } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
16990
- cur = 0;
16991
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16992
- if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16993
- node->n_tasks = 1;
16994
16199
  }
16200
+ } else
16995
16201
  #endif
16996
- } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
16997
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16998
- if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16999
- node->n_tasks = 1;
17000
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
17001
- } else
17002
- #endif
17003
- {
17004
- const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type;
17005
- cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q];
17006
- }
16202
+ if (node->src1->type != vec_dot_type) {
16203
+ cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type];
17007
16204
  } else {
17008
- GGML_ASSERT(false);
16205
+ cur = 0;
17009
16206
  }
17010
16207
 
17011
16208
  work_size = MAX(work_size, cur);
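The MUL_MAT scratch sizing now falls out of the same vec_dot_type rule as the kernel itself: if src1 has to be converted, the work buffer holds the whole converted src1, otherwise no scratch is needed. A rough worked example, again assuming 34-byte, 32-element Q8_0 blocks: a Q4_0 weight multiplied by an F32 src1 of 4096x512 elements needs 4096*512/32*34 bytes, roughly 2.1 MiB, while F32xF32 (or a src1 already supplied in the vec_dot_type) on this non-BLAS path needs none.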
@@ -17043,8 +16240,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
17043
16240
  {
17044
16241
  node->n_tasks = 1; //TODO
17045
16242
  } break;
17046
- case GGML_OP_CONV_1D_S1_PH:
17047
- case GGML_OP_CONV_1D_S2_PH:
16243
+ case GGML_OP_CONV_1D:
17048
16244
  {
17049
16245
  node->n_tasks = n_threads;
17050
16246
 
@@ -17073,7 +16269,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
17073
16269
 
17074
16270
  work_size = MAX(work_size, cur);
17075
16271
  } break;
17076
- case GGML_OP_CONV_2D_SK_P0:
16272
+ case GGML_OP_CONV_2D:
17077
16273
  {
17078
16274
  node->n_tasks = n_threads;
17079
16275
 
@@ -17435,13 +16631,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17435
16631
  fwrite(&nb, sizeof(uint64_t), 1, fout);
17436
16632
  }
17437
16633
 
17438
- // store the pointer address
17439
- {
17440
- const uint64_t ptr = (uint64_t) tensor->data;
17441
-
17442
- fwrite(&ptr, sizeof(uint64_t), 1, fout);
17443
- }
17444
-
17445
16634
  fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
17446
16635
 
17447
16636
  // dump the data
@@ -17475,13 +16664,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17475
16664
  fwrite(&nb, sizeof(uint64_t), 1, fout);
17476
16665
  }
17477
16666
 
17478
- // store the pointer address
17479
- {
17480
- const uint64_t ptr = (uint64_t) tensor->data;
17481
-
17482
- fwrite(&ptr, sizeof(uint64_t), 1, fout);
17483
- }
17484
-
17485
16667
  fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
17486
16668
 
17487
16669
  // output the op arguments
@@ -17666,8 +16848,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17666
16848
 
17667
16849
  tensor->op = (enum ggml_op) op;
17668
16850
 
17669
- uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
17670
-
17671
16851
  memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
17672
16852
 
17673
16853
  tensor->data = (void *) ptr;
@@ -17713,8 +16893,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17713
16893
  nb[j] = nb_cur;
17714
16894
  }
17715
16895
 
17716
- uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
17717
-
17718
16896
  const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
17719
16897
 
17720
16898
  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);