llama_cpp 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -220,9 +220,27 @@ inline static void* ggml_aligned_malloc(size_t size) {
  #define GGML_ALIGNED_FREE(ptr) free(ptr)
  #endif
 
- #define UNUSED(x) (void)(x)
+ #define UNUSED GGML_UNUSED
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
+ //
+ // tensor access macros
+ //
+
+ #define GGML_TENSOR_UNARY_OP_LOCALS \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
+
+ #define GGML_TENSOR_BINARY_OP_LOCALS \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
+
  #if defined(GGML_USE_ACCELERATE)
  #include <Accelerate/Accelerate.h>
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
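
Note: the new GGML_TENSOR_UNARY_OP_LOCALS / GGML_TENSOR_BINARY_OP_LOCALS macros replace the long runs of per-dimension ne/nb locals that the compute functions further down used to declare by hand. They are built from a GGML_TENSOR_LOCALS helper that lives in the ggml headers, outside this diff; a minimal sketch of the assumed expansion:

    /* Sketch only: the real GGML_TENSOR_LOCALS is defined in the ggml
     * headers, not in this diff. It stamps out one local per dimension. */
    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
        const type prefix##0 = (pointer)->array[0];          \
        const type prefix##1 = (pointer)->array[1];          \
        const type prefix##2 = (pointer)->array[2];          \
        const type prefix##3 = (pointer)->array[3];

Under that reading, GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) declares ne00..ne03 from src0->ne, so the unary wrapper yields ne00..ne03/nb00..nb03 for src0 and ne0..ne3/nb0..nb3 for dst, matching the names the hand-written declarations below use.
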
@@ -463,14 +481,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
  return GGML_FP32_TO_FP16(x);
  }
 
- void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
- for (size_t i = 0; i < n; i++) {
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
+ for (int i = 0; i < n; i++) {
  y[i] = GGML_FP16_TO_FP32(x[i]);
  }
  }
 
- void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
- size_t i = 0;
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
+ int i = 0;
  #if defined(__F16C__)
  for (; i + 7 < n; i += 8) {
  __m256 x_vec = _mm256_loadu_ps(x + i);
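
Note: the row converters switch their element count from size_t to int. This lines them up with the conversion function-pointer types used by the new type_traits table further down; the typedefs themselves are declared in the headers, not shown here, so the following is an assumed shape:

    /* Assumed typedefs from ggml.h at this revision (not part of this diff): */
    typedef void (*ggml_to_float_t)  (const void  * x, float * y, int n);
    typedef void (*ggml_from_float_t)(const float * x, void  * y, int n);

The signed int index still works with the 8-wide F16C loop in this hunk, since `i + 7 < n` bounds it safely.
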
@@ -1609,109 +1627,112 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
  }
  }
 
+ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
+ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
  static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
- static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
+ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+ [GGML_TYPE_F32] = {
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
+ .vec_dot_type = GGML_TYPE_F32,
+ },
+ [GGML_TYPE_F16] = {
+ .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
+ .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+ .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
+ .vec_dot_type = GGML_TYPE_F16,
+ },
  [GGML_TYPE_Q4_0] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0,
- .quantize_row_q = quantize_row_q4_0,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
- .quantize_row_q_dot = quantize_row_q8_0,
- .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
+ .to_float = (ggml_to_float_t) dequantize_row_q4_0,
+ .from_float = quantize_row_q4_0,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
+ .vec_dot = ggml_vec_dot_q4_0_q8_0,
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q4_1] = {
- .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1,
- .quantize_row_q = quantize_row_q4_1,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
- .quantize_row_q_dot = quantize_row_q8_1,
- .vec_dot_q = ggml_vec_dot_q4_1_q8_1,
+ .to_float = (ggml_to_float_t) dequantize_row_q4_1,
+ .from_float = quantize_row_q4_1,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
+ .vec_dot = ggml_vec_dot_q4_1_q8_1,
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  [GGML_TYPE_Q5_0] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0,
- .quantize_row_q = quantize_row_q5_0,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference,
- .quantize_row_q_dot = quantize_row_q8_0,
- .vec_dot_q = ggml_vec_dot_q5_0_q8_0,
+ .to_float = (ggml_to_float_t) dequantize_row_q5_0,
+ .from_float = quantize_row_q5_0,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
+ .vec_dot = ggml_vec_dot_q5_0_q8_0,
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q5_1] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1,
- .quantize_row_q = quantize_row_q5_1,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference,
- .quantize_row_q_dot = quantize_row_q8_1,
- .vec_dot_q = ggml_vec_dot_q5_1_q8_1,
+ .to_float = (ggml_to_float_t) dequantize_row_q5_1,
+ .from_float = quantize_row_q5_1,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
+ .vec_dot = ggml_vec_dot_q5_1_q8_1,
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  [GGML_TYPE_Q8_0] = {
- .dequantize_row_q = dequantize_row_q8_0,
- .quantize_row_q = quantize_row_q8_0,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference,
- .quantize_row_q_dot = quantize_row_q8_0,
- .vec_dot_q = ggml_vec_dot_q8_0_q8_0,
+ .to_float = dequantize_row_q8_0,
+ .from_float = quantize_row_q8_0,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
+ .vec_dot = ggml_vec_dot_q8_0_q8_0,
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q8_1] = {
- .dequantize_row_q = NULL, // TODO
- .quantize_row_q = quantize_row_q8_1,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference,
- .quantize_row_q_dot = quantize_row_q8_1,
- .vec_dot_q = NULL, // TODO
+ .from_float = quantize_row_q8_1,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  #ifdef GGML_USE_K_QUANTS
  [GGML_TYPE_Q2_K] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
- .quantize_row_q = quantize_row_q2_K,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
- .quantize_row_q_dot = quantize_row_q8_K,
- .vec_dot_q = ggml_vec_dot_q2_K_q8_K,
+ .to_float = (ggml_to_float_t) dequantize_row_q2_K,
+ .from_float = quantize_row_q2_K,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
+ .vec_dot = ggml_vec_dot_q2_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q3_K] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
- .quantize_row_q = quantize_row_q3_K,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
- .quantize_row_q_dot = quantize_row_q8_K,
- .vec_dot_q = ggml_vec_dot_q3_K_q8_K,
+ .to_float = (ggml_to_float_t) dequantize_row_q3_K,
+ .from_float = quantize_row_q3_K,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
+ .vec_dot = ggml_vec_dot_q3_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q4_K] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
- .quantize_row_q = quantize_row_q4_K,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
- .quantize_row_q_dot = quantize_row_q8_K,
- .vec_dot_q = ggml_vec_dot_q4_K_q8_K,
+ .to_float = (ggml_to_float_t) dequantize_row_q4_K,
+ .from_float = quantize_row_q4_K,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
+ .vec_dot = ggml_vec_dot_q4_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q5_K] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
- .quantize_row_q = quantize_row_q5_K,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
- .quantize_row_q_dot = quantize_row_q8_K,
- .vec_dot_q = ggml_vec_dot_q5_K_q8_K,
+ .to_float = (ggml_to_float_t) dequantize_row_q5_K,
+ .from_float = quantize_row_q5_K,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
+ .vec_dot = ggml_vec_dot_q5_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q6_K] = {
- .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
- .quantize_row_q = quantize_row_q6_K,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
- .quantize_row_q_dot = quantize_row_q8_K,
- .vec_dot_q = ggml_vec_dot_q6_K_q8_K,
+ .to_float = (ggml_to_float_t) dequantize_row_q6_K,
+ .from_float = quantize_row_q6_K,
+ .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
+ .vec_dot = ggml_vec_dot_q6_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
+ [GGML_TYPE_Q8_K] = {
+ .from_float = quantize_row_q8_K,
+ }
  #endif
  };
 
  // For internal test use
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
  GGML_ASSERT(i < GGML_TYPE_COUNT);
- return quantize_fns[i];
+ return type_traits[i];
  }
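
Note: renaming quantize_fns to type_traits is more than cosmetic: F32 and F16 now have entries too, so callers can look up row conversion and dot-product kernels uniformly for every tensor type instead of special-casing quantized ones. Designated initializers leave unset fields NULL, which is why Q8_1's explicit `NULL, // TODO` members could simply be dropped. The field names above imply roughly this struct; the actual definition is in ggml.h, outside this diff:

    /* Sketch of ggml_type_traits_t as implied by the initializers above: */
    typedef struct {
        ggml_to_float_t   to_float;             /* dequantize one row            */
        ggml_from_float_t from_float;           /* quantize one row (fast path)  */
        ggml_from_float_t from_float_reference; /* scalar reference quantizer    */
        ggml_vec_dot_t    vec_dot;              /* row dot-product kernel        */
        enum ggml_type    vec_dot_type;         /* type the second operand must
                                                   be converted to for vec_dot   */
    } ggml_type_traits_t;

A typical lookup through the renamed test hook:

    ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    /* tt.vec_dot_type == GGML_TYPE_Q8_0: quantize the activations to Q8_0,
       then call tt.vec_dot on the Q4_0 weights against them. */
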
1716
1737
 
1717
1738
 
@@ -2257,7 +2278,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
  inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
  inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
- inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
  #ifdef GGML_SIMD
  float sumf = 0.0f;
  const int np = (n & ~(GGML_F32_STEP - 1));
@@ -2294,7 +2315,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
  *s = sumf;
  }
 
- inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
  ggml_float sumf = 0.0;
 
  #if defined(GGML_SIMD)
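
Note: `inline` is dropped from ggml_vec_dot_f32/ggml_vec_dot_f16 presumably because the new forward declarations before the type_traits table (see the @@ -1609 hunk) now take these functions' addresses, and the declarations and definitions have to agree. The pattern in miniature, with hypothetical names:

    static void f(int n);                  /* forward declaration             */
    static void (*table[])(int) = { f };   /* address stored in a static table */
    static void f(int n) { (void) n; }     /* definition appears later         */
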
@@ -3447,6 +3468,8 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) {
  inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
  inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
  inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
+ inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
  static const float GELU_COEF_A = 0.044715f;
@@ -3598,6 +3621,16 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
  *s = 1.f/(*s);
  }
 
+ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
+ float max = -INFINITY;
+ int idx = 0;
+ for (int i = 0; i < n; ++i) {
+ max = MAX(max, x[i]);
+ if (max == x[i]) { idx = i; }
+ }
+ *s = idx;
+ }
+
  //
  // data types
  //
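
Note: ggml_vec_argmax_f32 updates idx whenever the running maximum equals the current element, so on ties it reports the last occurrence of the maximum. A small worked example:

    float v[4] = { 1.0f, 3.0f, 3.0f, 2.0f };
    int idx = -1;
    ggml_vec_argmax_f32(4, &idx, v);
    /* idx == 2: the second 3.0f, i.e. the last index holding the max */
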
@@ -3707,12 +3740,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "SUM",
  "SUM_ROWS",
  "MEAN",
+ "ARGMAX",
  "REPEAT",
  "REPEAT_BACK",
  "ABS",
  "SGN",
  "NEG",
  "STEP",
+ "TANH",
+ "ELU",
  "RELU",
  "GELU",
  "GELU_QUICK",
@@ -3744,9 +3780,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "ROPE_BACK",
  "ALIBI",
  "CLAMP",
- "CONV_1D_S1_PH",
- "CONV_1D_S2_PH",
- "CONV_2D_SK_P0",
+ "CONV_1D",
+ "CONV_2D",
 
  "FLASH_ATTN",
  "FLASH_FF",
@@ -3765,7 +3800,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };
 
- static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
+ static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
 
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -3783,12 +3818,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "Σx",
  "Σx_k",
  "Σx/n",
+ "argmax(x)",
  "repeat(x)",
  "repeat_back(x)",
  "abs(x)",
  "sgn(x)",
  "-x",
  "step(x)",
+ "tanh(x)",
+ "elu(x)",
  "relu(x)",
  "gelu(x)",
  "gelu_quick(x)",
@@ -3820,9 +3858,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "rope_back(x)",
  "alibi(x)",
  "clamp(x)",
- "conv_1d_s1_ph(x)",
- "conv_1d_s2_ph(x)",
- "conv_2d_sk_p0(x)",
+ "conv_1d(x)",
+ "conv_2d(x)",
 
  "flash_attn(x)",
  "flash_ff(x)",
@@ -3841,11 +3878,45 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };
 
- static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
+ static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
 
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
+ // WARN:
+ // Misconfiguration can lead to problems that are hard to reason about:
+ // * At best it crashes or talks nonsense.
+ // * At worst the output is subtly wrong and hard to perceive.
+ //
+ // An op has to enable INIT or FINALIZE when any of its branches needs that pass.
+ // Take care with compile options (e.g., GGML_USE_xxx).
+ static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 };
+ static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
+
+ static void ggml_setup_op_has_task_pass(void) {
+ { // INIT
+ bool * p = GGML_OP_HAS_INIT;
+
+ p[GGML_OP_ACC                ] = true;
+ p[GGML_OP_MUL_MAT            ] = true;
+ p[GGML_OP_OUT_PROD           ] = true;
+ p[GGML_OP_SET                ] = true;
+ p[GGML_OP_GET_ROWS_BACK      ] = true;
+ p[GGML_OP_DIAG_MASK_INF      ] = true;
+ p[GGML_OP_DIAG_MASK_ZERO     ] = true;
+ p[GGML_OP_CONV_1D            ] = true;
+ p[GGML_OP_CONV_2D            ] = true;
+ p[GGML_OP_FLASH_ATTN_BACK    ] = true;
+ p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ }
+
+ { // FINALIZE
+ bool * p = GGML_OP_HAS_FINALIZE;
+
+ p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ }
+ }
+
  //
  // ggml context
  //
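
Note: these tables let the scheduler skip whole task passes for ops that never use them, instead of dispatching every op through INIT and FINALIZE unconditionally. A hedged sketch of how a worker would consult them; the actual dispatch lives in the graph-compute code elsewhere in this file and may differ in detail:

    /* Sketch: bail out of a pass early when the op never participates in it. */
    if (params->type == GGML_TASK_INIT     && !GGML_OP_HAS_INIT    [node->op]) return;
    if (params->type == GGML_TASK_FINALIZE && !GGML_OP_HAS_FINALIZE[node->op]) return;
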
@@ -4267,6 +4338,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  ggml_cl_init();
  #endif
 
+ ggml_setup_op_has_task_pass();
+
  is_first_call = false;
  }
 
@@ -5403,6 +5476,30 @@ struct ggml_tensor * ggml_mean(
  return result;
  }
 
+ // ggml_argmax
+
+ struct ggml_tensor * ggml_argmax(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ GGML_ASSERT(ggml_is_matrix(a));
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false);
+ is_node = true;
+ }
+
+ int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+
+ result->op = GGML_OP_ARGMAX;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+ }
+
  // ggml_repeat
 
  struct ggml_tensor * ggml_repeat(
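
Note: ggml_argmax reduces a 2-D tensor row-wise to one int32 index per row (the output has a->ne[1] elements), and the GGML_ASSERT(false) under a->grad makes explicit that no backward pass exists. A minimal usage sketch, assuming an initialized context ctx and hypothetical sizes vocab_size and n_rows:

    /* logits: vocab_size columns per row, n_rows rows */
    struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, vocab_size, n_rows);
    struct ggml_tensor * best   = ggml_argmax(ctx, logits); /* I32, one index per row */
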
@@ -5596,6 +5693,74 @@ struct ggml_tensor * ggml_step_inplace(
  return ggml_step_impl(ctx, a, true);
  }
 
+ // ggml_tanh
+
+ struct ggml_tensor * ggml_tanh_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ bool inplace) {
+ bool is_node = false;
+
+ if (!inplace && (a->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_TANH;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_tanh(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_tanh_impl(ctx, a, false);
+ }
+
+ struct ggml_tensor * ggml_tanh_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_tanh_impl(ctx, a, true);
+ }
+
+ // ggml_elu
+
+ struct ggml_tensor * ggml_elu_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ bool inplace) {
+ bool is_node = false;
+
+ if (!inplace && (a->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_ELU;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_elu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_elu_impl(ctx, a, false);
+ }
+
+ struct ggml_tensor * ggml_elu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_elu_impl(ctx, a, true);
+ }
+
  // ggml_relu
 
  struct ggml_tensor * ggml_relu_impl(
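
Note: tanh and ELU follow the same impl/inplace builder pattern as ggml_step above; the vector kernels added earlier compute tanhf(x) and `x > 0 ? x : expf(x) - 1` per element. Usage mirrors the other unary ops, assuming F32 tensors x and y in an initialized context:

    struct ggml_tensor * t = ggml_tanh(ctx, x);        /* result in a new tensor */
    struct ggml_tensor * e = ggml_elu_inplace(ctx, y); /* view that overwrites y */
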
@@ -6837,6 +7002,8 @@ struct ggml_tensor * ggml_rope_back(
  int n_dims,
  int mode) {
  GGML_ASSERT(n_past >= 0);
+ GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
+
  bool is_node = false;
 
  if (a->grad) {
@@ -6937,15 +7104,21 @@ struct ggml_tensor * ggml_clamp(
  return result;
  }
 
- // ggml_conv_1d_s1_ph
+ // ggml_conv_1d
+
+ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
+ return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+ }
 
- struct ggml_tensor * ggml_conv_1d_s1_ph(
+ GGML_API struct ggml_tensor * ggml_conv_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b) {
+ struct ggml_tensor * b,
+ int s0,
+ int p0,
+ int d0) {
  GGML_ASSERT(ggml_is_matrix(b));
  GGML_ASSERT(a->ne[1] == b->ne[1]);
- GGML_ASSERT(a->ne[3] == 1);
  bool is_node = false;
 
  if (a->grad || b->grad) {
@@ -6953,26 +7126,43 @@ struct ggml_tensor * ggml_conv_1d_s1_ph(
  is_node = true;
  }
 
- const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+ const int64_t ne[4] = {
+ ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
+ a->ne[2], 1, 1,
+ };
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+
+ ggml_scratch_save(ctx);
+ struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+ ((int32_t*)c->data)[0] = s0;
+ ((int32_t*)c->data)[1] = p0;
+ ((int32_t*)c->data)[2] = d0;
+ ggml_scratch_load(ctx);
 
- result->op = GGML_OP_CONV_1D_S1_PH;
+ result->op = GGML_OP_CONV_1D;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = b;
+ result->opt[0] = c;
 
  return result;
  }
 
- // ggml_conv_1d_s2_ph
+ // ggml_conv_2d
 
- struct ggml_tensor * ggml_conv_1d_s2_ph(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b) {
- GGML_ASSERT(ggml_is_matrix(b));
- GGML_ASSERT(a->ne[1] == b->ne[1]);
- GGML_ASSERT(a->ne[3] == 1);
+ struct ggml_tensor* ggml_conv_2d(
+ struct ggml_context* ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1) {
+
+ GGML_ASSERT(b->ne[3] == 1);
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
  bool is_node = false;
 
  if (a->grad || b->grad) {
@@ -6980,43 +7170,42 @@ struct ggml_tensor * ggml_conv_1d_s2_ph(
  is_node = true;
  }
 
- const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+ const int64_t ne[4] = {
+ ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
+ ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
+ a->ne[3], 1,
+ };
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ ggml_scratch_save(ctx);
+ struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
+ ((int32_t*)c->data)[0] = s0;
+ ((int32_t*)c->data)[1] = s1;
+ ((int32_t*)c->data)[2] = p0;
+ ((int32_t*)c->data)[3] = p1;
+ ((int32_t*)c->data)[4] = d0;
+ ((int32_t*)c->data)[5] = d1;
+ ggml_scratch_load(ctx);
 
- result->op = GGML_OP_CONV_1D_S2_PH;
+ result->op = GGML_OP_CONV_2D;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = b;
+ result->opt[0] = c;
 
  return result;
+
  }
 
- // ggml_conv_2d_sk_p0
+ // ggml_conv_1d_ph
 
- struct ggml_tensor * ggml_conv_2d_sk_p0(
+ struct ggml_tensor* ggml_conv_1d_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b) {
- GGML_ASSERT(b->ne[3] == 1);
- GGML_ASSERT(a->ne[2] == b->ne[2]);
- GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
- GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
- bool is_node = false;
-
- if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
- is_node = true;
- }
-
- const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
- result->op = GGML_OP_CONV_2D_SK_P0;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src0 = a;
- result->src1 = b;
-
- return result;
+ struct ggml_tensor * b,
+ int s,
+ int d) {
+ return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
  }
 
  // ggml_flash_attn
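
Note: the three fixed-configuration convolutions are replaced by general ggml_conv_1d/ggml_conv_2d that take stride s, padding p, and dilation d per spatial axis, stashed in an I32 opt[0] tensor allocated under ggml_scratch_save/ggml_scratch_load. The output length per axis comes from the standard formula in ggml_calc_conv_output_size; worked examples:

    /* ins = 10, ks = 3, s = 1, p = 1, d = 1:
       (10 + 2*1 - 1*(3 - 1) - 1)/1 + 1 = 10    -> "same"-length output
       ins = 10, ks = 3, s = 2, p = 0, d = 1:
       (10 + 0 - 2 - 1)/2 + 1 = 4               (integer division) */

ggml_conv_1d_ph keeps the old half-padding behavior by passing p = a->ne[0]/2, half the kernel width.
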
@@ -7566,25 +7755,7 @@ static void ggml_compute_forward_dup_f16(
  return;
  }
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  const int ith = params->ith; // thread index
  const int nth = params->nth; // number of threads
@@ -7657,8 +7828,8 @@ static void ggml_compute_forward_dup_f16(
  id += ne00 * (ne01 - ir1);
  }
  }
- } else if (ggml_is_quantized(dst->type)) {
- quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+ } else if (type_traits[dst->type].from_float) {
+ ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
  float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
  size_t id = 0;
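
Note: the dispatch test changes from ggml_is_quantized(dst->type) to probing type_traits[dst->type].from_float directly, so any destination type with a registered row converter takes this generic path. Since F16 now has a from_float entry, the hand-rolled F32-to-F16 loop is deleted from ggml_compute_forward_dup_f32 in the next hunk. A self-contained sketch of the same lookup through the public test hook:

    /* Sketch: uniform row conversion through the traits table. */
    void convert_row(enum ggml_type dtype, const float * src, void * dst, int n) {
        ggml_from_float_t from_float = ggml_internal_get_type_traits(dtype).from_float;
        if (from_float) {
            from_float(src, dst, n); /* quantize or narrow one row */
        }
    }
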
@@ -7855,25 +8026,7 @@ static void ggml_compute_forward_dup_f32(
  return;
  }
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  const int ith = params->ith; // thread index
  const int nth = params->nth; // number of threads
@@ -7928,26 +8081,8 @@ static void ggml_compute_forward_dup_f32(
  id += rs * (ne01 - ir1);
  }
  }
- } else if (dst->type == GGML_TYPE_F16) {
- size_t id = 0;
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
- for (int i03 = 0; i03 < ne03; i03++) {
- for (int i02 = 0; i02 < ne02; i02++) {
- id += ne00 * ir0;
- for (int i01 = ir0; i01 < ir1; i01++) {
- for (int i00 = 0; i00 < ne00; i00++) {
- const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
- id++;
- }
- }
- id += ne00 * (ne01 - ir1);
- }
- }
- } else if (ggml_is_quantized(dst->type)) {
- quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+ } else if (type_traits[dst->type].from_float) {
+ ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
 
  size_t id = 0;
  size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
@@ -8171,24 +8306,8 @@ static void ggml_compute_forward_add_f32(
  const int nth = params->nth;
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
 
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  GGML_ASSERT( nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
@@ -8257,28 +8376,12 @@ static void ggml_compute_forward_add_f16_f32(
  const int nth = params->nth;
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
 
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
 
  GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8327,24 +8430,8 @@ static void ggml_compute_forward_add_f16_f16(
  const int nth = params->nth;
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
 
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F16);
@@ -8394,32 +8481,15 @@ static void ggml_compute_forward_add_q_f32(
  }
 
  const int nr = ggml_nrows(src0);
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- //const int64_t ne03 = src0->ne[3];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
 
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  const int ith = params->ith;
  const int nth = params->nth;
 
  const enum ggml_type type = src0->type;
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
- quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+ ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
+ ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
  // we don't support permuted src0 or src1
  GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
@@ -8533,19 +8603,8 @@ static void ggml_compute_forward_add1_f32(
  const int nth = params->nth;
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
 
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  GGML_ASSERT( nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
@@ -8599,23 +8658,12 @@ static void ggml_compute_forward_add1_f16_f32(
  const int nth = params->nth;
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
 
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
 
  GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8660,23 +8708,12 @@ static void ggml_compute_forward_add1_f16_f16(
  const int nth = params->nth;
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
 
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
 
  GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8721,23 +8758,12 @@ static void ggml_compute_forward_add1_q_f32(
  const int nth = params->nth;
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
 
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  const enum ggml_type type = src0->type;
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
- quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+ ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
+ ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
  // we don't support permuted src0
  GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
@@ -8865,15 +8891,8 @@ static void ggml_compute_forward_acc_f32(
  const int nr = ggml_nrows(src1);
  const int nc = src1->ne[0];
 
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
-
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
 
  // src0 and dst as viewed during acc
  const size_t nb0 = ggml_element_size(src0);
@@ -8962,24 +8981,8 @@ static void ggml_compute_forward_sub_f32(
  }
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
 
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  GGML_ASSERT( nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9069,29 +9072,7 @@ static void ggml_compute_forward_mul_f32(
 
  const int64_t nr = ggml_nrows(src0);
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
-
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  GGML_ASSERT( nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9179,24 +9160,8 @@ static void ggml_compute_forward_div_f32(
  }
 
  const int nr = ggml_nrows(src0);
- const int64_t ne0 = src0->ne[0];
- const int64_t ne1 = src0->ne[1];
- const int64_t ne2 = src0->ne[2];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb10 = src1->nb[0];
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
 
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  GGML_ASSERT( nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9403,14 +9368,8 @@ static void ggml_compute_forward_sum_f32(
  assert(ggml_is_scalar(dst));
  assert(src0->nb[0] == sizeof(float));
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
 
  ggml_float sum = 0;
  ggml_float row_sum = 0;
@@ -9459,29 +9418,13 @@ static void ggml_compute_forward_sum_rows_f32(
  GGML_ASSERT(src0->nb[0] == sizeof(float));
  GGML_ASSERT(dst->nb[0] == sizeof(float));
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  GGML_ASSERT(ne0 == 1);
  GGML_ASSERT(ne1 == ne01);
  GGML_ASSERT(ne2 == ne02);
  GGML_ASSERT(ne3 == ne03);
 
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
-
  for (int64_t i3 = 0; i3 < ne03; i3++) {
  for (int64_t i2 = 0; i2 < ne02; i2++) {
  for (int64_t i1 = 0; i1 < ne01; i1++) {
@@ -9525,19 +9468,7 @@ static void ggml_compute_forward_mean_f32(
 
  assert(src0->nb[0] == sizeof(float));
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  assert(ne0 == 1);
  assert(ne1 == ne01);
@@ -9549,10 +9480,6 @@ static void ggml_compute_forward_mean_f32(
  UNUSED(ne2);
  UNUSED(ne3);
 
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
-
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  for (int64_t i01 = 0; i01 < ne01; i01++) {
@@ -9582,38 +9509,66 @@ static void ggml_compute_forward_mean(
  }
  }
 
- // ggml_compute_forward_repeat
+ // ggml_compute_forward_argmax
 
- static void ggml_compute_forward_repeat_f32(
+ static void ggml_compute_forward_argmax_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
- GGML_ASSERT(params->ith == 0);
- GGML_ASSERT(ggml_can_repeat(src0, dst));
+ assert(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
 
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
+ assert(src0->nb[0] == sizeof(float));
+ assert(dst->nb[0] == sizeof(float));
 
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
 
- const size_t nb00 = src0->nb[0];
  const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
+ const size_t nb0 = dst->nb[0];
+
+ for (int64_t i1 = 0; i1 < ne01; i1++) {
+ float * src = (float *) ((char *) src0->data + i1*nb01);
+ int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0);
+ int v = 0;
+ ggml_vec_argmax_f32(ne00, &v, src);
+ dst_[0] = v;
+ }
+ }
+
+ static void ggml_compute_forward_argmax(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_argmax_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
+ // ggml_compute_forward_repeat
+
+ static void ggml_compute_forward_repeat_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(params->ith == 0);
+ GGML_ASSERT(ggml_can_repeat(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  // guaranteed to be an integer due to the check in ggml_can_repeat
  const int nr0 = (int)(ne0/ne00);
@@ -9674,25 +9629,7 @@ static void ggml_compute_forward_repeat_back_f32(
  return;
  }
 
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
-
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  // guaranteed to be an integer due to the check in ggml_can_repeat
  const int nr0 = (int)(ne00/ne0);
@@ -9922,6 +9859,90 @@ static void ggml_compute_forward_step(
  }
  }
 
+ // ggml_compute_forward_tanh
+
+ static void ggml_compute_forward_tanh_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ assert(params->ith == 0);
+ assert(ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ assert(dst->nb[0] == sizeof(float));
+ assert(src0->nb[0] == sizeof(float));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_tanh_f32(nc,
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+ }
+
+ static void ggml_compute_forward_tanh(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_tanh_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
+ // ggml_compute_forward_elu
+
+ static void ggml_compute_forward_elu_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ assert(params->ith == 0);
+ assert(ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ assert(dst->nb[0] == sizeof(float));
+ assert(src0->nb[0] == sizeof(float));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_elu_f32(nc,
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+ }
+
+ static void ggml_compute_forward_elu(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_elu_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
  // ggml_compute_forward_relu
 
  static void ggml_compute_forward_relu_f32(
@@ -10223,18 +10244,7 @@ static void ggml_compute_forward_norm_f32(
  const int ith = params->ith;
  const int nth = params->nth;
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  const float eps = 1e-5f; // TODO: make this a parameter
 
@@ -10300,18 +10310,7 @@ static void ggml_compute_forward_rms_norm_f32(
  const int ith = params->ith;
  const int nth = params->nth;
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_UNARY_OP_LOCALS;
 
  const float eps = 1e-6f; // TODO: make this a parameter
 
@@ -10376,22 +10375,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
  const int ith = params->ith;
  const int nth = params->nth;
 
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const size_t nb11 = src1->nb[1];
- const size_t nb12 = src1->nb[2];
- const size_t nb13 = src1->nb[3];
-
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  const float eps = 1e-6f; // TODO: make this a parameter
 
@@ -10541,416 +10525,45 @@ static void ggml_compute_forward_rms_norm_back(
10541
10525
  {
10542
10526
  ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
10543
10527
  } break;
10544
- default:
10545
- {
10546
- GGML_ASSERT(false);
10547
- } break;
10548
- }
10549
- }
10550
-
10551
-
10552
- // ggml_compute_forward_mul_mat
10553
-
10554
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10555
- // helper function to determine if it is better to use BLAS or not
10556
- // for large matrices, BLAS is faster
10557
- static bool ggml_compute_forward_mul_mat_use_blas(
10558
- const struct ggml_tensor * src0,
10559
- const struct ggml_tensor * src1,
10560
- struct ggml_tensor * dst) {
10561
- //const int64_t ne00 = src0->ne[0];
10562
- //const int64_t ne01 = src0->ne[1];
10563
-
10564
- const int64_t ne10 = src1->ne[0];
10565
-
10566
- const int64_t ne0 = dst->ne[0];
10567
- const int64_t ne1 = dst->ne[1];
10568
-
10569
- // TODO: find the optimal values for these
10570
- if (ggml_is_contiguous(src0) &&
10571
- ggml_is_contiguous(src1) &&
10572
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
10573
-
10574
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
10575
- return true;
10576
- }
10577
-
10578
- return false;
10579
- }
10580
- #endif
10581
-
10582
- static void ggml_compute_forward_mul_mat_f32(
10583
- const struct ggml_compute_params * params,
10584
- const struct ggml_tensor * src0,
10585
- const struct ggml_tensor * src1,
10586
- struct ggml_tensor * dst) {
10587
- int64_t t0 = ggml_perf_time_us();
10588
- UNUSED(t0);
10589
-
10590
- const int64_t ne00 = src0->ne[0];
10591
- const int64_t ne01 = src0->ne[1];
10592
- const int64_t ne02 = src0->ne[2];
10593
- const int64_t ne03 = src0->ne[3];
10594
-
10595
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10596
- const int64_t ne10 = src1->ne[0];
10597
- #endif
10598
- const int64_t ne11 = src1->ne[1];
10599
- #ifndef NDEBUG
10600
- const int64_t ne12 = src1->ne[2];
10601
- const int64_t ne13 = src1->ne[3];
10602
-
10603
- const int64_t ne0 = dst->ne[0];
10604
- const int64_t ne1 = dst->ne[1];
10605
- const int64_t ne2 = dst->ne[2];
10606
- const int64_t ne3 = dst->ne[3];
10607
-
10608
- const int nb00 = src0->nb[0];
10609
- #endif
10610
- const int nb01 = src0->nb[1];
10611
- const int nb02 = src0->nb[2];
10612
- const int nb03 = src0->nb[3];
10613
-
10614
- #ifndef NDEBUG
10615
- const int nb10 = src1->nb[0];
10616
- #endif
10617
- const int nb11 = src1->nb[1];
10618
- const int nb12 = src1->nb[2];
10619
- const int nb13 = src1->nb[3];
10620
-
10621
- const int nb0 = dst->nb[0];
10622
- const int nb1 = dst->nb[1];
10623
- const int nb2 = dst->nb[2];
10624
- const int nb3 = dst->nb[3];
10625
-
10626
- const int ith = params->ith;
10627
- const int nth = params->nth;
10628
-
10629
- assert(ne02 == ne12);
10630
- assert(ne03 == ne13);
10631
- assert(ne2 == ne12);
10632
- assert(ne3 == ne13);
10633
-
10634
- // we don't support permuted src0 or src1
10635
- assert(nb00 == sizeof(float));
10636
- assert(nb10 == sizeof(float));
10637
-
10638
- // dst cannot be transposed or permuted
10639
- assert(nb0 == sizeof(float));
10640
- assert(nb0 <= nb1);
10641
- assert(nb1 <= nb2);
10642
- assert(nb2 <= nb3);
10643
-
10644
- assert(ne0 == ne01);
10645
- assert(ne1 == ne11);
10646
- assert(ne2 == ne02);
10647
- assert(ne3 == ne03);
10648
-
10649
- // nb01 >= nb00 - src0 is not transposed
10650
- // compute by src0 rows
10651
-
10652
- #if defined(GGML_USE_CLBLAST)
10653
- if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10654
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10655
- ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10656
- }
10657
- return;
10658
- }
10659
- #endif
10660
-
10661
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10662
- if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
10663
- if (params->ith != 0) {
10664
- return;
10665
- }
10666
-
10667
- if (params->type == GGML_TASK_INIT) {
10668
- return;
10669
- }
10670
-
10671
- if (params->type == GGML_TASK_FINALIZE) {
10672
- return;
10673
- }
10674
-
10675
- for (int64_t i03 = 0; i03 < ne03; i03++) {
10676
- for (int64_t i02 = 0; i02 < ne02; i02++) {
10677
- const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
10678
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
10679
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
10680
-
10681
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
10682
- ne11, ne01, ne10,
10683
- 1.0f, y, ne10,
10684
- x, ne00,
10685
- 0.0f, d, ne01);
10686
- }
10687
- }
10688
- //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
10689
-
10690
- return;
10691
- }
10692
- #endif
10693
-
10694
- if (params->type == GGML_TASK_INIT) {
10695
- return;
10696
- }
10697
-
10698
- if (params->type == GGML_TASK_FINALIZE) {
10699
- return;
10700
- }
10701
-
10702
- // parallelize by src0 rows using ggml_vec_dot_f32
10703
-
10704
- // total rows in src0
10705
- const int nr = ne01*ne02*ne03;
10706
-
10707
- // rows per thread
10708
- const int dr = (nr + nth - 1)/nth;
10709
-
10710
- // row range for this thread
10711
- const int ir0 = dr*ith;
10712
- const int ir1 = MIN(ir0 + dr, nr);
10713
-
10714
- for (int ir = ir0; ir < ir1; ++ir) {
10715
- // src0 indices
10716
- const int i03 = ir/(ne02*ne01);
10717
- const int i02 = (ir - i03*ne02*ne01)/ne01;
10718
- const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
10719
-
10720
- for (int64_t ic = 0; ic < ne11; ++ic) {
10721
- // src1 indices
10722
- const int i13 = i03;
10723
- const int i12 = i02;
10724
- const int i11 = ic;
10725
-
10726
- // dst indices
10727
- const int i0 = i01;
10728
- const int i1 = i11;
10729
- const int i2 = i02;
10730
- const int i3 = i03;
10731
-
10732
- ggml_vec_dot_f32(ne00,
10733
- (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
10734
- (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
10735
- (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
10736
- }
10737
- }
10738
-
10739
- //int64_t t1 = ggml_perf_time_us();
10740
- //static int64_t acc = 0;
10741
- //acc += t1 - t0;
10742
- //if (t1 - t0 > 10) {
10743
- // printf("\n");
10744
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
10745
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
10746
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
10747
- // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
10748
-
10749
- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
10750
- //}
10751
- }
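All three type-specific mul_mat kernels removed in this range share the row-partitioning idiom above (dr rows per thread, clamped at the end), and the unified kernel introduced further down keeps it unchanged. A standalone sketch of the arithmetic:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int nr  = 37; // total rows in src0 (ne01*ne02*ne03)
        const int nth = 8;  // worker threads

        const int dr = (nr + nth - 1)/nth; // rows per thread, rounded up
        for (int ith = 0; ith < nth; ith++) {
            const int ir0 = dr*ith;
            const int ir1 = MIN(ir0 + dr, nr);
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0; // the last thread's range is clamped to nr (and can be empty)
    }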
10752
-
10753
- static void ggml_compute_forward_mul_mat_f16_f32(
10754
- const struct ggml_compute_params * params,
10755
- const struct ggml_tensor * src0,
10756
- const struct ggml_tensor * src1,
10757
- struct ggml_tensor * dst) {
10758
- int64_t t0 = ggml_perf_time_us();
10759
- UNUSED(t0);
10760
-
10761
- const int64_t ne00 = src0->ne[0];
10762
- const int64_t ne01 = src0->ne[1];
10763
- const int64_t ne02 = src0->ne[2];
10764
- const int64_t ne03 = src0->ne[3];
10765
-
10766
- const int64_t ne10 = src1->ne[0];
10767
- const int64_t ne11 = src1->ne[1];
10768
- const int64_t ne12 = src1->ne[2];
10769
- const int64_t ne13 = src1->ne[3];
10770
-
10771
- const int64_t ne0 = dst->ne[0];
10772
- const int64_t ne1 = dst->ne[1];
10773
- const int64_t ne2 = dst->ne[2];
10774
- const int64_t ne3 = dst->ne[3];
10775
- //const int64_t ne = ne0*ne1*ne2*ne3;
10776
-
10777
- const int nb00 = src0->nb[0];
10778
- const int nb01 = src0->nb[1];
10779
- const int nb02 = src0->nb[2];
10780
- const int nb03 = src0->nb[3];
10781
-
10782
- const int nb10 = src1->nb[0];
10783
- const int nb11 = src1->nb[1];
10784
- const int nb12 = src1->nb[2];
10785
- const int nb13 = src1->nb[3];
10786
-
10787
- const int nb0 = dst->nb[0];
10788
- const int nb1 = dst->nb[1];
10789
- const int nb2 = dst->nb[2];
10790
- const int nb3 = dst->nb[3];
10791
-
10792
- const int ith = params->ith;
10793
- const int nth = params->nth;
10794
-
10795
- GGML_ASSERT(ne02 == ne12);
10796
- GGML_ASSERT(ne03 == ne13);
10797
- GGML_ASSERT(ne2 == ne12);
10798
- GGML_ASSERT(ne3 == ne13);
10799
-
10800
- // TODO: we don't support permuted src0
10801
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
10802
-
10803
- // dst cannot be transposed or permuted
10804
- GGML_ASSERT(nb0 == sizeof(float));
10805
- GGML_ASSERT(nb0 <= nb1);
10806
- GGML_ASSERT(nb1 <= nb2);
10807
- GGML_ASSERT(nb2 <= nb3);
10808
-
10809
- GGML_ASSERT(ne0 == ne01);
10810
- GGML_ASSERT(ne1 == ne11);
10811
- GGML_ASSERT(ne2 == ne02);
10812
- GGML_ASSERT(ne3 == ne03);
10813
-
10814
- // nb01 >= nb00 - src0 is not transposed
10815
- // compute by src0 rows
10816
-
10817
- #if defined(GGML_USE_CLBLAST)
10818
- if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10819
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10820
- ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10821
- }
10822
- return;
10823
- }
10824
- #endif
10825
-
10826
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10827
- if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
10828
- GGML_ASSERT(nb10 == sizeof(float));
10829
-
10830
- if (params->ith != 0) {
10831
- return;
10832
- }
10833
-
10834
- if (params->type == GGML_TASK_INIT) {
10835
- return;
10836
- }
10837
-
10838
- if (params->type == GGML_TASK_FINALIZE) {
10839
- return;
10840
- }
10841
-
10842
- for (int64_t i03 = 0; i03 < ne03; i03++) {
10843
- for (int64_t i02 = 0; i02 < ne02; i02++) {
10844
- float * const wdata = params->wdata;
10845
- {
10846
- size_t id = 0;
10847
- for (int64_t i01 = 0; i01 < ne01; ++i01) {
10848
- for (int64_t i00 = 0; i00 < ne00; ++i00) {
10849
- wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
10850
- }
10851
- }
10852
-
10853
- assert(id*sizeof(float) <= params->wsize);
10854
- }
10855
-
10856
- const float * x = wdata;
10857
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
10858
-
10859
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
10860
-
10861
- // zT = y * xT
10862
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
10863
- ne11, ne01, ne10,
10864
- 1.0f, y, ne10,
10865
- x, ne00,
10866
- 0.0f, d, ne01);
10867
- }
10868
- }
10869
-
10870
- /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
10871
-
10872
- return;
10873
- }
10874
- #endif
10875
-
10876
- if (params->type == GGML_TASK_INIT) {
10877
- ggml_fp16_t * const wdata = params->wdata;
10878
-
10879
- size_t id = 0;
10880
- for (int64_t i13 = 0; i13 < ne13; ++i13) {
10881
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
10882
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
10883
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
10884
- wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
10885
- }
10886
- }
10887
- }
10888
- }
10889
-
10890
- GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize);
10891
-
10892
- return;
10893
- }
10894
-
10895
- if (params->type == GGML_TASK_FINALIZE) {
10896
- return;
10528
+ default:
10529
+ {
10530
+ GGML_ASSERT(false);
10531
+ } break;
10897
10532
  }
10533
+ }
10898
10534
 
10899
- // fp16 -> half the size, so divide by 2
10900
- // TODO: do not support transposed src1
10901
- assert(nb10/2 == sizeof(ggml_fp16_t));
10902
-
10903
- // parallelize by src0 rows using ggml_vec_dot_f16
10904
-
10905
- // total rows in src0
10906
- const int nr = ne01*ne02*ne03;
10907
-
10908
- // rows per thread
10909
- const int dr = (nr + nth - 1)/nth;
10910
-
10911
- // row range for this thread
10912
- const int ir0 = dr*ith;
10913
- const int ir1 = MIN(ir0 + dr, nr);
10914
-
10915
- ggml_fp16_t * wdata = params->wdata;
10916
10535
 
10917
- for (int ir = ir0; ir < ir1; ++ir) {
10918
- // src0 indices
10919
- const int i03 = ir/(ne02*ne01);
10920
- const int i02 = (ir - i03*ne02*ne01)/ne01;
10921
- const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
10536
+ // ggml_compute_forward_mul_mat
10922
10537
 
10923
- const int i13 = i03;
10924
- const int i12 = i02;
10538
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10539
+ // helper function to determine if it is better to use BLAS or not
10540
+ // for large matrices, BLAS is faster
10541
+ static bool ggml_compute_forward_mul_mat_use_blas(
10542
+ const struct ggml_tensor * src0,
10543
+ const struct ggml_tensor * src1,
10544
+ struct ggml_tensor * dst) {
10545
+ //const int64_t ne00 = src0->ne[0];
10546
+ //const int64_t ne01 = src0->ne[1];
10925
10547
 
10926
- const int i0 = i01;
10927
- const int i2 = i02;
10928
- const int i3 = i03;
10548
+ const int64_t ne10 = src1->ne[0];
10929
10549
 
10930
- ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
10931
- ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00;
10550
+ const int64_t ne0 = dst->ne[0];
10551
+ const int64_t ne1 = dst->ne[1];
10932
10552
 
10933
- float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
10553
+ // TODO: find the optimal values for these
10554
+ if (ggml_is_contiguous(src0) &&
10555
+ ggml_is_contiguous(src1) &&
10556
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
10934
10557
 
10935
- for (int64_t ic = 0; ic < ne11; ++ic) {
10936
- ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
10937
- }
10558
+ /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
10559
+ return true;
10938
10560
  }
10939
10561
 
10940
- //int64_t t1 = ggml_time_us();
10941
- //static int64_t acc = 0;
10942
- //acc += t1 - t0;
10943
- //if (t1 - t0 > 10) {
10944
- // printf("\n");
10945
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
10946
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
10947
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
10948
-
10949
- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
10950
- //}
10562
+ return false;
10951
10563
  }
10564
+ #endif
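The BLAS gate itself is only relocated, not changed: both sources must be contiguous and all three GEMM dimensions must reach 32 before cblas_sgemm is preferred over the threaded vec_dot path. A sketch of the size test in isolation (thresholds straight from the code above; the ggml_is_contiguous checks are omitted):

    #include <stdbool.h>
    #include <stdio.h>

    // dst rows/cols (ne0, ne1) and the shared inner dimension (ne10)
    static bool big_enough_for_blas(long ne0, long ne1, long ne10) {
        return ne0 >= 32 && ne1 >= 32 && ne10 >= 32;
    }

    int main(void) {
        printf("%d\n", big_enough_for_blas(4096, 512, 4096)); // 1: batched prompt, use BLAS
        printf("%d\n", big_enough_for_blas(4096, 1,   4096)); // 0: single row, stay on vec_dot
        return 0;
    }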
10952
10565
 
10953
- static void ggml_compute_forward_mul_mat_q_f32(
10566
+ static void ggml_compute_forward_mul_mat(
10954
10567
  const struct ggml_compute_params * params,
10955
10568
  const struct ggml_tensor * src0,
10956
10569
  const struct ggml_tensor * src1,
@@ -10958,35 +10571,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
10958
10571
  int64_t t0 = ggml_perf_time_us();
10959
10572
  UNUSED(t0);
10960
10573
 
10961
- const int64_t ne00 = src0->ne[0];
10962
- const int64_t ne01 = src0->ne[1];
10963
- const int64_t ne02 = src0->ne[2];
10964
- const int64_t ne03 = src0->ne[3];
10965
-
10966
- const int64_t ne10 = src1->ne[0];
10967
- const int64_t ne11 = src1->ne[1];
10968
- const int64_t ne12 = src1->ne[2];
10969
- const int64_t ne13 = src1->ne[3];
10970
-
10971
- const int64_t ne0 = dst->ne[0];
10972
- const int64_t ne1 = dst->ne[1];
10973
- const int64_t ne2 = dst->ne[2];
10974
- const int64_t ne3 = dst->ne[3];
10975
-
10976
- const int nb00 = src0->nb[0];
10977
- const int nb01 = src0->nb[1];
10978
- const int nb02 = src0->nb[2];
10979
- const int nb03 = src0->nb[3];
10980
-
10981
- const int nb10 = src1->nb[0];
10982
- const int nb11 = src1->nb[1];
10983
- const int nb12 = src1->nb[2];
10984
- const int nb13 = src1->nb[3];
10985
-
10986
- const int nb0 = dst->nb[0];
10987
- const int nb1 = dst->nb[1];
10988
- const int nb2 = dst->nb[2];
10989
- const int nb3 = dst->nb[3];
10574
+ GGML_TENSOR_BINARY_OP_LOCALS;
10990
10575
 
10991
10576
  const int ith = params->ith;
10992
10577
  const int nth = params->nth;
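Every boilerplate block of ne*/nb* locals like the one deleted above now collapses into a single macro invocation. GGML_TENSOR_LOCALS itself is defined elsewhere in ggml.c and is not part of this diff; judging from the generated names used throughout (ne00..ne03, nb10..nb13, neq0..neq3, ...), a plausible sketch of its expansion is:

    // Hypothetical sketch; the real definition may differ in detail.
    // Declares four locals <prefix>0..<prefix>3 read from <pointer>-><array>,
    // silencing unused-variable warnings for the ones a kernel ignores.
    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array)           \
        const type prefix##0 = (pointer)->array[0]; UNUSED(prefix##0); \
        const type prefix##1 = (pointer)->array[1]; UNUSED(prefix##1); \
        const type prefix##2 = (pointer)->array[2]; UNUSED(prefix##2); \
        const type prefix##3 = (pointer)->array[3]; UNUSED(prefix##3);

So GGML_TENSOR_LOCALS(size_t, nb1, src1, nb), for instance, stands in for the four "const size_t nb10 = src1->nb[0]; ..." declarations this diff keeps deleting.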
@@ -10997,12 +10582,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
10997
10582
  GGML_ASSERT(ne3 == ne13);
10998
10583
 
10999
10584
  const enum ggml_type type = src0->type;
11000
- quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot;
11001
- vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q;
11002
- enum ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type;
10585
+
10586
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
10587
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
10588
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
11003
10589
 
11004
10590
  // we don't support permuted src0 or src1
11005
- GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
10591
+ GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
11006
10592
  GGML_ASSERT(nb10 == sizeof(float));
11007
10593
 
11008
10594
  // dst cannot be transposed or permuted
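The per-type function pointers now come from the type_traits table rather than quantize_fns, which is what lets this single ggml_compute_forward_mul_mat replace the three type-specific kernels removed above. A minimal self-contained illustration of the table-driven dispatch pattern (toy names; the real ggml_type_traits_t carries more members):

    #include <stdio.h>

    typedef void (*vec_dot_t)(int n, float * s, const void * x, const void * y);

    static void toy_vec_dot_f32(int n, float * s, const void * vx, const void * vy) {
        const float * x = vx;
        const float * y = vy;
        float sum = 0.0f;
        for (int i = 0; i < n; i++) sum += x[i]*y[i];
        *s = sum;
    }

    enum toy_type { TOY_F32, TOY_COUNT };

    struct toy_traits {
        vec_dot_t     vec_dot;      // row-dot kernel for this type
        enum toy_type vec_dot_type; // type the second operand must be in
    };

    static const struct toy_traits traits[TOY_COUNT] = {
        [TOY_F32] = { .vec_dot = toy_vec_dot_f32, .vec_dot_type = TOY_F32 },
    };

    int main(void) {
        float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, s;
        traits[TOY_F32].vec_dot(4, &s, a, b); // dispatch through the table
        printf("%.1f\n", s);                  // 20.0
        return 0;
    }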
@@ -11042,27 +10628,27 @@ static void ggml_compute_forward_mul_mat_q_f32(
11042
10628
  return;
11043
10629
  }
11044
10630
 
11045
- float * const wdata = params->wdata;
11046
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
11047
-
11048
10631
  for (int64_t i03 = 0; i03 < ne03; i03++) {
11049
10632
  for (int64_t i02 = 0; i02 < ne02; i02++) {
10633
+ const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
11050
10634
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
11051
10635
 
11052
10636
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
11053
10637
 
11054
- {
10638
+ if (type != GGML_TYPE_F32) {
10639
+ float * const wdata = params->wdata;
10640
+ ggml_to_float_t const to_float = type_traits[type].to_float;
10641
+
11055
10642
  size_t id = 0;
11056
10643
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
11057
- dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
10644
+ to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
11058
10645
  id += ne00;
11059
10646
  }
11060
10647
 
11061
10648
  assert(id*sizeof(float) <= params->wsize);
10649
+ x = wdata;
11062
10650
  }
11063
10651
 
11064
- const float * x = wdata;
11065
-
11066
10652
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
11067
10653
  ne11, ne01, ne10,
11068
10654
  1.0f, y, ne10,
@@ -11078,14 +10664,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
11078
10664
  #endif
11079
10665
 
11080
10666
  if (params->type == GGML_TASK_INIT) {
11081
- char * wdata = params->wdata;
11082
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11083
-
11084
- for (int64_t i13 = 0; i13 < ne13; ++i13) {
11085
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
11086
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
11087
- quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
11088
- wdata += row_size;
10667
+ if (src1->type != vec_dot_type) {
10668
+ char * wdata = params->wdata;
10669
+ const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
10670
+
10671
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
10672
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
10673
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
10674
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
10675
+ wdata += row_size;
10676
+ }
11089
10677
  }
11090
10678
  }
11091
10679
  }
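The INIT phase now skips the src1 conversion entirely when src1 is already stored as vec_dot_type; the scratch row size formula is unchanged. As a worked example, assuming the usual Q8_0 layout of 32-weight blocks with a 2-byte fp16 scale (34 bytes per block; the block layout is an assumption, it is not shown in this diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const size_t  type_size = 34;   // stand-in for GGML_TYPE_SIZE[GGML_TYPE_Q8_0]
        const int64_t blck_size = 32;   // stand-in for GGML_BLCK_SIZE[GGML_TYPE_Q8_0]
        const int64_t ne10      = 4096; // row length of src1

        // row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]
        const size_t row_size = ne10*type_size/blck_size;
        printf("%zu bytes per converted row\n", row_size); // 4352
        return 0;
    }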
@@ -11109,7 +10697,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
11109
10697
  const int ir0 = dr*ith;
11110
10698
  const int ir1 = MIN(ir0 + dr, nr);
11111
10699
 
11112
- void * wdata = params->wdata;
10700
+ void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
11113
10701
  const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11114
10702
 
11115
10703
  for (int ir = ir0; ir < ir1; ++ir) {
@@ -11133,7 +10721,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
11133
10721
  assert(ne00 % 32 == 0);
11134
10722
 
11135
10723
  for (int64_t ic = 0; ic < ne11; ++ic) {
11136
- vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
10724
+ vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
11137
10725
  }
11138
10726
  }
11139
10727
 
@@ -11150,40 +10738,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
11150
10738
  //}
11151
10739
  }
11152
10740
 
11153
- static void ggml_compute_forward_mul_mat(
11154
- const struct ggml_compute_params * params,
11155
- const struct ggml_tensor * src0,
11156
- const struct ggml_tensor * src1,
11157
- struct ggml_tensor * dst) {
11158
- switch (src0->type) {
11159
- case GGML_TYPE_Q4_0:
11160
- case GGML_TYPE_Q4_1:
11161
- case GGML_TYPE_Q5_0:
11162
- case GGML_TYPE_Q5_1:
11163
- case GGML_TYPE_Q8_0:
11164
- case GGML_TYPE_Q8_1:
11165
- case GGML_TYPE_Q2_K:
11166
- case GGML_TYPE_Q3_K:
11167
- case GGML_TYPE_Q4_K:
11168
- case GGML_TYPE_Q5_K:
11169
- case GGML_TYPE_Q6_K:
11170
- {
11171
- ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
11172
- } break;
11173
- case GGML_TYPE_F16:
11174
- {
11175
- ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst);
11176
- } break;
11177
- case GGML_TYPE_F32:
11178
- {
11179
- ggml_compute_forward_mul_mat_f32(params, src0, src1, dst);
11180
- } break;
11181
- default:
11182
- {
11183
- GGML_ASSERT(false);
11184
- } break;
11185
- }
11186
- }
11187
10741
 
11188
10742
  // ggml_compute_forward_out_prod
11189
10743
 
@@ -11196,35 +10750,7 @@ static void ggml_compute_forward_out_prod_f32(
11196
10750
  int64_t t0 = ggml_perf_time_us();
11197
10751
  UNUSED(t0);
11198
10752
 
11199
- const int64_t ne00 = src0->ne[0];
11200
- const int64_t ne01 = src0->ne[1];
11201
- const int64_t ne02 = src0->ne[2];
11202
- const int64_t ne03 = src0->ne[3];
11203
-
11204
- const int64_t ne10 = src1->ne[0];
11205
- //const int64_t ne11 = src1->ne[1];
11206
- const int64_t ne12 = src1->ne[2];
11207
- const int64_t ne13 = src1->ne[3];
11208
-
11209
- const int64_t ne0 = dst->ne[0];
11210
- const int64_t ne1 = dst->ne[1];
11211
- const int64_t ne2 = dst->ne[2];
11212
- const int64_t ne3 = dst->ne[3];
11213
-
11214
- const int nb00 = src0->nb[0];
11215
- const int nb01 = src0->nb[1];
11216
- const int nb02 = src0->nb[2];
11217
- const int nb03 = src0->nb[3];
11218
-
11219
- const int nb10 = src1->nb[0];
11220
- const int nb11 = src1->nb[1];
11221
- const int nb12 = src1->nb[2];
11222
- const int nb13 = src1->nb[3];
11223
-
11224
- const int nb0 = dst->nb[0];
11225
- const int nb1 = dst->nb[1];
11226
- const int nb2 = dst->nb[2];
11227
- const int nb3 = dst->nb[3];
10753
+ GGML_TENSOR_BINARY_OP_LOCALS;
11228
10754
 
11229
10755
  const int ith = params->ith;
11230
10756
  const int nth = params->nth;
@@ -11459,15 +10985,8 @@ static void ggml_compute_forward_set_f32(
11459
10985
  const int nr = ggml_nrows(src1);
11460
10986
  const int nc = src1->ne[0];
11461
10987
 
11462
- const int64_t ne10 = src1->ne[0];
11463
- const int64_t ne11 = src1->ne[1];
11464
- const int64_t ne12 = src1->ne[2];
11465
- const int64_t ne13 = src1->ne[3];
11466
-
11467
- const size_t nb10 = src1->nb[0];
11468
- const size_t nb11 = src1->nb[1];
11469
- const size_t nb12 = src1->nb[2];
11470
- const size_t nb13 = src1->nb[3];
10988
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
10989
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
11471
10990
 
11472
10991
  // src0 and dst as viewed during set
11473
10992
  const size_t nb0 = ggml_element_size(src0);
@@ -11608,7 +11127,7 @@ static void ggml_compute_forward_get_rows_q(
11608
11127
  const int nc = src0->ne[0];
11609
11128
  const int nr = ggml_nelements(src1);
11610
11129
  const enum ggml_type type = src0->type;
11611
- dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
11130
+ ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
11612
11131
 
11613
11132
  assert( dst->ne[0] == nc);
11614
11133
  assert( dst->ne[1] == nr);
@@ -11858,29 +11377,14 @@ static void ggml_compute_forward_diag_f32(
11858
11377
 
11859
11378
  // TODO: handle transposed/permuted matrices
11860
11379
 
11861
- const int ne00 = src0->ne[0];
11862
- const int ne01 = src0->ne[1];
11863
- const int ne02 = src0->ne[2];
11864
- const int ne03 = src0->ne[3];
11865
- const int ne0 = dst->ne[0];
11866
- const int ne1 = dst->ne[1];
11867
- const int ne2 = dst->ne[2];
11868
- const int ne3 = dst->ne[3];
11380
+ GGML_TENSOR_UNARY_OP_LOCALS;
11381
+
11869
11382
  GGML_ASSERT(ne00 == ne0);
11870
11383
  GGML_ASSERT(ne00 == ne1);
11871
11384
  GGML_ASSERT(ne01 == 1);
11872
11385
  GGML_ASSERT(ne02 == ne2);
11873
11386
  GGML_ASSERT(ne03 == ne3);
11874
11387
 
11875
- const int nb00 = src0->nb[0];
11876
- //const int nb01 = src0->nb[1];
11877
- const int nb02 = src0->nb[2];
11878
- const int nb03 = src0->nb[3];
11879
- const int nb0 = dst->nb[0];
11880
- const int nb1 = dst->nb[1];
11881
- const int nb2 = dst->nb[2];
11882
- const int nb3 = dst->nb[3];
11883
-
11884
11388
  GGML_ASSERT(nb00 == sizeof(float));
11885
11389
  GGML_ASSERT(nb0 == sizeof(float));
11886
11390
 
@@ -12457,20 +11961,7 @@ static void ggml_compute_forward_rope_f32(
12457
11961
 
12458
11962
  assert(n_past >= 0);
12459
11963
 
12460
- const size_t nb00 = src0->nb[0];
12461
- const size_t nb01 = src0->nb[1];
12462
- const size_t nb02 = src0->nb[2];
12463
- const size_t nb03 = src0->nb[3];
12464
-
12465
- const int64_t ne0 = dst->ne[0];
12466
- const int64_t ne1 = dst->ne[1];
12467
- const int64_t ne2 = dst->ne[2];
12468
- const int64_t ne3 = dst->ne[3];
12469
-
12470
- const size_t nb0 = dst->nb[0];
12471
- const size_t nb1 = dst->nb[1];
12472
- const size_t nb2 = dst->nb[2];
12473
- const size_t nb3 = dst->nb[3];
11964
+ GGML_TENSOR_UNARY_OP_LOCALS;
12474
11965
 
12475
11966
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12476
11967
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12597,20 +12088,7 @@ static void ggml_compute_forward_rope_f16(
12597
12088
 
12598
12089
  assert(n_past >= 0);
12599
12090
 
12600
- const size_t nb00 = src0->nb[0];
12601
- const size_t nb01 = src0->nb[1];
12602
- const size_t nb02 = src0->nb[2];
12603
- const size_t nb03 = src0->nb[3];
12604
-
12605
- const int64_t ne0 = dst->ne[0];
12606
- const int64_t ne1 = dst->ne[1];
12607
- const int64_t ne2 = dst->ne[2];
12608
- const int64_t ne3 = dst->ne[3];
12609
-
12610
- const size_t nb0 = dst->nb[0];
12611
- const size_t nb1 = dst->nb[1];
12612
- const size_t nb2 = dst->nb[2];
12613
- const size_t nb3 = dst->nb[3];
12091
+ GGML_TENSOR_UNARY_OP_LOCALS;
12614
12092
 
12615
12093
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12616
12094
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12763,21 +12241,7 @@ static void ggml_compute_forward_rope_back_f32(
12763
12241
 
12764
12242
  assert(n_past >= 0);
12765
12243
 
12766
- const size_t nb00 = src0->nb[0];
12767
- const size_t nb01 = src0->nb[1];
12768
- const size_t nb02 = src0->nb[2];
12769
- const size_t nb03 = src0->nb[3];
12770
-
12771
- const int64_t ne0 = dst->ne[0];
12772
- const int64_t ne1 = dst->ne[1];
12773
- const int64_t ne2 = dst->ne[2];
12774
- const int64_t ne3 = dst->ne[3];
12775
-
12776
- const size_t nb0 = dst->nb[0];
12777
- const size_t nb1 = dst->nb[1];
12778
- const size_t nb2 = dst->nb[2];
12779
- const size_t nb3 = dst->nb[3];
12780
-
12244
+ GGML_TENSOR_UNARY_OP_LOCALS;
12781
12245
 
12782
12246
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12783
12247
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12876,21 +12340,7 @@ static void ggml_compute_forward_rope_back_f16(
12876
12340
 
12877
12341
  assert(n_past >= 0);
12878
12342
 
12879
- const size_t nb00 = src0->nb[0];
12880
- const size_t nb01 = src0->nb[1];
12881
- const size_t nb02 = src0->nb[2];
12882
- const size_t nb03 = src0->nb[3];
12883
-
12884
- const int64_t ne0 = dst->ne[0];
12885
- const int64_t ne1 = dst->ne[1];
12886
- const int64_t ne2 = dst->ne[2];
12887
- const int64_t ne3 = dst->ne[3];
12888
-
12889
- const size_t nb0 = dst->nb[0];
12890
- const size_t nb1 = dst->nb[1];
12891
- const size_t nb2 = dst->nb[2];
12892
- const size_t nb3 = dst->nb[3];
12893
-
12343
+ GGML_TENSOR_UNARY_OP_LOCALS;
12894
12344
 
12895
12345
  //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
12896
12346
  //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12988,7 +12438,7 @@ static void ggml_compute_forward_rope_back(
12988
12438
  }
12989
12439
  }
12990
12440
 
12991
- // ggml_compute_forward_conv_1d_s1_ph
12441
+ // ggml_compute_forward_conv_1d
12992
12442
 
12993
12443
  static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
12994
12444
  const struct ggml_compute_params * params,
@@ -13002,36 +12452,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
13002
12452
  int64_t t0 = ggml_perf_time_us();
13003
12453
  UNUSED(t0);
13004
12454
 
13005
- const int64_t ne00 = src0->ne[0];
13006
- const int64_t ne01 = src0->ne[1];
13007
- const int64_t ne02 = src0->ne[2];
13008
- //const int64_t ne03 = src0->ne[3];
13009
-
13010
- const int64_t ne10 = src1->ne[0];
13011
- const int64_t ne11 = src1->ne[1];
13012
- //const int64_t ne12 = src1->ne[2];
13013
- //const int64_t ne13 = src1->ne[3];
13014
-
13015
- //const int64_t ne0 = dst->ne[0];
13016
- //const int64_t ne1 = dst->ne[1];
13017
- //const int64_t ne2 = dst->ne[2];
13018
- //const int64_t ne3 = dst->ne[3];
13019
- //const int64_t ne = ne0*ne1*ne2*ne3;
13020
-
13021
- const int nb00 = src0->nb[0];
13022
- const int nb01 = src0->nb[1];
13023
- const int nb02 = src0->nb[2];
13024
- //const int nb03 = src0->nb[3];
13025
-
13026
- const int nb10 = src1->nb[0];
13027
- const int nb11 = src1->nb[1];
13028
- //const int nb12 = src1->nb[2];
13029
- //const int nb13 = src1->nb[3];
13030
-
13031
- //const int nb0 = dst->nb[0];
13032
- const int nb1 = dst->nb[1];
13033
- //const int nb2 = dst->nb[2];
13034
- //const int nb3 = dst->nb[3];
12455
+ GGML_TENSOR_BINARY_OP_LOCALS;
13035
12456
 
13036
12457
  const int ith = params->ith;
13037
12458
  const int nth = params->nth;
@@ -13122,36 +12543,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32(
13122
12543
  int64_t t0 = ggml_perf_time_us();
13123
12544
  UNUSED(t0);
13124
12545
 
13125
- const int64_t ne00 = src0->ne[0];
13126
- const int64_t ne01 = src0->ne[1];
13127
- const int64_t ne02 = src0->ne[2];
13128
- //const int64_t ne03 = src0->ne[3];
13129
-
13130
- const int64_t ne10 = src1->ne[0];
13131
- const int64_t ne11 = src1->ne[1];
13132
- //const int64_t ne12 = src1->ne[2];
13133
- //const int64_t ne13 = src1->ne[3];
13134
-
13135
- //const int64_t ne0 = dst->ne[0];
13136
- //const int64_t ne1 = dst->ne[1];
13137
- //const int64_t ne2 = dst->ne[2];
13138
- //const int64_t ne3 = dst->ne[3];
13139
- //const int64_t ne = ne0*ne1*ne2*ne3;
13140
-
13141
- const int nb00 = src0->nb[0];
13142
- const int nb01 = src0->nb[1];
13143
- const int nb02 = src0->nb[2];
13144
- //const int nb03 = src0->nb[3];
13145
-
13146
- const int nb10 = src1->nb[0];
13147
- const int nb11 = src1->nb[1];
13148
- //const int nb12 = src1->nb[2];
13149
- //const int nb13 = src1->nb[3];
13150
-
13151
- //const int nb0 = dst->nb[0];
13152
- const int nb1 = dst->nb[1];
13153
- //const int nb2 = dst->nb[2];
13154
- //const int nb3 = dst->nb[3];
12546
+ GGML_TENSOR_BINARY_OP_LOCALS;
13155
12547
 
13156
12548
  const int ith = params->ith;
13157
12549
  const int nth = params->nth;
@@ -13251,8 +12643,6 @@ static void ggml_compute_forward_conv_1d_s1_ph(
13251
12643
  }
13252
12644
  }
13253
12645
 
13254
- // ggml_compute_forward_conv_1d_s2_ph
13255
-
13256
12646
  static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
13257
12647
  const struct ggml_compute_params * params,
13258
12648
  const struct ggml_tensor * src0,
@@ -13265,36 +12655,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
13265
12655
  int64_t t0 = ggml_perf_time_us();
13266
12656
  UNUSED(t0);
13267
12657
 
13268
- const int64_t ne00 = src0->ne[0];
13269
- const int64_t ne01 = src0->ne[1];
13270
- const int64_t ne02 = src0->ne[2];
13271
- //const int64_t ne03 = src0->ne[3];
13272
-
13273
- const int64_t ne10 = src1->ne[0];
13274
- const int64_t ne11 = src1->ne[1];
13275
- //const int64_t ne12 = src1->ne[2];
13276
- //const int64_t ne13 = src1->ne[3];
13277
-
13278
- //const int64_t ne0 = dst->ne[0];
13279
- //const int64_t ne1 = dst->ne[1];
13280
- //const int64_t ne2 = dst->ne[2];
13281
- //const int64_t ne3 = dst->ne[3];
13282
- //const int64_t ne = ne0*ne1*ne2*ne3;
13283
-
13284
- const int nb00 = src0->nb[0];
13285
- const int nb01 = src0->nb[1];
13286
- const int nb02 = src0->nb[2];
13287
- //const int nb03 = src0->nb[3];
13288
-
13289
- const int nb10 = src1->nb[0];
13290
- const int nb11 = src1->nb[1];
13291
- //const int nb12 = src1->nb[2];
13292
- //const int nb13 = src1->nb[3];
13293
-
13294
- //const int nb0 = dst->nb[0];
13295
- const int nb1 = dst->nb[1];
13296
- //const int nb2 = dst->nb[2];
13297
- //const int nb3 = dst->nb[3];
12658
+ GGML_TENSOR_BINARY_OP_LOCALS;
13298
12659
 
13299
12660
  const int ith = params->ith;
13300
12661
  const int nth = params->nth;
@@ -13385,36 +12746,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32(
13385
12746
  int64_t t0 = ggml_perf_time_us();
13386
12747
  UNUSED(t0);
13387
12748
 
13388
- const int64_t ne00 = src0->ne[0];
13389
- const int64_t ne01 = src0->ne[1];
13390
- const int64_t ne02 = src0->ne[2];
13391
- //const int64_t ne03 = src0->ne[3];
13392
-
13393
- const int64_t ne10 = src1->ne[0];
13394
- const int64_t ne11 = src1->ne[1];
13395
- //const int64_t ne12 = src1->ne[2];
13396
- //const int64_t ne13 = src1->ne[3];
13397
-
13398
- //const int64_t ne0 = dst->ne[0];
13399
- //const int64_t ne1 = dst->ne[1];
13400
- //const int64_t ne2 = dst->ne[2];
13401
- //const int64_t ne3 = dst->ne[3];
13402
- //const int64_t ne = ne0*ne1*ne2*ne3;
13403
-
13404
- const int nb00 = src0->nb[0];
13405
- const int nb01 = src0->nb[1];
13406
- const int nb02 = src0->nb[2];
13407
- //const int nb03 = src0->nb[3];
13408
-
13409
- const int nb10 = src1->nb[0];
13410
- const int nb11 = src1->nb[1];
13411
- //const int nb12 = src1->nb[2];
13412
- //const int nb13 = src1->nb[3];
13413
-
13414
- //const int nb0 = dst->nb[0];
13415
- const int nb1 = dst->nb[1];
13416
- //const int nb2 = dst->nb[2];
13417
- //const int nb3 = dst->nb[3];
12749
+ GGML_TENSOR_BINARY_OP_LOCALS;
13418
12750
 
13419
12751
  const int ith = params->ith;
13420
12752
  const int nth = params->nth;
@@ -13514,6 +12846,28 @@ static void ggml_compute_forward_conv_1d_s2_ph(
13514
12846
  }
13515
12847
  }
13516
12848
 
12849
+ // ggml_compute_forward_conv_1d
12850
+
12851
+ static void ggml_compute_forward_conv_1d(
12852
+ const struct ggml_compute_params * params,
12853
+ const struct ggml_tensor * src0,
12854
+ const struct ggml_tensor * src1,
12855
+ const struct ggml_tensor * opt0,
12856
+ struct ggml_tensor * dst) {
12857
+ const int32_t s0 = ((const int32_t*)(opt0->data))[0];
12858
+ const int32_t p0 = ((const int32_t*)(opt0->data))[1];
12859
+ const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12860
+ GGML_ASSERT(d0 == 1); // dilation not supported
12861
+ GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
12862
+ if (s0 == 1) {
12863
+ ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst);
12864
+ } else if (s0 == 2) {
12865
+ ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst);
12866
+ } else {
12867
+ GGML_ASSERT(false); // only stride 1 and 2 supported
12868
+ };
12869
+ }
12870
+
13517
12871
  // ggml_compute_forward_conv_2d_sk_p0
13518
12872
 
13519
12873
  static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
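The new 1-D convolution entry point reads stride, padding and dilation as three int32 values from opt0->data. A hypothetical packing helper (illustration only, not library API) showing the order and the constraints the asserts above impose:

    #include <stdint.h>

    static void pack_conv_1d_opts(int32_t opt0_data[3], int32_t kernel_w) {
        opt0_data[0] = 1;           // s0: stride; only 1 and 2 are dispatched
        opt0_data[1] = kernel_w/2;  // p0: must equal src0->ne[0]/2 ("half" padding)
        opt0_data[2] = 1;           // d0: dilation is asserted to be 1
    }

This folds the old GGML_OP_CONV_1D_S1_PH/S2_PH pair into one op whose variant is chosen at compute time from the options tensor.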
@@ -13528,36 +12882,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13528
12882
  int64_t t0 = ggml_perf_time_us();
13529
12883
  UNUSED(t0);
13530
12884
 
13531
- const int ne00 = src0->ne[0];
13532
- const int ne01 = src0->ne[1];
13533
- const int ne02 = src0->ne[2];
13534
- //const int ne03 = src0->ne[3];
13535
-
13536
- const int ne10 = src1->ne[0];
13537
- //const int ne11 = src1->ne[1];
13538
- const int ne12 = src1->ne[2];
13539
- //const int ne13 = src1->ne[3];
13540
-
13541
- const int ne0 = dst->ne[0];
13542
- const int ne1 = dst->ne[1];
13543
- const int ne2 = dst->ne[2];
13544
- //const int ne3 = dst->ne[3];
13545
- //const int ne = ne0*ne1*ne2*ne3;
13546
-
13547
- const int nb00 = src0->nb[0];
13548
- //const int nb01 = src0->nb[1];
13549
- //const int nb02 = src0->nb[2];
13550
- const int nb03 = src0->nb[3];
13551
-
13552
- const int nb10 = src1->nb[0];
13553
- //const int nb11 = src1->nb[1];
13554
- const int nb12 = src1->nb[2];
13555
- //const int nb13 = src1->nb[3];
13556
-
13557
- //const int nb0 = dst->nb[0];
13558
- //const int nb1 = dst->nb[1];
13559
- const int nb2 = dst->nb[2];
13560
- //const int nb3 = dst->nb[3];
12885
+ GGML_TENSOR_BINARY_OP_LOCALS;
13561
12886
 
13562
12887
  const int ith = params->ith;
13563
12888
  const int nth = params->nth;
@@ -13650,6 +12975,34 @@ static void ggml_compute_forward_conv_2d_sk_p0(
13650
12975
  }
13651
12976
  }
13652
12977
 
12978
+ // ggml_compute_forward_conv_2d
12979
+
12980
+ static void ggml_compute_forward_conv_2d(
12981
+ const struct ggml_compute_params* params,
12982
+ const struct ggml_tensor* src0,
12983
+ const struct ggml_tensor* src1,
12984
+ const struct ggml_tensor* opt0,
12985
+ struct ggml_tensor* dst) {
12986
+ const int32_t s0 = ((const int32_t*)(opt0->data))[0];
12987
+ const int32_t s1 = ((const int32_t*)(opt0->data))[1];
12988
+ const int32_t p0 = ((const int32_t*)(opt0->data))[2];
12989
+ const int32_t p1 = ((const int32_t*)(opt0->data))[3];
12990
+ const int32_t d0 = ((const int32_t*)(opt0->data))[4];
12991
+ const int32_t d1 = ((const int32_t*)(opt0->data))[5];
12992
+ GGML_ASSERT(d0 == 1); // dilation not supported
12993
+ GGML_ASSERT(d1 == 1);
12994
+ GGML_ASSERT(p0 == 0); // padding not supported
12995
+ GGML_ASSERT(p1 == 0);
12996
+
12997
+ if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
12998
+ ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
12999
+ }
13000
+ else {
13001
+ GGML_ASSERT(false); // only stride equal to kernel size is supported
13002
+ };
13003
+ }
13004
+
13005
+
13653
13006
  // ggml_compute_forward_flash_attn
13654
13007
 
13655
13008
  static void ggml_compute_forward_flash_attn_f32(
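The 2-D dispatcher added above follows the same pattern with six options. Again a hypothetical helper mirroring its asserts (stride must equal the kernel size, no padding, no dilation):

    #include <stdint.h>

    static void pack_conv_2d_opts(int32_t opt0_data[6],
                                  int32_t kernel_w, int32_t kernel_h) {
        opt0_data[0] = kernel_w; // s0: must equal src0->ne[0]
        opt0_data[1] = kernel_h; // s1: must equal src0->ne[1]
        opt0_data[2] = 0;        // p0: padding is asserted to be 0
        opt0_data[3] = 0;        // p1
        opt0_data[4] = 1;        // d0: dilation is asserted to be 1
        opt0_data[5] = 1;        // d1
    }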
@@ -13662,45 +13015,14 @@ static void ggml_compute_forward_flash_attn_f32(
13662
13015
  int64_t t0 = ggml_perf_time_us();
13663
13016
  UNUSED(t0);
13664
13017
 
13665
- const int64_t neq0 = q->ne[0];
13666
- const int64_t neq1 = q->ne[1];
13667
- const int64_t neq2 = q->ne[2];
13668
- const int64_t neq3 = q->ne[3];
13669
-
13670
- const int64_t nek0 = k->ne[0];
13671
- const int64_t nek1 = k->ne[1];
13672
- //const int64_t nek2 = k->ne[2];
13673
- //const int64_t nek3 = k->ne[3];
13674
-
13675
- //const int64_t nev0 = v->ne[0];
13676
- const int64_t nev1 = v->ne[1];
13677
- //const int64_t nev2 = v->ne[2];
13678
- //const int64_t nev3 = v->ne[3];
13679
-
13680
- const int64_t ne0 = dst->ne[0];
13681
- const int64_t ne1 = dst->ne[1];
13682
- //const int64_t ne2 = dst->ne[2];
13683
- //const int64_t ne3 = dst->ne[3];
13684
-
13685
- const int nbk0 = k->nb[0];
13686
- const int nbk1 = k->nb[1];
13687
- const int nbk2 = k->nb[2];
13688
- const int nbk3 = k->nb[3];
13689
-
13690
- const int nbq0 = q->nb[0];
13691
- const int nbq1 = q->nb[1];
13692
- const int nbq2 = q->nb[2];
13693
- const int nbq3 = q->nb[3];
13694
-
13695
- const int nbv0 = v->nb[0];
13696
- const int nbv1 = v->nb[1];
13697
- const int nbv2 = v->nb[2];
13698
- const int nbv3 = v->nb[3];
13699
-
13700
- const int nb0 = dst->nb[0];
13701
- const int nb1 = dst->nb[1];
13702
- const int nb2 = dst->nb[2];
13703
- const int nb3 = dst->nb[3];
13018
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
13019
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
13020
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
13021
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
13022
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
13023
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
13024
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13025
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
13704
13026
 
13705
13027
  const int ith = params->ith;
13706
13028
  const int nth = params->nth;
@@ -13871,45 +13193,14 @@ static void ggml_compute_forward_flash_attn_f16(
13871
13193
  int64_t t0 = ggml_perf_time_us();
13872
13194
  UNUSED(t0);
13873
13195
 
13874
- const int64_t neq0 = q->ne[0];
13875
- const int64_t neq1 = q->ne[1];
13876
- const int64_t neq2 = q->ne[2];
13877
- const int64_t neq3 = q->ne[3];
13878
-
13879
- const int64_t nek0 = k->ne[0];
13880
- const int64_t nek1 = k->ne[1];
13881
- //const int64_t nek2 = k->ne[2];
13882
- //const int64_t nek3 = k->ne[3];
13883
-
13884
- //const int64_t nev0 = v->ne[0];
13885
- const int64_t nev1 = v->ne[1];
13886
- //const int64_t nev2 = v->ne[2];
13887
- //const int64_t nev3 = v->ne[3];
13888
-
13889
- const int64_t ne0 = dst->ne[0];
13890
- const int64_t ne1 = dst->ne[1];
13891
- //const int64_t ne2 = dst->ne[2];
13892
- //const int64_t ne3 = dst->ne[3];
13893
-
13894
- const int nbk0 = k->nb[0];
13895
- const int nbk1 = k->nb[1];
13896
- const int nbk2 = k->nb[2];
13897
- const int nbk3 = k->nb[3];
13898
-
13899
- const int nbq0 = q->nb[0];
13900
- const int nbq1 = q->nb[1];
13901
- const int nbq2 = q->nb[2];
13902
- const int nbq3 = q->nb[3];
13903
-
13904
- const int nbv0 = v->nb[0];
13905
- const int nbv1 = v->nb[1];
13906
- const int nbv2 = v->nb[2];
13907
- const int nbv3 = v->nb[3];
13908
-
13909
- const int nb0 = dst->nb[0];
13910
- const int nb1 = dst->nb[1];
13911
- const int nb2 = dst->nb[2];
13912
- const int nb3 = dst->nb[3];
13196
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
13197
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
13198
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
13199
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
13200
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
13201
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
13202
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13203
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
13913
13204
 
13914
13205
  const int ith = params->ith;
13915
13206
  const int nth = params->nth;
@@ -14143,65 +13434,18 @@ static void ggml_compute_forward_flash_ff_f16(
14143
13434
  int64_t t0 = ggml_perf_time_us();
14144
13435
  UNUSED(t0);
14145
13436
 
14146
- const int64_t nea0 = a->ne[0];
14147
- const int64_t nea1 = a->ne[1];
14148
- const int64_t nea2 = a->ne[2];
14149
- const int64_t nea3 = a->ne[3];
14150
-
14151
- const int64_t neb00 = b0->ne[0];
14152
- const int64_t neb01 = b0->ne[1];
14153
- //const int64_t neb02 = b0->ne[2];
14154
- //const int64_t neb03 = b0->ne[3];
14155
-
14156
- const int64_t neb10 = b1->ne[0];
14157
- const int64_t neb11 = b1->ne[1];
14158
- //const int64_t neb12 = b1->ne[2];
14159
- //const int64_t neb13 = b1->ne[3];
14160
-
14161
- const int64_t nec00 = c0->ne[0];
14162
- const int64_t nec01 = c0->ne[1];
14163
- //const int64_t nec02 = c0->ne[2];
14164
- //const int64_t nec03 = c0->ne[3];
14165
-
14166
- const int64_t nec10 = c1->ne[0];
14167
- const int64_t nec11 = c1->ne[1];
14168
- //const int64_t nec12 = c1->ne[2];
14169
- //const int64_t nec13 = c1->ne[3];
14170
-
14171
- const int64_t ne0 = dst->ne[0];
14172
- const int64_t ne1 = dst->ne[1];
14173
- const int64_t ne2 = dst->ne[2];
14174
- //const int64_t ne3 = dst->ne[3];
14175
-
14176
- const int nba0 = a->nb[0];
14177
- const int nba1 = a->nb[1];
14178
- const int nba2 = a->nb[2];
14179
- const int nba3 = a->nb[3];
14180
-
14181
- const int nbb00 = b0->nb[0];
14182
- const int nbb01 = b0->nb[1];
14183
- const int nbb02 = b0->nb[2];
14184
- const int nbb03 = b0->nb[3];
14185
-
14186
- const int nbb10 = b1->nb[0];
14187
- //const int nbb11 = b1->nb[1];
14188
- //const int nbb12 = b1->nb[2];
14189
- //const int nbb13 = b1->nb[3];
14190
-
14191
- const int nbc00 = c0->nb[0];
14192
- const int nbc01 = c0->nb[1];
14193
- const int nbc02 = c0->nb[2];
14194
- const int nbc03 = c0->nb[3];
14195
-
14196
- const int nbc10 = c1->nb[0];
14197
- //const int nbc11 = c1->nb[1];
14198
- //const int nbc12 = c1->nb[2];
14199
- //const int nbc13 = c1->nb[3];
14200
-
14201
- const int nb0 = dst->nb[0];
14202
- const int nb1 = dst->nb[1];
14203
- const int nb2 = dst->nb[2];
14204
- const int nb3 = dst->nb[3];
13437
+ GGML_TENSOR_LOCALS(int64_t, nea, a, ne);
13438
+ GGML_TENSOR_LOCALS(size_t, nba, a, nb);
13439
+ GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne);
13440
+ GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb);
13441
+ GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne);
13442
+ GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb);
13443
+ GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne);
13444
+ GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb);
13445
+ GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne);
13446
+ GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb);
13447
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13448
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
14205
13449
 
14206
13450
  const int ith = params->ith;
14207
13451
  const int nth = params->nth;
@@ -14349,55 +13593,16 @@ static void ggml_compute_forward_flash_attn_back_f32(
14349
13593
  int64_t t0 = ggml_perf_time_us();
14350
13594
  UNUSED(t0);
14351
13595
 
14352
- const int64_t neq0 = q->ne[0];
14353
- const int64_t neq1 = q->ne[1];
14354
- const int64_t neq2 = q->ne[2];
14355
- const int64_t neq3 = q->ne[3];
14356
-
14357
- const int64_t nek0 = k->ne[0];
14358
- const int64_t nek1 = k->ne[1];
14359
- //const int64_t nek2 = k->ne[2];
14360
- //const int64_t nek3 = k->ne[3];
14361
-
14362
- const int64_t nev0 = v->ne[0];
14363
- const int64_t nev1 = v->ne[1];
14364
- //const int64_t nev2 = v->ne[2];
14365
- //const int64_t nev3 = v->ne[3];
14366
-
14367
- const int64_t ned0 = d->ne[0];
14368
- const int64_t ned1 = d->ne[1];
14369
- //const int64_t ned2 = d->ne[2];
14370
- //const int64_t ned3 = d->ne[3];
14371
-
14372
- const int64_t ne0 = dst->ne[0];
14373
- const int64_t ne1 = dst->ne[1];
14374
- const int64_t ne2 = dst->ne[2];
14375
- const int64_t ne3 = dst->ne[3];
14376
-
14377
- const int nbk0 = k->nb[0];
14378
- const int nbk1 = k->nb[1];
14379
- const int nbk2 = k->nb[2];
14380
- const int nbk3 = k->nb[3];
14381
-
14382
- const int nbq0 = q->nb[0];
14383
- const int nbq1 = q->nb[1];
14384
- const int nbq2 = q->nb[2];
14385
- const int nbq3 = q->nb[3];
14386
-
14387
- const int nbv0 = v->nb[0];
14388
- const int nbv1 = v->nb[1];
14389
- const int nbv2 = v->nb[2];
14390
- const int nbv3 = v->nb[3];
14391
-
14392
- const int nbd0 = d->nb[0];
14393
- const int nbd1 = d->nb[1];
14394
- const int nbd2 = d->nb[2];
14395
- const int nbd3 = d->nb[3];
14396
-
14397
- const int nb0 = dst->nb[0];
14398
- const int nb1 = dst->nb[1];
14399
- const int nb2 = dst->nb[2];
14400
- const int nb3 = dst->nb[3];
13596
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
13597
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
13598
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
13599
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
13600
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
13601
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
13602
+ GGML_TENSOR_LOCALS(int64_t, ned, d, ne);
13603
+ GGML_TENSOR_LOCALS(size_t, nbd, d, nb);
13604
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
13605
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
14401
13606
 
14402
13607
  const int ith = params->ith;
14403
13608
  const int nth = params->nth;
@@ -14755,15 +13960,8 @@ static void ggml_compute_forward_win_part_f32(
14755
13960
  return;
14756
13961
  }
14757
13962
 
14758
- const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
14759
- const int64_t ne01 = src0->ne[1];
14760
- const int64_t ne02 = src0->ne[2];
14761
- const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
14762
-
14763
- const int64_t ne0 = dst->ne[0];
14764
- const int64_t ne1 = dst->ne[1];
14765
- const int64_t ne2 = dst->ne[2];
14766
- const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
13963
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
13964
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14767
13965
 
14768
13966
  const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14769
13967
  const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
@@ -14826,14 +14024,8 @@ static void ggml_compute_forward_win_unpart_f32(
14826
14024
  return;
14827
14025
  }
14828
14026
 
14829
- const int64_t ne00 = src0->ne[0];
14830
- const int64_t ne01 = src0->ne[1];
14831
- const int64_t ne02 = src0->ne[2];
14832
- //const int64_t ne03 = src0->ne[3];
14833
-
14834
- const int64_t ne0 = dst->ne[0];
14835
- const int64_t ne1 = dst->ne[1];
14836
- const int64_t ne2 = dst->ne[2];
14027
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14028
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14837
14029
 
14838
14030
  const int32_t w = ((const int32_t *)(opt0->data))[0];
14839
14031
 
@@ -15431,6 +14623,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15431
14623
  {
15432
14624
  ggml_compute_forward_mean(params, tensor->src0, tensor);
15433
14625
  } break;
14626
+ case GGML_OP_ARGMAX:
14627
+ {
14628
+ ggml_compute_forward_argmax(params, tensor->src0, tensor);
14629
+ } break;
15434
14630
  case GGML_OP_REPEAT:
15435
14631
  {
15436
14632
  ggml_compute_forward_repeat(params, tensor->src0, tensor);
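The body of ggml_compute_forward_argmax lies outside this diff; going by the op's name, the per-row reduction it performs is presumably of this shape (sketch):

    // index of the largest element in one row of n floats
    static int argmax_row_f32(const int n, const float * x) {
        int best = 0;
        for (int i = 1; i < n; i++) {
            if (x[i] > x[best]) {
                best = i;
            }
        }
        return best;
    }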
@@ -15455,6 +14651,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15455
14651
  {
15456
14652
  ggml_compute_forward_step(params, tensor->src0, tensor);
15457
14653
  } break;
14654
+ case GGML_OP_TANH:
14655
+ {
14656
+ ggml_compute_forward_tanh(params, tensor->src0, tensor);
14657
+ } break;
14658
+ case GGML_OP_ELU:
14659
+ {
14660
+ ggml_compute_forward_elu(params, tensor->src0, tensor);
14661
+ } break;
15458
14662
  case GGML_OP_RELU:
15459
14663
  {
15460
14664
  ggml_compute_forward_relu(params, tensor->src0, tensor);
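Likewise for the two new unary ops. The standard ELU definition (alpha = 1) applied elementwise over one contiguous row would look like this; a sketch only, since the real kernels (outside this diff) walk rows via the tensor's nb strides:

    #include <math.h>

    static void elu_row_f32(const int n, float * y, const float * x) {
        for (int i = 0; i < n; i++) {
            y[i] = x[i] > 0.0f ? x[i] : expm1f(x[i]); // exp(x) - 1 for x <= 0
        }
    }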
@@ -15571,17 +14775,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15571
14775
  {
15572
14776
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
15573
14777
  } break;
15574
- case GGML_OP_CONV_1D_S1_PH:
15575
- {
15576
- ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
15577
- } break;
15578
- case GGML_OP_CONV_1D_S2_PH:
14778
+ case GGML_OP_CONV_1D:
15579
14779
  {
15580
- ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
14780
+ ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
15581
14781
  } break;
15582
- case GGML_OP_CONV_2D_SK_P0:
14782
+ case GGML_OP_CONV_2D:
15583
14783
  {
15584
- ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
14784
+ ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
15585
14785
  } break;
15586
14786
  case GGML_OP_FLASH_ATTN:
15587
14787
  {
@@ -15830,6 +15030,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15830
15030
  }
15831
15031
  } break;
15832
15032
  case GGML_OP_MEAN:
15033
+ case GGML_OP_ARGMAX:
15833
15034
  {
15834
15035
  GGML_ASSERT(false); // TODO: implement
15835
15036
  } break;
@@ -15883,6 +15084,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15883
15084
  // noop
15884
15085
  }
15885
15086
  } break;
15087
+ case GGML_OP_TANH:
15088
+ {
15089
+ GGML_ASSERT(false); // TODO: not implemented
15090
+ } break;
15091
+ case GGML_OP_ELU:
15092
+ {
15093
+ GGML_ASSERT(false); // TODO: not implemented
15094
+ } break;
15886
15095
  case GGML_OP_RELU:
15887
15096
  {
15888
15097
  if (src0->grad) {
@@ -15902,14 +15111,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15902
15111
  {
15903
15112
  GGML_ASSERT(false); // TODO: not implemented
15904
15113
  } break;
15905
- case GGML_OP_ALIBI:
15906
- {
15907
- GGML_ASSERT(false); // TODO: not implemented
15908
- } break;
15909
- case GGML_OP_CLAMP:
15910
- {
15911
- GGML_ASSERT(false); // TODO: not implemented
15912
- } break;
15913
15114
  case GGML_OP_SILU:
15914
15115
  {
15915
15116
  // necessary for llama
@@ -16226,7 +15427,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16226
15427
  // necessary for llama
16227
15428
  if (src0->grad) {
16228
15429
  assert(src1->type == GGML_TYPE_I32);
16229
- assert(ggml_nelements(src1) == 3);
15430
+ assert(ggml_nelements(src1) == 4);
16230
15431
  const int n_past = ((int32_t *) src1->data)[0];
16231
15432
  const int n_dims = ((int32_t *) src1->data)[1];
16232
15433
  const int mode = ((int32_t *) src1->data)[2];
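The ROPE parameter tensor grows from three to four int32 values; only the first three are read in this hunk. A hypothetical packing helper matching the new assertion (the fourth slot carries the context length in upstream ggml of this period, but that is an inference, not shown here):

    #include <stdint.h>

    static void pack_rope_params(int32_t p[4],
                                 int32_t n_past, int32_t n_dims,
                                 int32_t mode,   int32_t n_ctx) {
        p[0] = n_past;
        p[1] = n_dims;
        p[2] = mode;
        p[3] = n_ctx; // new 4th element, hence assert(ggml_nelements(src1) == 4)
    }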
@@ -16266,15 +15467,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16266
15467
  // noop
16267
15468
  }
16268
15469
  } break;
16269
- case GGML_OP_CONV_1D_S1_PH:
15470
+ case GGML_OP_ALIBI:
15471
+ {
15472
+ GGML_ASSERT(false); // TODO: not implemented
15473
+ } break;
15474
+ case GGML_OP_CLAMP:
16270
15475
  {
16271
15476
  GGML_ASSERT(false); // TODO: not implemented
16272
15477
  } break;
16273
- case GGML_OP_CONV_1D_S2_PH:
15478
+ case GGML_OP_CONV_1D:
16274
15479
  {
16275
15480
  GGML_ASSERT(false); // TODO: not implemented
16276
15481
  } break;
16277
- case GGML_OP_CONV_2D_SK_P0:
15482
+ case GGML_OP_CONV_2D:
16278
15483
  {
16279
15484
  GGML_ASSERT(false); // TODO: not implemented
16280
15485
  } break;
@@ -16791,9 +15996,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16791
15996
  if (node_n != -1) {
16792
15997
  /* FINALIZE */
16793
15998
  struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16794
- params.nth = node->n_tasks;
16795
- ggml_compute_forward(&params, node);
16796
- ggml_graph_compute_perf_stats_node(node, state->shared);
15999
+ if (GGML_OP_HAS_FINALIZE[node->op]) {
16000
+ params.nth = node->n_tasks;
16001
+ ggml_compute_forward(&params, node);
16002
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16003
+ }
16797
16004
  }
16798
16005
 
16799
16006
  // distribute new work or execute it direct if 1T
@@ -16805,10 +16012,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16805
16012
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16806
16013
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
16807
16014
 
16015
+ params.nth = node->n_tasks;
16016
+
16808
16017
  /* INIT */
16809
- params.type = GGML_TASK_INIT;
16810
- params.nth = node->n_tasks;
16811
- ggml_compute_forward(&params, node);
16018
+ if (GGML_OP_HAS_INIT[node->op]) {
16019
+ params.type = GGML_TASK_INIT;
16020
+ ggml_compute_forward(&params, node);
16021
+ }
16812
16022
 
16813
16023
  if (node->n_tasks == 1) {
16814
16024
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
@@ -16816,9 +16026,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16816
16026
  params.type = GGML_TASK_COMPUTE;
16817
16027
  ggml_compute_forward(&params, node);
16818
16028
 
16819
- params.type = GGML_TASK_FINALIZE;
16820
- ggml_compute_forward(&params, node);
16821
- ggml_graph_compute_perf_stats_node(node, state->shared);
16029
+ if (GGML_OP_HAS_FINALIZE[node->op]) {
16030
+ params.type = GGML_TASK_FINALIZE;
16031
+ ggml_compute_forward(&params, node);
16032
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16033
+ }
16822
16034
  } else {
16823
16035
  break;
16824
16036
  }
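GGML_OP_HAS_INIT and GGML_OP_HAS_FINALIZE are per-op lookup tables defined outside this hunk; gating on them lets the scheduler skip whole phases for ops that never use them, instead of dispatching a no-op INIT or FINALIZE through ggml_compute_forward. A sketch of tables of that shape:

    #include <stdbool.h>

    enum toy_op { TOY_OP_ADD, TOY_OP_MUL_MAT, TOY_OP_COUNT };

    // illustrative values: mul_mat has an INIT phase (src1 conversion into
    // wdata, per the hunks earlier in this diff), plain add has none
    static const bool TOY_OP_HAS_INIT[TOY_OP_COUNT] = {
        [TOY_OP_ADD]     = false,
        [TOY_OP_MUL_MAT] = true,
    };

    static const bool TOY_OP_HAS_FINALIZE[TOY_OP_COUNT] = {
        [TOY_OP_ADD]     = false,
        [TOY_OP_MUL_MAT] = false,
    };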
@@ -16924,12 +16136,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16924
16136
  case GGML_OP_SUM:
16925
16137
  case GGML_OP_SUM_ROWS:
16926
16138
  case GGML_OP_MEAN:
16139
+ case GGML_OP_ARGMAX:
16927
16140
  case GGML_OP_REPEAT:
16928
16141
  case GGML_OP_REPEAT_BACK:
16929
16142
  case GGML_OP_ABS:
16930
16143
  case GGML_OP_SGN:
16931
16144
  case GGML_OP_NEG:
16932
16145
  case GGML_OP_STEP:
16146
+ case GGML_OP_TANH:
16147
+ case GGML_OP_ELU:
16933
16148
  case GGML_OP_RELU:
16934
16149
  {
16935
16150
  node->n_tasks = 1;
@@ -16958,6 +16173,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16958
16173
  //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
16959
16174
 
16960
16175
  size_t cur = 0;
16176
+ const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
16961
16177
 
16962
16178
  #if defined(GGML_USE_CUBLAS)
16963
16179
  if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
@@ -16973,39 +16189,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16973
16189
  }
16974
16190
  else
16975
16191
  #endif
16976
- if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
16977
16192
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16978
- if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16979
- node->n_tasks = 1; // TODO: this actually is doing nothing
16980
- // the threads are still spinning
16193
+ if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16194
+ node->n_tasks = 1; // TODO: this actually is doing nothing
16195
+ // the threads are still spinning
16196
+ if (node->src0->type != GGML_TYPE_F32) {
16981
16197
  // here we need memory just for single 2D matrix from src0
16982
16198
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
16983
- } else {
16984
- cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
16985
- }
16986
- #else
16987
- cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
16988
- #endif
16989
- } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
16990
- cur = 0;
16991
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16992
- if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16993
- node->n_tasks = 1;
16994
16199
  }
16200
+ } else
16995
16201
  #endif
16996
- } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
16997
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16998
- if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16999
- node->n_tasks = 1;
17000
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
17001
- } else
17002
- #endif
17003
- {
17004
- const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type;
17005
- cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q];
17006
- }
16202
+ if (node->src1->type != vec_dot_type) {
16203
+ cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type];
17007
16204
  } else {
17008
- GGML_ASSERT(false);
16205
+ cur = 0;
17009
16206
  }
17010
16207
 
17011
16208
  work_size = MAX(work_size, cur);
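With the traits table, scratch sizing for mul_mat collapses to one rule: if src1 is not already in src0's vec_dot_type, reserve room for one converted copy of src1, otherwise nothing. A worked example with assumed F16 values, consistent with the F16 branch deleted above:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // src0 is F16, whose vec_dot_type is F16 (2 bytes, block size 1);
        // src1 arrives as F32 and must be converted into wdata first
        const int64_t src1_elems = 512*64; // stand-in for ggml_nelements(node->src1)
        const size_t  type_size  = 2;      // GGML_TYPE_SIZE[GGML_TYPE_F16]
        const int64_t blck_size  = 1;      // GGML_BLCK_SIZE[GGML_TYPE_F16]

        const size_t cur = type_size*src1_elems/blck_size;
        printf("%zu bytes of scratch\n", cur); // 65536
        return 0;
    }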
@@ -17043,8 +16240,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
17043
16240
  {
17044
16241
  node->n_tasks = 1; //TODO
17045
16242
  } break;
17046
- case GGML_OP_CONV_1D_S1_PH:
17047
- case GGML_OP_CONV_1D_S2_PH:
16243
+ case GGML_OP_CONV_1D:
17048
16244
  {
17049
16245
  node->n_tasks = n_threads;
17050
16246
 
@@ -17073,7 +16269,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
17073
16269
 
17074
16270
  work_size = MAX(work_size, cur);
17075
16271
  } break;
17076
- case GGML_OP_CONV_2D_SK_P0:
16272
+ case GGML_OP_CONV_2D:
17077
16273
  {
17078
16274
  node->n_tasks = n_threads;
17079
16275
 
@@ -17435,13 +16631,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17435
16631
  fwrite(&nb, sizeof(uint64_t), 1, fout);
17436
16632
  }
17437
16633
 
17438
- // store the pointer address
17439
- {
17440
- const uint64_t ptr = (uint64_t) tensor->data;
17441
-
17442
- fwrite(&ptr, sizeof(uint64_t), 1, fout);
17443
- }
17444
-
17445
16634
  fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
17446
16635
 
17447
16636
  // dump the data
@@ -17475,13 +16664,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17475
16664
  fwrite(&nb, sizeof(uint64_t), 1, fout);
17476
16665
  }
17477
16666
 
17478
- // store the pointer address
17479
- {
17480
- const uint64_t ptr = (uint64_t) tensor->data;
17481
-
17482
- fwrite(&ptr, sizeof(uint64_t), 1, fout);
17483
- }
17484
-
17485
16667
  fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
17486
16668
 
17487
16669
  // output the op arguments
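Both tensor records drop the serialized pointer address here, and the import side below drops the matching reads. A raw data pointer is only meaningful inside the process that wrote it (and typically changes run to run under address-space randomization), so removing the field loses nothing and makes the exported file deterministic. A two-line demonstration of why the old field carried no portable information:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int x = 42;
        const uint64_t ptr = (uint64_t)(uintptr_t) &x; // what the old exporter wrote
        printf("0x%llx\n", (unsigned long long) ptr);  // varies between runs/processes
        return 0;
    }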
@@ -17666,8 +16848,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17666
16848
 
17667
16849
  tensor->op = (enum ggml_op) op;
17668
16850
 
17669
- uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
17670
-
17671
16851
  memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
17672
16852
 
17673
16853
  tensor->data = (void *) ptr;
@@ -17713,8 +16893,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17713
16893
  nb[j] = nb_cur;
17714
16894
  }
17715
16895
 
17716
- uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
17717
-
17718
16896
  const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
17719
16897
 
17720
16898
  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);