llama_cpp 0.3.8 → 0.5.0

@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
123
123
  #define GGML_GELU_FP16
124
124
  #define GGML_GELU_QUICK_FP16
125
125
  #define GGML_SILU_FP16
126
+ // #define GGML_CROSS_ENTROPY_EXP_FP16
127
+ // #define GGML_FLASH_ATTN_EXP_FP16
126
128
 
127
129
  #define GGML_SOFT_MAX_UNROLL 4
128
130
  #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +159,6 @@ typedef void * thread_ret_t;
157
159
  //#define GGML_SOFT_MAX_ACCELERATE
158
160
  #endif
159
161
 
160
- #if UINTPTR_MAX == 0xFFFFFFFF
161
- #define GGML_MEM_ALIGN 4
162
- #else
163
- #define GGML_MEM_ALIGN 16
164
- #endif
165
-
166
162
  //
167
163
  // logging
168
164
  //
@@ -192,8 +188,8 @@ typedef void * thread_ret_t;
192
188
  //
193
189
 
194
190
  #if defined(_MSC_VER) || defined(__MINGW32__)
195
- #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
196
- #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
191
+ #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
192
+ #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
197
193
  #else
198
194
  inline static void * ggml_aligned_malloc(size_t size) {
199
195
  void * aligned_memory = NULL;
@@ -213,14 +209,13 @@ inline static void * ggml_aligned_malloc(size_t size) {
213
209
  error_desc = "insufficient memory";
214
210
  break;
215
211
  }
216
- GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
217
- __func__, error_desc, size/(1024.0*1024.0));
212
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
218
213
  return NULL;
219
214
  }
220
215
  return aligned_memory;
221
216
  }
222
- #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
223
- #define GGML_ALIGNED_FREE(ptr) free(ptr)
217
+ #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
218
+ #define GGML_ALIGNED_FREE(ptr) free(ptr)
224
219
  #endif
225
220
 
226
221
  #define UNUSED GGML_UNUSED
@@ -306,6 +301,10 @@ typedef double ggml_float;
306
301
  #endif
307
302
  #endif
308
303
 
304
+ #ifdef __riscv_v_intrinsic
305
+ #include <riscv_vector.h>
306
+ #endif
307
+
309
308
  #ifdef __F16C__
310
309
 
311
310
  #ifdef _MSC_VER
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
1643
1642
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1644
1643
 
1645
1644
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1645
+ [GGML_TYPE_I8] = {
1646
+ .type_name = "i8",
1647
+ .blck_size = 1,
1648
+ .type_size = sizeof(int8_t),
1649
+ .is_quantized = false,
1650
+ },
1651
+ [GGML_TYPE_I16] = {
1652
+ .type_name = "i16",
1653
+ .blck_size = 1,
1654
+ .type_size = sizeof(int16_t),
1655
+ .is_quantized = false,
1656
+ },
1657
+ [GGML_TYPE_I32] = {
1658
+ .type_name = "i32",
1659
+ .blck_size = 1,
1660
+ .type_size = sizeof(int32_t),
1661
+ .is_quantized = false,
1662
+ },
1646
1663
  [GGML_TYPE_F32] = {
1664
+ .type_name = "f32",
1665
+ .blck_size = 1,
1666
+ .type_size = sizeof(float),
1667
+ .is_quantized = false,
1647
1668
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
1648
1669
  .vec_dot_type = GGML_TYPE_F32,
1649
1670
  },
1650
1671
  [GGML_TYPE_F16] = {
1672
+ .type_name = "f16",
1673
+ .blck_size = 1,
1674
+ .type_size = sizeof(ggml_fp16_t),
1675
+ .is_quantized = false,
1651
1676
  .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
1652
1677
  .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
1653
1678
  .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1655
1680
  .vec_dot_type = GGML_TYPE_F16,
1656
1681
  },
1657
1682
  [GGML_TYPE_Q4_0] = {
1683
+ .type_name = "q4_0",
1684
+ .blck_size = QK4_0,
1685
+ .type_size = sizeof(block_q4_0),
1686
+ .is_quantized = true,
1658
1687
  .to_float = (ggml_to_float_t) dequantize_row_q4_0,
1659
1688
  .from_float = quantize_row_q4_0,
1660
1689
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1662
1691
  .vec_dot_type = GGML_TYPE_Q8_0,
1663
1692
  },
1664
1693
  [GGML_TYPE_Q4_1] = {
1694
+ .type_name = "q4_1",
1695
+ .blck_size = QK4_1,
1696
+ .type_size = sizeof(block_q4_1),
1697
+ .is_quantized = true,
1665
1698
  .to_float = (ggml_to_float_t) dequantize_row_q4_1,
1666
1699
  .from_float = quantize_row_q4_1,
1667
1700
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1669
1702
  .vec_dot_type = GGML_TYPE_Q8_1,
1670
1703
  },
1671
1704
  [GGML_TYPE_Q5_0] = {
1705
+ .type_name = "q5_0",
1706
+ .blck_size = QK5_0,
1707
+ .type_size = sizeof(block_q5_0),
1708
+ .is_quantized = true,
1672
1709
  .to_float = (ggml_to_float_t) dequantize_row_q5_0,
1673
1710
  .from_float = quantize_row_q5_0,
1674
1711
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1676
1713
  .vec_dot_type = GGML_TYPE_Q8_0,
1677
1714
  },
1678
1715
  [GGML_TYPE_Q5_1] = {
1716
+ .type_name = "q5_1",
1717
+ .blck_size = QK5_1,
1718
+ .type_size = sizeof(block_q5_1),
1719
+ .is_quantized = true,
1679
1720
  .to_float = (ggml_to_float_t) dequantize_row_q5_1,
1680
1721
  .from_float = quantize_row_q5_1,
1681
1722
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1683
1724
  .vec_dot_type = GGML_TYPE_Q8_1,
1684
1725
  },
1685
1726
  [GGML_TYPE_Q8_0] = {
1727
+ .type_name = "q8_0",
1728
+ .blck_size = QK8_0,
1729
+ .type_size = sizeof(block_q8_0),
1730
+ .is_quantized = true,
1686
1731
  .to_float = dequantize_row_q8_0,
1687
1732
  .from_float = quantize_row_q8_0,
1688
1733
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1690
1735
  .vec_dot_type = GGML_TYPE_Q8_0,
1691
1736
  },
1692
1737
  [GGML_TYPE_Q8_1] = {
1738
+ .type_name = "q8_1",
1739
+ .blck_size = QK8_1,
1740
+ .type_size = sizeof(block_q8_1),
1741
+ .is_quantized = true,
1693
1742
  .from_float = quantize_row_q8_1,
1694
1743
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
1695
1744
  .vec_dot_type = GGML_TYPE_Q8_1,
1696
1745
  },
1697
1746
  #ifdef GGML_USE_K_QUANTS
1698
1747
  [GGML_TYPE_Q2_K] = {
1748
+ .type_name = "q2_K",
1749
+ .blck_size = QK_K,
1750
+ .type_size = sizeof(block_q2_K),
1751
+ .is_quantized = true,
1699
1752
  .to_float = (ggml_to_float_t) dequantize_row_q2_K,
1700
1753
  .from_float = quantize_row_q2_K,
1701
1754
  .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1703
1756
  .vec_dot_type = GGML_TYPE_Q8_K,
1704
1757
  },
1705
1758
  [GGML_TYPE_Q3_K] = {
1759
+ .type_name = "q3_K",
1760
+ .blck_size = QK_K,
1761
+ .type_size = sizeof(block_q3_K),
1762
+ .is_quantized = true,
1706
1763
  .to_float = (ggml_to_float_t) dequantize_row_q3_K,
1707
1764
  .from_float = quantize_row_q3_K,
1708
1765
  .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1710
1767
  .vec_dot_type = GGML_TYPE_Q8_K,
1711
1768
  },
1712
1769
  [GGML_TYPE_Q4_K] = {
1770
+ .type_name = "q4_K",
1771
+ .blck_size = QK_K,
1772
+ .type_size = sizeof(block_q4_K),
1773
+ .is_quantized = true,
1713
1774
  .to_float = (ggml_to_float_t) dequantize_row_q4_K,
1714
1775
  .from_float = quantize_row_q4_K,
1715
1776
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1717
1778
  .vec_dot_type = GGML_TYPE_Q8_K,
1718
1779
  },
1719
1780
  [GGML_TYPE_Q5_K] = {
1781
+ .type_name = "q5_K",
1782
+ .blck_size = QK_K,
1783
+ .type_size = sizeof(block_q5_K),
1784
+ .is_quantized = true,
1720
1785
  .to_float = (ggml_to_float_t) dequantize_row_q5_K,
1721
1786
  .from_float = quantize_row_q5_K,
1722
1787
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1724
1789
  .vec_dot_type = GGML_TYPE_Q8_K,
1725
1790
  },
1726
1791
  [GGML_TYPE_Q6_K] = {
1792
+ .type_name = "q6_K",
1793
+ .blck_size = QK_K,
1794
+ .type_size = sizeof(block_q6_K),
1795
+ .is_quantized = true,
1727
1796
  .to_float = (ggml_to_float_t) dequantize_row_q6_K,
1728
1797
  .from_float = quantize_row_q6_K,
1729
1798
  .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1731
1800
  .vec_dot_type = GGML_TYPE_Q8_K,
1732
1801
  },
1733
1802
  [GGML_TYPE_Q8_K] = {
1803
+ .type_name = "q8_K",
1804
+ .blck_size = QK_K,
1805
+ .type_size = sizeof(block_q8_K),
1806
+ .is_quantized = true,
1734
1807
  .from_float = quantize_row_q8_K,
1735
1808
  }
1736
1809
  #endif
1737
1810
  };
1738
1811
 
1739
1812
  // For internal test use
1740
- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
1741
- GGML_ASSERT(i < GGML_TYPE_COUNT);
1742
- return type_traits[i];
1813
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
1814
+ GGML_ASSERT(type < GGML_TYPE_COUNT);
1815
+ return type_traits[type];
1743
1816
  }
1744
1817
 
1745
1818
 
@@ -2363,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2363
2436
  const int nb = n / qk;
2364
2437
 
2365
2438
  assert(n % qk == 0);
2366
- assert(nb % 2 == 0);
2367
2439
 
2368
2440
  const block_q4_0 * restrict x = vx;
2369
2441
  const block_q8_0 * restrict y = vy;
@@ -2372,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2372
2444
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
2373
2445
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
2374
2446
 
2447
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2375
2448
  for (int i = 0; i < nb; i += 2) {
2376
2449
  const block_q4_0 * restrict x0 = &x[i + 0];
2377
2450
  const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2550,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2550
2623
  }
2551
2624
 
2552
2625
  // Main loop
2626
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2553
2627
  for (int i = 2; i < nb; i+=2) {
2554
2628
  _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
2555
2629
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2607,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2607
2681
  }
2608
2682
 
2609
2683
  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
2684
+ #elif defined(__riscv_v_intrinsic)
2685
+ float sumf = 0.0;
2686
+
2687
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2688
+
2689
+ for (int i = 0; i < nb; i++) {
2690
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2691
+
2692
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2693
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2694
+
2695
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2696
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2697
+
2698
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2699
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2700
+
2701
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
2702
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
2703
+
2704
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2705
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2706
+
2707
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2708
+
2709
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2710
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2711
+
2712
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2713
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2714
+
2715
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2716
+ }
2717
+
2718
+ *s = sumf;
2610
2719
  #else
2611
2720
  // scalar
2612
2721
  float sumf = 0.0;
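// Sketch of the per-block arithmetic the new RISC-V vector path above reproduces
// (the same math as this scalar branch): unpack the two 4-bit halves of each q4_0
// byte, recenter them by -8, dot them with the matching q8_0 halves, then scale by
// the product of the two block deltas. Block layout as defined earlier in this
// file; `i`, `qk`, `x`, `y` are the variables already in scope here.
//
//     int sumi = 0;
//     for (int j = 0; j < qk/2; ++j) {
//         const int v0 = (x[i].qs[j] & 0x0F) - 8;  // low nibble
//         const int v1 = (x[i].qs[j] >>   4) - 8;  // high nibble
//         sumi += v0*y[i].qs[j] + v1*y[i].qs[j + qk/2];
//     }
//     sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);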
@@ -2633,7 +2742,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2633
2742
  const int nb = n / qk;
2634
2743
 
2635
2744
  assert(n % qk == 0);
2636
- assert(nb % 2 == 0);
2637
2745
 
2638
2746
  const block_q4_1 * restrict x = vx;
2639
2747
  const block_q8_1 * restrict y = vy;
@@ -2645,6 +2753,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2645
2753
 
2646
2754
  float summs = 0;
2647
2755
 
2756
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2648
2757
  for (int i = 0; i < nb; i += 2) {
2649
2758
  const block_q4_1 * restrict x0 = &x[i + 0];
2650
2759
  const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2733,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2733
2842
  }
2734
2843
 
2735
2844
  *s = hsum_float_8(acc) + summs;
2845
+ #elif defined(__riscv_v_intrinsic)
2846
+ float sumf = 0.0;
2847
+
2848
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2849
+
2850
+ for (int i = 0; i < nb; i++) {
2851
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2852
+
2853
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2854
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2855
+
2856
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2857
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2858
+
2859
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2860
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2861
+
2862
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2863
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2864
+
2865
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2866
+
2867
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2868
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2869
+
2870
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2871
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2872
+
2873
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2874
+ }
2875
+
2876
+ *s = sumf;
2736
2877
  #else
2737
2878
  // scalar
2738
2879
  float sumf = 0.0;
@@ -2759,7 +2900,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2759
2900
  const int nb = n / qk;
2760
2901
 
2761
2902
  assert(n % qk == 0);
2762
- assert(nb % 2 == 0);
2763
2903
  assert(qk == QK5_0);
2764
2904
 
2765
2905
  const block_q5_0 * restrict x = vx;
@@ -2775,6 +2915,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2775
2915
  uint64_t tmp0[4];
2776
2916
  uint64_t tmp1[4];
2777
2917
 
2918
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2778
2919
  for (int i = 0; i < nb; i += 2) {
2779
2920
  const block_q5_0 * restrict x0 = &x[i];
2780
2921
  const block_q5_0 * restrict x1 = &x[i + 1];
@@ -2967,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2967
3108
  }
2968
3109
 
2969
3110
  *s = hsum_float_8(acc);
3111
+ #elif defined(__riscv_v_intrinsic)
3112
+ float sumf = 0.0;
3113
+
3114
+ uint32_t qh;
3115
+
3116
+ // These temp values are for masking and shift operations
3117
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3118
+ uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
3119
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
3120
+
3121
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3122
+
3123
+ for (int i = 0; i < nb; i++) {
3124
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3125
+
3126
+ // temporary registers
3127
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
3128
+ vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
3129
+ vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
3130
+ vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
3131
+
3132
+ // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
3133
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
3134
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
3135
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3136
+
3137
+ // ((qh & (1u << (j + 16))) >> (j + 12));
3138
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
3139
+ vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
3140
+
3141
+ // narrowing
3142
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
3143
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3144
+
3145
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
3146
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3147
+
3148
+ // load
3149
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3150
+
3151
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3152
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3153
+
3154
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3155
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3156
+
3157
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3158
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3159
+
3160
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3161
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3162
+
3163
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
3164
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
3165
+
3166
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3167
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3168
+
3169
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3170
+
3171
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3172
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3173
+
3174
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3175
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3176
+
3177
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
3178
+ }
3179
+
3180
+ *s = sumf;
2970
3181
  #else
2971
3182
  // scalar
2972
3183
  float sumf = 0.0;
@@ -2999,7 +3210,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2999
3210
  const int nb = n / qk;
3000
3211
 
3001
3212
  assert(n % qk == 0);
3002
- assert(nb % 2 == 0);
3003
3213
  assert(qk == QK5_1);
3004
3214
 
3005
3215
  const block_q5_1 * restrict x = vx;
@@ -3018,6 +3228,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3018
3228
  uint64_t tmp0[4];
3019
3229
  uint64_t tmp1[4];
3020
3230
 
3231
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3021
3232
  for (int i = 0; i < nb; i += 2) {
3022
3233
  const block_q5_1 * restrict x0 = &x[i];
3023
3234
  const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3223,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3223
3434
  }
3224
3435
 
3225
3436
  *s = hsum_float_8(acc) + summs;
3437
+ #elif defined(__riscv_v_intrinsic)
3438
+ float sumf = 0.0;
3439
+
3440
+ uint32_t qh;
3441
+
3442
+ // These temp values are for shift operations
3443
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3444
+
3445
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3446
+
3447
+ for (int i = 0; i < nb; i++) {
3448
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3449
+
3450
+ // temporary registers
3451
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
3452
+ vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
3453
+
3454
+ // load qh
3455
+ vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
3456
+
3457
+ // ((qh >> (j + 0)) << 4) & 0x10;
3458
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
3459
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3460
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
3461
+
3462
+ // ((qh >> (j + 12)) ) & 0x10;
3463
+ vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
3464
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
3465
+
3466
+ // narrowing
3467
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
3468
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3469
+
3470
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
3471
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3472
+
3473
+ // load
3474
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3475
+
3476
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3477
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3478
+
3479
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3480
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3481
+
3482
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3483
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3484
+
3485
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3486
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3487
+
3488
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3489
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3490
+
3491
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3492
+
3493
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3494
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3495
+
3496
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3497
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3498
+
3499
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
3500
+ }
3501
+
3502
+ *s = sumf;
3226
3503
  #else
3227
3504
  // scalar
3228
3505
  float sumf = 0.0;
@@ -3255,7 +3532,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3255
3532
  const int nb = n / qk;
3256
3533
 
3257
3534
  assert(n % qk == 0);
3258
- assert(nb % 2 == 0);
3259
3535
 
3260
3536
  const block_q8_0 * restrict x = vx;
3261
3537
  const block_q8_0 * restrict y = vy;
@@ -3264,6 +3540,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3264
3540
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3265
3541
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3266
3542
 
3543
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3267
3544
  for (int i = 0; i < nb; i += 2) {
3268
3545
  const block_q8_0 * restrict x0 = &x[i + 0];
3269
3546
  const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3334,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3334
3611
  }
3335
3612
 
3336
3613
  *s = hsum_float_8(acc);
3614
+ #elif defined(__riscv_v_intrinsic)
3615
+ float sumf = 0.0;
3616
+ size_t vl = __riscv_vsetvl_e8m1(qk);
3617
+
3618
+ for (int i = 0; i < nb; i++) {
3619
+ // load elements
3620
+ vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
3621
+ vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
3622
+
3623
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
3624
+
3625
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
3626
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
3627
+
3628
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
3629
+
3630
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3631
+ }
3632
+
3633
+ *s = sumf;
3337
3634
  #else
3338
3635
  // scalar
3339
3636
  float sumf = 0.0;
@@ -3481,9 +3778,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
3481
3778
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
3482
3779
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
3483
3780
 
3484
- static const float GELU_COEF_A = 0.044715f;
3485
- static const float GELU_QUICK_COEF = -1.702f;
3486
- static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3781
+ static const float GELU_COEF_A = 0.044715f;
3782
+ static const float GELU_QUICK_COEF = -1.702f;
3783
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3487
3784
 
3488
3785
  inline static float ggml_gelu_f32(float x) {
3489
3786
  return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -3652,95 +3949,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
3652
3949
  // data types
3653
3950
  //
3654
3951
 
3655
- static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
3656
- [GGML_TYPE_F32] = 1,
3657
- [GGML_TYPE_F16] = 1,
3658
- [GGML_TYPE_Q4_0] = QK4_0,
3659
- [GGML_TYPE_Q4_1] = QK4_1,
3660
- [GGML_TYPE_Q5_0] = QK5_0,
3661
- [GGML_TYPE_Q5_1] = QK5_1,
3662
- [GGML_TYPE_Q8_0] = QK8_0,
3663
- [GGML_TYPE_Q8_1] = QK8_1,
3664
- #ifdef GGML_USE_K_QUANTS
3665
- [GGML_TYPE_Q2_K] = QK_K,
3666
- [GGML_TYPE_Q3_K] = QK_K,
3667
- [GGML_TYPE_Q4_K] = QK_K,
3668
- [GGML_TYPE_Q5_K] = QK_K,
3669
- [GGML_TYPE_Q6_K] = QK_K,
3670
- [GGML_TYPE_Q8_K] = QK_K,
3671
- #endif
3672
- [GGML_TYPE_I8] = 1,
3673
- [GGML_TYPE_I16] = 1,
3674
- [GGML_TYPE_I32] = 1,
3675
- };
3676
- static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
3677
-
3678
- static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3679
- [GGML_TYPE_F32] = sizeof(float),
3680
- [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
3681
- [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
3682
- [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
3683
- [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
3684
- [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
3685
- [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
3686
- [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
3687
- #ifdef GGML_USE_K_QUANTS
3688
- [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
3689
- [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
3690
- [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
3691
- [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
3692
- [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
3693
- [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
3694
- #endif
3695
- [GGML_TYPE_I8] = sizeof(int8_t),
3696
- [GGML_TYPE_I16] = sizeof(int16_t),
3697
- [GGML_TYPE_I32] = sizeof(int32_t),
3698
- };
3699
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
3700
-
3701
-
3702
- static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
3703
- [GGML_TYPE_F32] = "f32",
3704
- [GGML_TYPE_F16] = "f16",
3705
- [GGML_TYPE_Q4_0] = "q4_0",
3706
- [GGML_TYPE_Q4_1] = "q4_1",
3707
- [GGML_TYPE_Q5_0] = "q5_0",
3708
- [GGML_TYPE_Q5_1] = "q5_1",
3709
- [GGML_TYPE_Q8_0] = "q8_0",
3710
- [GGML_TYPE_Q8_1] = "q8_1",
3711
- [GGML_TYPE_Q2_K] = "q2_K",
3712
- [GGML_TYPE_Q3_K] = "q3_K",
3713
- [GGML_TYPE_Q4_K] = "q4_K",
3714
- [GGML_TYPE_Q5_K] = "q5_K",
3715
- [GGML_TYPE_Q6_K] = "q6_K",
3716
- [GGML_TYPE_Q8_K] = "q8_K",
3717
- [GGML_TYPE_I8] = "i8",
3718
- [GGML_TYPE_I16] = "i16",
3719
- [GGML_TYPE_I32] = "i32",
3720
- };
3721
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
3722
-
3723
- static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3724
- [GGML_TYPE_F32] = false,
3725
- [GGML_TYPE_F16] = false,
3726
- [GGML_TYPE_Q4_0] = true,
3727
- [GGML_TYPE_Q4_1] = true,
3728
- [GGML_TYPE_Q5_0] = true,
3729
- [GGML_TYPE_Q5_1] = true,
3730
- [GGML_TYPE_Q8_0] = true,
3731
- [GGML_TYPE_Q8_1] = true,
3732
- [GGML_TYPE_Q2_K] = true,
3733
- [GGML_TYPE_Q3_K] = true,
3734
- [GGML_TYPE_Q4_K] = true,
3735
- [GGML_TYPE_Q5_K] = true,
3736
- [GGML_TYPE_Q6_K] = true,
3737
- [GGML_TYPE_Q8_K] = true,
3738
- [GGML_TYPE_I8] = false,
3739
- [GGML_TYPE_I16] = false,
3740
- [GGML_TYPE_I32] = false,
3741
- };
3742
- static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
3743
-
3744
3952
  static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3745
3953
  "NONE",
3746
3954
 
@@ -3760,10 +3968,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3760
3968
  "ARGMAX",
3761
3969
  "REPEAT",
3762
3970
  "REPEAT_BACK",
3971
+ "CONCAT",
3763
3972
  "SILU_BACK",
3764
3973
  "NORM",
3765
3974
  "RMS_NORM",
3766
3975
  "RMS_NORM_BACK",
3976
+ "GROUP_NORM",
3767
3977
 
3768
3978
  "MUL_MAT",
3769
3979
  "OUT_PROD",
@@ -3789,20 +3999,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3789
3999
  "CLAMP",
3790
4000
  "CONV_1D",
3791
4001
  "CONV_2D",
4002
+ "CONV_TRANSPOSE_2D",
3792
4003
  "POOL_1D",
3793
4004
  "POOL_2D",
4005
+ "UPSCALE",
3794
4006
 
3795
4007
  "FLASH_ATTN",
3796
4008
  "FLASH_FF",
3797
4009
  "FLASH_ATTN_BACK",
3798
4010
  "WIN_PART",
3799
4011
  "WIN_UNPART",
4012
+ "GET_REL_POS",
4013
+ "ADD_REL_POS",
3800
4014
 
3801
4015
  "UNARY",
3802
4016
 
3803
4017
  "MAP_UNARY",
3804
4018
  "MAP_BINARY",
3805
4019
 
4020
+ "MAP_CUSTOM1_F32",
4021
+ "MAP_CUSTOM2_F32",
4022
+ "MAP_CUSTOM3_F32",
4023
+
3806
4024
  "MAP_CUSTOM1",
3807
4025
  "MAP_CUSTOM2",
3808
4026
  "MAP_CUSTOM3",
@@ -3811,7 +4029,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3811
4029
  "CROSS_ENTROPY_LOSS_BACK",
3812
4030
  };
3813
4031
 
3814
- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
4032
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3815
4033
 
3816
4034
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3817
4035
  "none",
@@ -3832,10 +4050,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3832
4050
  "argmax(x)",
3833
4051
  "repeat(x)",
3834
4052
  "repeat_back(x)",
4053
+ "concat(x, y)",
3835
4054
  "silu_back(x)",
3836
4055
  "norm(x)",
3837
4056
  "rms_norm(x)",
3838
4057
  "rms_norm_back(x)",
4058
+ "group_norm(x)",
3839
4059
 
3840
4060
  "X*Y",
3841
4061
  "X*Y",
@@ -3861,20 +4081,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3861
4081
  "clamp(x)",
3862
4082
  "conv_1d(x)",
3863
4083
  "conv_2d(x)",
4084
+ "conv_transpose_2d(x)",
3864
4085
  "pool_1d(x)",
3865
4086
  "pool_2d(x)",
4087
+ "upscale(x)",
3866
4088
 
3867
4089
  "flash_attn(x)",
3868
4090
  "flash_ff(x)",
3869
4091
  "flash_attn_back(x)",
3870
4092
  "win_part(x)",
3871
4093
  "win_unpart(x)",
4094
+ "get_rel_pos(x)",
4095
+ "add_rel_pos(x)",
3872
4096
 
3873
4097
  "unary(x)",
3874
4098
 
3875
4099
  "f(x)",
3876
4100
  "f(x,y)",
3877
4101
 
4102
+ "custom_f32(x)",
4103
+ "custom_f32(x,y)",
4104
+ "custom_f32(x,y,z)",
4105
+
3878
4106
  "custom(x)",
3879
4107
  "custom(x,y)",
3880
4108
  "custom(x,y,z)",
@@ -3883,7 +4111,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3883
4111
  "cross_entropy_loss_back(x,y)",
3884
4112
  };
3885
4113
 
3886
- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
4114
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3887
4115
 
3888
4116
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
3889
4117
 
@@ -3913,8 +4141,10 @@ static void ggml_setup_op_has_task_pass(void) {
3913
4141
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
3914
4142
  p[GGML_OP_CONV_1D ] = true;
3915
4143
  p[GGML_OP_CONV_2D ] = true;
4144
+ p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
3916
4145
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
3917
4146
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
4147
+ p[GGML_OP_ADD_REL_POS ] = true;
3918
4148
  }
3919
4149
 
3920
4150
  { // FINALIZE
@@ -4101,38 +4331,41 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
4101
4331
  }
4102
4332
 
4103
4333
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
4104
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4105
-
4106
- // this should handle cases where the tensor is not contiguous in memory
4107
- // probaby just:
4108
- //
4109
- // return tensor->ne[3]*tensor->nb[3]
4110
- //
4111
- // is enough, but just in case, adding the second part
4334
+ size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
4335
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
4336
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
4337
+ }
4338
+ return nbytes;
4339
+ }
4112
4340
 
4113
- return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
4341
+ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
4342
+ return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
4114
4343
  }
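// Worked example of the new ggml_nbytes() (a sketch, assuming a contiguous
// 4x3 f32 tensor): ne = {4,3,1,1}, nb = {4,16,48,48}, ggml_blck_size(f32) = 1, so
//     nbytes = 4*4/1 + (3-1)*16 + (1-1)*48 + (1-1)*48 = 48 = 4*3*sizeof(float).
// The GGML_MEM_ALIGN padding that the old formula applied unconditionally is now
// opt-in via ggml_nbytes_pad().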
4115
4344
 
4116
4345
  size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
4117
4346
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4118
4347
 
4119
- return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
4348
+ return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
4120
4349
  }
4121
4350
 
4122
4351
  int ggml_blck_size(enum ggml_type type) {
4123
- return GGML_BLCK_SIZE[type];
4352
+ return type_traits[type].blck_size;
4124
4353
  }
4125
4354
 
4126
4355
  size_t ggml_type_size(enum ggml_type type) {
4127
- return GGML_TYPE_SIZE[type];
4356
+ return type_traits[type].type_size;
4128
4357
  }
4129
4358
 
4130
4359
  float ggml_type_sizef(enum ggml_type type) {
4131
- return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
4360
+ return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
4132
4361
  }
4133
4362
 
4134
4363
  const char * ggml_type_name(enum ggml_type type) {
4135
- return GGML_TYPE_NAME[type];
4364
+ return type_traits[type].type_name;
4365
+ }
4366
+
4367
+ bool ggml_is_quantized(enum ggml_type type) {
4368
+ return type_traits[type].is_quantized;
4136
4369
  }
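// The per-type metadata that used to live in four parallel arrays
// (GGML_BLCK_SIZE, GGML_TYPE_SIZE, GGML_TYPE_NAME, GGML_IS_QUANTIZED) is now a
// single type_traits[] table read through the accessors above. Minimal usage
// sketch:
//
//     printf("%s: blck=%d size=%zu quantized=%d\n",
//            ggml_type_name(GGML_TYPE_Q4_0),      // "q4_0"
//            ggml_blck_size(GGML_TYPE_Q4_0),      // QK4_0 == 32
//            ggml_type_size(GGML_TYPE_Q4_0),      // sizeof(block_q4_0)
//            ggml_is_quantized(GGML_TYPE_Q4_0));  // 1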
4137
4370
 
4138
4371
  const char * ggml_op_name(enum ggml_op op) {
@@ -4144,7 +4377,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
4144
4377
  }
4145
4378
 
4146
4379
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
4147
- return GGML_TYPE_SIZE[tensor->type];
4380
+ return ggml_type_size(tensor->type);
4148
4381
  }
4149
4382
 
4150
4383
  static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4415,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
4182
4415
  (t0->ne[3] == t1->ne[3]);
4183
4416
  }
4184
4417
 
4185
- bool ggml_is_quantized(enum ggml_type type) {
4186
- return GGML_IS_QUANTIZED[type];
4187
- }
4188
-
4189
4418
  enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4190
4419
  enum ggml_type wtype = GGML_TYPE_COUNT;
4191
4420
 
@@ -4223,8 +4452,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
4223
4452
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4224
4453
 
4225
4454
  return
4226
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4227
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
4455
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
4456
+ tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
4228
4457
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4229
4458
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4230
4459
  }
@@ -4233,7 +4462,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
4233
4462
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
4463
 
4235
4464
  return
4236
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4465
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
4237
4466
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4238
4467
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4239
4468
  }
@@ -4248,7 +4477,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
4248
4477
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4249
4478
 
4250
4479
  return
4251
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4480
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
4252
4481
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4253
4482
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4254
4483
  }
@@ -4560,36 +4789,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4560
4789
  enum ggml_type type,
4561
4790
  int n_dims,
4562
4791
  const int64_t * ne,
4563
- void * data) {
4792
+ struct ggml_tensor * view_src,
4793
+ size_t view_offs) {
4564
4794
 
4565
4795
  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4566
4796
 
4567
- size_t data_size = 0;
4797
+ // find the base tensor and absolute offset
4798
+ if (view_src != NULL && view_src->view_src != NULL) {
4799
+ view_offs += view_src->view_offs;
4800
+ view_src = view_src->view_src;
4801
+ }
4802
+
4803
+ size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4804
+ for (int i = 1; i < n_dims; i++) {
4805
+ data_size *= ne[i];
4806
+ }
4807
+
4808
+ GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
4568
4809
 
4569
- if (data == NULL && !ctx->no_alloc) {
4570
- data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4571
- for (int i = 1; i < n_dims; i++) {
4572
- data_size *= ne[i];
4573
- }
4810
+ void * data = view_src != NULL ? view_src->data : NULL;
4811
+ if (data != NULL) {
4812
+ data = (char *) data + view_offs;
4574
4813
  }
4575
4814
 
4576
- if (ctx->scratch.data != NULL && data == NULL) {
4577
- // allocate tensor data in the scratch buffer
4578
- if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4579
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4580
- __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4581
- assert(false);
4582
- return NULL;
4583
- }
4815
+ size_t obj_alloc_size = 0;
4584
4816
 
4585
- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4817
+ if (view_src == NULL && ctx->no_alloc == false) {
4818
+ if (ctx->scratch.data != NULL) {
4819
+ // allocate tensor data in the scratch buffer
4820
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4821
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4822
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4823
+ assert(false);
4824
+ return NULL;
4825
+ }
4586
4826
 
4587
- ctx->scratch.offs += data_size;
4827
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4588
4828
 
4589
- data_size = 0;
4829
+ ctx->scratch.offs += data_size;
4830
+ } else {
4831
+ // allocate tensor data in the context's memory pool
4832
+ obj_alloc_size = data_size;
4833
+ }
4590
4834
  }
4591
4835
 
4592
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4836
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
4593
4837
 
4594
4838
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4595
4839
 
@@ -4609,7 +4853,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4609
4853
  /*.perf_runs =*/ 0,
4610
4854
  /*.perf_cycles =*/ 0,
4611
4855
  /*.perf_time_us =*/ 0,
4612
- /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4856
+ /*.view_src =*/ view_src,
4857
+ /*.view_offs =*/ view_offs,
4858
+ /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
4613
4859
  /*.name =*/ { 0 },
4614
4860
  /*.extra =*/ NULL,
4615
4861
  /*.padding =*/ { 0 },
@@ -4622,8 +4868,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4622
4868
  result->ne[i] = ne[i];
4623
4869
  }
4624
4870
 
4625
- result->nb[0] = GGML_TYPE_SIZE[type];
4626
- result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
4871
+ result->nb[0] = ggml_type_size(type);
4872
+ result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
4627
4873
  for (int i = 2; i < GGML_MAX_DIMS; i++) {
4628
4874
  result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
4629
4875
  }
@@ -4633,28 +4879,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4633
4879
  return result;
4634
4880
  }
4635
4881
 
4636
- static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4637
- GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4638
- assert(params_size <= GGML_MAX_OP_PARAMS);
4639
- memcpy(tensor->op_params, params, params_size);
4640
- }
4641
-
4642
- static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4643
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4644
- return ((const int32_t *)(tensor->op_params))[i];
4645
- }
4646
-
4647
- static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4648
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4649
- ((int32_t *)(tensor->op_params))[i] = value;
4650
- }
4651
-
4652
4882
  struct ggml_tensor * ggml_new_tensor(
4653
4883
  struct ggml_context * ctx,
4654
4884
  enum ggml_type type,
4655
4885
  int n_dims,
4656
4886
  const int64_t * ne) {
4657
- return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4887
+ return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
4658
4888
  }
4659
4889
 
4660
4890
  struct ggml_tensor * ggml_new_tensor_1d(
@@ -4719,7 +4949,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
4719
4949
  }
4720
4950
 
4721
4951
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
4722
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
4952
+ return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
4953
+ }
4954
+
4955
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4956
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4957
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4958
+ memcpy(tensor->op_params, params, params_size);
4959
+ }
4960
+
4961
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4962
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4963
+ return ((const int32_t *)(tensor->op_params))[i];
4964
+ }
4965
+
4966
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4967
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4968
+ ((int32_t *)(tensor->op_params))[i] = value;
4723
4969
  }
4724
4970
 
4725
4971
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5005,14 +5251,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
5005
5251
 
5006
5252
  struct ggml_tensor * ggml_view_tensor(
5007
5253
  struct ggml_context * ctx,
5008
- const struct ggml_tensor * src) {
5009
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
5254
+ struct ggml_tensor * src) {
5255
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
5010
5256
  ggml_format_name(result, "%s (view)", src->name);
5011
5257
 
5012
- result->nb[0] = src->nb[0];
5013
- result->nb[1] = src->nb[1];
5014
- result->nb[2] = src->nb[2];
5015
- result->nb[3] = src->nb[3];
5258
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
5259
+ result->nb[i] = src->nb[i];
5260
+ }
5016
5261
 
5017
5262
  return result;
5018
5263
  }
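// Views now record their provenance instead of only aliasing a raw data pointer:
// view_src points at the base tensor and view_offs holds the absolute byte offset
// into it. Sketch, assuming an initialized ggml_context `ctx`:
//
//     struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
//     struct ggml_tensor * v = ggml_view_1d(ctx, a, 32, 16*ggml_element_size(a));
//     // v->view_src == a, v->view_offs == 64 (bytes), and v->data aliases a->data + 64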
@@ -5545,10 +5790,6 @@ struct ggml_tensor * ggml_repeat(
5545
5790
  is_node = true;
5546
5791
  }
5547
5792
 
5548
- if (ggml_are_same_shape(a, b) && !is_node) {
5549
- return a;
5550
- }
5551
-
5552
5793
  struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
5553
5794
 
5554
5795
  result->op = GGML_OP_REPEAT;
@@ -5587,6 +5828,30 @@ struct ggml_tensor * ggml_repeat_back(
5587
5828
  return result;
5588
5829
  }
5589
5830
 
5831
+ // ggml_concat
5832
+
5833
+ struct ggml_tensor * ggml_concat(
5834
+ struct ggml_context* ctx,
5835
+ struct ggml_tensor* a,
5836
+ struct ggml_tensor* b) {
5837
+ GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
5838
+
5839
+ bool is_node = false;
5840
+
5841
+ if (a->grad || b->grad) {
5842
+ is_node = true;
5843
+ }
5844
+
5845
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
5846
+
5847
+ result->op = GGML_OP_CONCAT;
5848
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5849
+ result->src[0] = a;
5850
+ result->src[1] = b;
5851
+
5852
+ return result;
5853
+ }
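// Usage sketch for the new ggml_concat() (assumes an initialized ggml_context
// `ctx`): the inputs are joined along ne[2]; ne[0], ne[1] and ne[3] must match.
//
//     struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 8, 4);
//     struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 8, 2);
//     struct ggml_tensor * c = ggml_concat(ctx, a, b);  // 8 x 8 x 6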
5854
+
5590
5855
  // ggml_abs
5591
5856
 
5592
5857
  struct ggml_tensor * ggml_abs(
@@ -5755,6 +6020,7 @@ struct ggml_tensor * ggml_silu_back(
5755
6020
  static struct ggml_tensor * ggml_norm_impl(
5756
6021
  struct ggml_context * ctx,
5757
6022
  struct ggml_tensor * a,
6023
+ float eps,
5758
6024
  bool inplace) {
5759
6025
  bool is_node = false;
5760
6026
 
@@ -5765,7 +6031,7 @@ static struct ggml_tensor * ggml_norm_impl(
5765
6031
 
5766
6032
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5767
6033
 
5768
- // TODO: maybe store epsilon here?
6034
+ ggml_set_op_params(result, &eps, sizeof(eps));
5769
6035
 
5770
6036
  result->op = GGML_OP_NORM;
5771
6037
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +6042,20 @@ static struct ggml_tensor * ggml_norm_impl(
5776
6042
 
5777
6043
  struct ggml_tensor * ggml_norm(
5778
6044
  struct ggml_context * ctx,
5779
- struct ggml_tensor * a) {
5780
- return ggml_norm_impl(ctx, a, false);
6045
+ struct ggml_tensor * a,
6046
+ float eps) {
6047
+ return ggml_norm_impl(ctx, a, eps, false);
5781
6048
  }
5782
6049
 
5783
6050
  struct ggml_tensor * ggml_norm_inplace(
5784
6051
  struct ggml_context * ctx,
5785
- struct ggml_tensor * a) {
5786
- return ggml_norm_impl(ctx, a, true);
6052
+ struct ggml_tensor * a,
6053
+ float eps) {
6054
+ return ggml_norm_impl(ctx, a, eps, true);
5787
6055
  }
5788
6056
 
6057
+ // ggml_rms_norm
6058
+
5789
6059
  static struct ggml_tensor * ggml_rms_norm_impl(
5790
6060
  struct ggml_context * ctx,
5791
6061
  struct ggml_tensor * a,
@@ -5822,10 +6092,13 @@ struct ggml_tensor * ggml_rms_norm_inplace(
5822
6092
  return ggml_rms_norm_impl(ctx, a, eps, true);
5823
6093
  }
5824
6094
 
6095
+ // ggml_rms_norm_back
6096
+
5825
6097
  struct ggml_tensor * ggml_rms_norm_back(
5826
6098
  struct ggml_context * ctx,
5827
6099
  struct ggml_tensor * a,
5828
- struct ggml_tensor * b) {
6100
+ struct ggml_tensor * b,
6101
+ float eps) {
5829
6102
  bool is_node = false;
5830
6103
 
5831
6104
  if (a->grad) {
@@ -5835,6 +6108,8 @@ struct ggml_tensor * ggml_rms_norm_back(
5835
6108
 
5836
6109
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5837
6110
 
6111
+ ggml_set_op_params(result, &eps, sizeof(eps));
6112
+
5838
6113
  result->op = GGML_OP_RMS_NORM_BACK;
5839
6114
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5840
6115
  result->src[0] = a;
@@ -5843,6 +6118,44 @@ struct ggml_tensor * ggml_rms_norm_back(
5843
6118
  return result;
5844
6119
  }
5845
6120
 
6121
+ // ggml_group_norm
6122
+
6123
+ static struct ggml_tensor * ggml_group_norm_impl(
6124
+ struct ggml_context * ctx,
6125
+ struct ggml_tensor * a,
6126
+ int n_groups,
6127
+ bool inplace) {
6128
+
6129
+ bool is_node = false;
6130
+ if (!inplace && (a->grad)) {
6131
+ GGML_ASSERT(false); // TODO: implement backward
6132
+ is_node = true;
6133
+ }
6134
+
6135
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6136
+
6137
+ result->op = GGML_OP_GROUP_NORM;
6138
+ result->op_params[0] = n_groups;
6139
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6140
+ result->src[0] = a;
6141
+ result->src[1] = NULL; // TODO: maybe store epsilon here?
6142
+
6143
+ return result;
6144
+ }
6145
+
6146
+ struct ggml_tensor * ggml_group_norm(
6147
+ struct ggml_context * ctx,
6148
+ struct ggml_tensor * a,
6149
+ int n_groups) {
6150
+ return ggml_group_norm_impl(ctx, a, n_groups, false);
6151
+ }
6152
+
6153
+ struct ggml_tensor * ggml_group_norm_inplace(
6154
+ struct ggml_context * ctx,
6155
+ struct ggml_tensor * a,
6156
+ int n_groups) {
6157
+ return ggml_group_norm_impl(ctx, a, n_groups, true);
6158
+ }
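// Usage sketch (assumes an initialized ggml_context `ctx` and an f32 tensor `x`):
// ggml_norm() now takes its epsilon explicitly instead of a value hard-coded in
// the compute path, and group normalization is exposed as its own op.
//
//     struct ggml_tensor * n = ggml_norm(ctx, x, 1e-5f);
//     struct ggml_tensor * g = ggml_group_norm(ctx, x, 32);  // 32 groups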
5846
6159
 
5847
6160
  // ggml_mul_mat
5848
6161
 
@@ -6126,7 +6439,7 @@ struct ggml_tensor * ggml_reshape(
6126
6439
  //GGML_ASSERT(false);
6127
6440
  }
6128
6441
 
6129
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6442
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
6130
6443
  ggml_format_name(result, "%s (reshaped)", a->name);
6131
6444
 
6132
6445
  result->op = GGML_OP_RESHAPE;
@@ -6150,7 +6463,7 @@ struct ggml_tensor * ggml_reshape_1d(
6150
6463
  }
6151
6464
 
6152
6465
  const int64_t ne[1] = { ne0 };
6153
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6466
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
6154
6467
  ggml_format_name(result, "%s (reshaped)", a->name);
6155
6468
 
6156
6469
  result->op = GGML_OP_RESHAPE;
@@ -6175,7 +6488,7 @@ struct ggml_tensor * ggml_reshape_2d(
6175
6488
  }
6176
6489
 
6177
6490
  const int64_t ne[2] = { ne0, ne1 };
6178
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6491
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
6179
6492
  ggml_format_name(result, "%s (reshaped)", a->name);
6180
6493
 
6181
6494
  result->op = GGML_OP_RESHAPE;
@@ -6201,7 +6514,7 @@ struct ggml_tensor * ggml_reshape_3d(
6201
6514
  }
6202
6515
 
6203
6516
  const int64_t ne[3] = { ne0, ne1, ne2 };
6204
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6517
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
6205
6518
  ggml_format_name(result, "%s (reshaped)", a->name);
6206
6519
 
6207
6520
  result->op = GGML_OP_RESHAPE;
@@ -6211,7 +6524,6 @@ struct ggml_tensor * ggml_reshape_3d(
6211
6524
  return result;
6212
6525
  }
6213
6526
 
6214
-
6215
6527
  struct ggml_tensor * ggml_reshape_4d(
6216
6528
  struct ggml_context * ctx,
6217
6529
  struct ggml_tensor * a,
@@ -6229,7 +6541,7 @@ struct ggml_tensor * ggml_reshape_4d(
6229
6541
  }
6230
6542
 
6231
6543
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6232
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6544
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
6233
6545
  ggml_format_name(result, "%s (reshaped)", a->name);
6234
6546
 
6235
6547
  result->op = GGML_OP_RESHAPE;
@@ -6239,46 +6551,40 @@ struct ggml_tensor * ggml_reshape_4d(
6239
6551
  return result;
6240
6552
  }
6241
6553
 
6242
- // ggml_view_1d
6243
-
6244
- static struct ggml_tensor * ggml_view_tensor_offset(
6554
+ static struct ggml_tensor * ggml_view_impl(
6245
6555
  struct ggml_context * ctx,
6246
6556
  struct ggml_tensor * a,
6247
6557
  int n_dims,
6248
6558
  const int64_t * ne,
6249
6559
  size_t offset) {
6250
- // don't calculate an offset from an unallocated tensor
6251
- void * data = NULL;
6252
- if (a->data != NULL) {
6253
- data = (char *) a->data + offset;
6254
- }
6255
6560
 
6256
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6561
+ bool is_node = false;
6562
+
6563
+ if (a->grad) {
6564
+ is_node = true;
6565
+ }
6257
6566
 
6567
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
6258
6568
  ggml_format_name(result, "%s (view)", a->name);
6259
6569
 
6260
6570
  ggml_set_op_params(result, &offset, sizeof(offset));
6261
6571
 
6572
+ result->op = GGML_OP_VIEW;
6573
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6574
+ result->src[0] = a;
6575
+
6262
6576
  return result;
6263
6577
  }
6264
6578
 
6579
+ // ggml_view_1d
6580
+
6265
6581
  struct ggml_tensor * ggml_view_1d(
6266
6582
  struct ggml_context * ctx,
6267
6583
  struct ggml_tensor * a,
6268
6584
  int64_t ne0,
6269
6585
  size_t offset) {
6270
6586
 
6271
- bool is_node = false;
6272
-
6273
- if (a->grad) {
6274
- is_node = true;
6275
- }
6276
-
6277
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
6278
-
6279
- result->op = GGML_OP_VIEW;
6280
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6281
- result->src[0] = a;
6587
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
6282
6588
 
6283
6589
  return result;
6284
6590
  }
@@ -6293,24 +6599,14 @@ struct ggml_tensor * ggml_view_2d(
6293
6599
  size_t nb1,
6294
6600
  size_t offset) {
6295
6601
 
6296
- bool is_node = false;
6602
+ const int64_t ne[2] = { ne0, ne1 };
6297
6603
 
6298
- if (a->grad) {
6299
- is_node = true;
6300
- }
6301
-
6302
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6303
-
6304
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
6604
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
6305
6605
 
6306
6606
  result->nb[1] = nb1;
6307
6607
  result->nb[2] = result->nb[1]*ne1;
6308
6608
  result->nb[3] = result->nb[2];
6309
6609
 
6310
- result->op = GGML_OP_VIEW;
6311
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6312
- result->src[0] = a;
6313
-
6314
6610
  return result;
6315
6611
  }
6316
6612
 
@@ -6326,24 +6622,14 @@ struct ggml_tensor * ggml_view_3d(
6326
6622
  size_t nb2,
6327
6623
  size_t offset) {
6328
6624
 
6329
- bool is_node = false;
6330
-
6331
- if (a->grad) {
6332
- is_node = true;
6333
- }
6334
-
6335
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6625
+ const int64_t ne[3] = { ne0, ne1, ne2 };
6336
6626
 
6337
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
6627
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
6338
6628
 
6339
6629
  result->nb[1] = nb1;
6340
6630
  result->nb[2] = nb2;
6341
6631
  result->nb[3] = result->nb[2]*ne2;
6342
6632
 
6343
- result->op = GGML_OP_VIEW;
6344
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6345
- result->src[0] = a;
6346
-
6347
6633
  return result;
6348
6634
  }
6349
6635
 
@@ -6361,24 +6647,14 @@ struct ggml_tensor * ggml_view_4d(
6361
6647
  size_t nb3,
6362
6648
  size_t offset) {
6363
6649
 
6364
- bool is_node = false;
6365
-
6366
- if (a->grad) {
6367
- is_node = true;
6368
- }
6369
-
6370
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6650
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6371
6651
 
6372
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
6652
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
6373
6653
 
6374
6654
  result->nb[1] = nb1;
6375
6655
  result->nb[2] = nb2;
6376
6656
  result->nb[3] = nb3;
6377
6657
 
6378
- result->op = GGML_OP_VIEW;
6379
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6380
- result->src[0] = a;
6381
-
6382
6658
  return result;
6383
6659
  }
6384
6660
 
@@ -6565,7 +6841,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6565
6841
 
6566
6842
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6567
6843
 
6568
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6844
+ int32_t params[] = { n_past };
6569
6845
  ggml_set_op_params(result, params, sizeof(params));
6570
6846
 
6571
6847
  result->op = GGML_OP_DIAG_MASK_INF;
@@ -6582,7 +6858,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
6582
6858
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
6583
6859
  }
6584
6860
 
6585
-
6586
6861
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
6587
6862
  struct ggml_context * ctx,
6588
6863
  struct ggml_tensor * a,
@@ -6605,7 +6880,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6605
6880
 
6606
6881
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6607
6882
 
6608
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6883
+ int32_t params[] = { n_past };
6609
6884
  ggml_set_op_params(result, params, sizeof(params));
6610
6885
 
6611
6886
  result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -6711,6 +6986,8 @@ static struct ggml_tensor * ggml_rope_impl(
6711
6986
  int n_ctx,
6712
6987
  float freq_base,
6713
6988
  float freq_scale,
6989
+ float xpos_base,
6990
+ bool xpos_down,
6714
6991
  bool inplace) {
6715
6992
  GGML_ASSERT(n_past >= 0);
6716
6993
  bool is_node = false;
@@ -6721,9 +6998,11 @@ static struct ggml_tensor * ggml_rope_impl(
6721
6998
 
6722
6999
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6723
7000
 
6724
- int32_t params[6] = { n_past, n_dims, mode, n_ctx };
7001
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
6725
7002
  memcpy(params + 4, &freq_base, sizeof(float));
6726
7003
  memcpy(params + 5, &freq_scale, sizeof(float));
7004
+ memcpy(params + 6, &xpos_base, sizeof(float));
7005
+ memcpy(params + 7, &xpos_down, sizeof(bool));
6727
7006
  ggml_set_op_params(result, params, sizeof(params));
6728
7007
 
6729
7008
  result->op = GGML_OP_ROPE;
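
The int32_t params[8] array above mixes integer fields with floats (and a bool) that are bit-copied in via memcpy; the compute kernels later memcpy the same slots back out of op_params. A self-contained round-trip sketch of that packing scheme — the slot layout follows the diff, the concrete values are arbitrary:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

int main(void) {
    // pack: ints in slots 0..3, floats/bool bit-copied into slots 4..7
    int32_t params[8] = { /*n_past*/ 32, /*n_dims*/ 128, /*mode*/ 0, /*n_ctx*/ 2048 };
    float freq_base  = 10000.0f;
    float freq_scale = 1.0f;
    float xpos_base  = 512.0f;
    bool  xpos_down  = true;
    memcpy(params + 4, &freq_base,  sizeof(float));
    memcpy(params + 5, &freq_scale, sizeof(float));
    memcpy(params + 6, &xpos_base,  sizeof(float));
    memcpy(params + 7, &xpos_down,  sizeof(bool));

    // unpack: the compute kernels read the same slots back with memcpy
    float fb, fs, xb;
    bool  xd;
    memcpy(&fb, params + 4, sizeof(float));
    memcpy(&fs, params + 5, sizeof(float));
    memcpy(&xb, params + 6, sizeof(float));
    memcpy(&xd, params + 7, sizeof(bool));

    printf("n_past=%d n_dims=%d freq_base=%.1f freq_scale=%.1f xpos_base=%.1f xpos_down=%d\n",
           params[0], params[1], fb, fs, xb, (int)xd);
    return 0;
}
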
@@ -6740,7 +7019,7 @@ struct ggml_tensor * ggml_rope(
6740
7019
  int n_dims,
6741
7020
  int mode,
6742
7021
  int n_ctx) {
6743
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
7022
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
6744
7023
  }
6745
7024
 
6746
7025
  struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +7029,7 @@ struct ggml_tensor * ggml_rope_inplace(
6750
7029
  int n_dims,
6751
7030
  int mode,
6752
7031
  int n_ctx) {
6753
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
7032
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
6754
7033
  }
6755
7034
 
6756
7035
  struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +7041,7 @@ struct ggml_tensor * ggml_rope_custom(
6762
7041
  int n_ctx,
6763
7042
  float freq_base,
6764
7043
  float freq_scale) {
6765
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
7044
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
6766
7045
  }
6767
7046
 
6768
7047
  struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +7053,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
6774
7053
  int n_ctx,
6775
7054
  float freq_base,
6776
7055
  float freq_scale) {
6777
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
7056
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
7057
+ }
7058
+
7059
+ struct ggml_tensor * ggml_rope_xpos_inplace(
7060
+ struct ggml_context * ctx,
7061
+ struct ggml_tensor * a,
7062
+ int n_past,
7063
+ int n_dims,
7064
+ float base,
7065
+ bool down) {
7066
+ return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
6778
7067
  }
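
ggml_rope_xpos_inplace routes the xPos variant through the regular RoPE path; the forward kernel further down in this diff multiplies every rotated pair by a position- and dimension-dependent factor zeta. A standalone sketch of that scaling — the zeta formula is copied from the forward pass, while rope_xpos_pair, the head size, and the positions are illustrative only:

#include <stdio.h>
#include <math.h>
#include <stdbool.h>

// xPos-style scaled rotation of one (x0, x1) pair, following the forward pass:
// zeta = ((i0 + 0.4*ne0) / (1.4*ne0)) ^ ((n_past + i2) / xpos_base), inverted for "down" tensors
static void rope_xpos_pair(float x0, float x1, float theta,
                           int i0, int ne0, int pos, float xpos_base, bool xpos_down,
                           float out[2]) {
    float zeta = xpos_base != 0.0f
        ? powf((i0 + 0.4f*ne0) / (1.4f*ne0), pos / xpos_base)
        : 1.0f;
    if (xpos_down) zeta = 1.0f/zeta;

    out[0] = x0*cosf(theta)*zeta - x1*sinf(theta)*zeta;
    out[1] = x0*sinf(theta)*zeta + x1*cosf(theta)*zeta;
}

int main(void) {
    const int   ne0       = 128;     // head dimension (illustrative)
    const float xpos_base = 512.0f;  // illustrative xPos base
    float out[2];

    // same pair rotated at growing positions: here zeta decays with position
    // (the paired "down" tensor uses 1/zeta), which damps long-range scores in xPos
    for (int pos = 0; pos <= 256; pos += 128) {
        const float theta = pos * powf(10000.0f, -2.0f*0/ne0); // slot i0 = 0
        rope_xpos_pair(1.0f, 0.0f, theta, 0, ne0, pos, xpos_base, false, out);
        printf("pos=%3d -> (%.4f, %.4f)\n", pos, out[0], out[1]);
    }
    return 0;
}
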
6779
7068
 
6780
7069
  // ggml_rope_back
@@ -6785,7 +7074,11 @@ struct ggml_tensor * ggml_rope_back(
6785
7074
  int n_past,
6786
7075
  int n_dims,
6787
7076
  int mode,
6788
- int n_ctx) {
7077
+ int n_ctx,
7078
+ float freq_base,
7079
+ float freq_scale,
7080
+ float xpos_base,
7081
+ bool xpos_down) {
6789
7082
  GGML_ASSERT(n_past >= 0);
6790
7083
  GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
6791
7084
 
@@ -6797,7 +7090,11 @@ struct ggml_tensor * ggml_rope_back(
6797
7090
 
6798
7091
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
6799
7092
 
6800
- int32_t params[] = { n_past, n_dims, mode, n_ctx };
7093
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
7094
+ memcpy(params + 4, &freq_base, sizeof(float));
7095
+ memcpy(params + 5, &freq_scale, sizeof(float));
7096
+ memcpy(params + 6, &xpos_base, sizeof(float));
7097
+ memcpy(params + 7, &xpos_down, sizeof(bool));
6801
7098
  ggml_set_op_params(result, params, sizeof(params));
6802
7099
 
6803
7100
  result->op = GGML_OP_ROPE_BACK;
@@ -6904,6 +7201,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
6904
7201
  return result;
6905
7202
  }
6906
7203
 
7204
+ // ggml_conv_1d_ph
7205
+
7206
+ struct ggml_tensor* ggml_conv_1d_ph(
7207
+ struct ggml_context * ctx,
7208
+ struct ggml_tensor * a,
7209
+ struct ggml_tensor * b,
7210
+ int s,
7211
+ int d) {
7212
+ return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
7213
+ }
7214
+
6907
7215
  // ggml_conv_2d
6908
7216
 
6909
7217
  struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7252,61 @@ struct ggml_tensor * ggml_conv_2d(
6944
7252
 
6945
7253
  }
6946
7254
 
6947
- // ggml_conv_1d_ph
7255
+ // ggml_conv_2d_sk_p0
6948
7256
 
6949
- struct ggml_tensor * ggml_conv_1d_ph(
7257
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
6950
7258
  struct ggml_context * ctx,
6951
7259
  struct ggml_tensor * a,
6952
- struct ggml_tensor * b,
6953
- int s,
6954
- int d) {
6955
- return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
7260
+ struct ggml_tensor * b) {
7261
+ return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
7262
+ }
7263
+
7264
+ // ggml_conv_2d_s1_ph
7265
+
7266
+ struct ggml_tensor * ggml_conv_2d_s1_ph(
7267
+ struct ggml_context * ctx,
7268
+ struct ggml_tensor * a,
7269
+ struct ggml_tensor * b) {
7270
+ return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
7271
+ }
7272
+
7273
+ // ggml_conv_transpose_2d_p0
7274
+
7275
+ static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
7276
+ return (ins - 1) * s - 2 * p + ks;
6956
7277
  }
6957
7278
 
7279
+ struct ggml_tensor * ggml_conv_transpose_2d_p0(
7280
+ struct ggml_context * ctx,
7281
+ struct ggml_tensor * a,
7282
+ struct ggml_tensor * b,
7283
+ int stride) {
7284
+ GGML_ASSERT(a->ne[3] == b->ne[2]);
7285
+
7286
+ bool is_node = false;
7287
+
7288
+ if (a->grad || b->grad) {
7289
+ GGML_ASSERT(false); // TODO: implement backward
7290
+ is_node = true;
7291
+ }
7292
+
7293
+ const int64_t ne[4] = {
7294
+ ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
7295
+ ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
7296
+ a->ne[2], b->ne[3],
7297
+ };
7298
+
7299
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7300
+
7301
+ ggml_set_op_params_i32(result, 0, stride);
7302
+
7303
+ result->op = GGML_OP_CONV_TRANSPOSE_2D;
7304
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7305
+ result->src[0] = a;
7306
+ result->src[1] = b;
7307
+
7308
+ return result;
7309
+ }
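
ggml_calc_conv_transpose_output_size is the usual transposed-convolution shape rule: out = (in - 1)*stride - 2*pad + kernel. A quick numeric sketch of the formula with made-up sizes:

#include <stdio.h>
#include <stdint.h>

// output size of a transposed convolution: (in - 1)*stride - 2*pad + kernel
static int64_t conv_transpose_out(int64_t in, int64_t kernel, int stride, int pad) {
    return (in - 1)*stride - 2*pad + kernel;
}

int main(void) {
    // e.g. an 8-wide feature map upsampled with a 4-wide kernel, stride 2, no padding
    printf("in=8  k=4 s=2 p=0 -> out=%lld\n", (long long)conv_transpose_out(8, 4, 2, 0));  // 18
    printf("in=16 k=2 s=2 p=0 -> out=%lld\n", (long long)conv_transpose_out(16, 2, 2, 0)); // 32
    return 0;
}
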
6958
7310
 
6959
7311
  // ggml_pool_*
6960
7312
 
@@ -7032,6 +7384,40 @@ struct ggml_tensor * ggml_pool_2d(
7032
7384
  return result;
7033
7385
  }
7034
7386
 
7387
+ // ggml_upscale
7388
+
7389
+ static struct ggml_tensor * ggml_upscale_impl(
7390
+ struct ggml_context * ctx,
7391
+ struct ggml_tensor * a,
7392
+ int scale_factor) {
7393
+ bool is_node = false;
7394
+
7395
+ if (a->grad) {
7396
+ GGML_ASSERT(false); // TODO: implement backward
7397
+ is_node = true;
7398
+ }
7399
+
7400
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
7401
+ a->ne[0] * scale_factor,
7402
+ a->ne[1] * scale_factor,
7403
+ a->ne[2], a->ne[3]);
7404
+
7405
+ result->op = GGML_OP_UPSCALE;
7406
+ result->op_params[0] = scale_factor;
7407
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7408
+ result->src[0] = a;
7409
+ result->src[1] = NULL;
7410
+
7411
+ return result;
7412
+ }
7413
+
7414
+ struct ggml_tensor * ggml_upscale(
7415
+ struct ggml_context * ctx,
7416
+ struct ggml_tensor * a,
7417
+ int scale_factor) {
7418
+ return ggml_upscale_impl(ctx, a, scale_factor);
7419
+ }
7420
+
7035
7421
  // ggml_flash_attn
7036
7422
 
7037
7423
  struct ggml_tensor * ggml_flash_attn(
@@ -7230,6 +7616,87 @@ struct ggml_tensor * ggml_win_unpart(
7230
7616
  return result;
7231
7617
  }
7232
7618
 
7619
+ // ggml_get_rel_pos
7620
+
7621
+ struct ggml_tensor * ggml_get_rel_pos(
7622
+ struct ggml_context * ctx,
7623
+ struct ggml_tensor * a,
7624
+ int qh,
7625
+ int kh) {
7626
+ GGML_ASSERT(qh == kh);
7627
+ GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
7628
+
7629
+ bool is_node = false;
7630
+
7631
+ if (a->grad) {
7632
+ GGML_ASSERT(false); // TODO: implement backward
7633
+ is_node = true;
7634
+ }
7635
+
7636
+ const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
7637
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
7638
+
7639
+ result->op = GGML_OP_GET_REL_POS;
7640
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7641
+ result->src[0] = a;
7642
+ result->src[1] = NULL;
7643
+
7644
+ return result;
7645
+ }
7646
+
7647
+ // ggml_add_rel_pos
7648
+
7649
+ static struct ggml_tensor * ggml_add_rel_pos_impl(
7650
+ struct ggml_context * ctx,
7651
+ struct ggml_tensor * a,
7652
+ struct ggml_tensor * pw,
7653
+ struct ggml_tensor * ph,
7654
+ bool inplace) {
7655
+ GGML_ASSERT(ggml_are_same_shape(pw, ph));
7656
+ GGML_ASSERT(ggml_is_contiguous(a));
7657
+ GGML_ASSERT(ggml_is_contiguous(pw));
7658
+ GGML_ASSERT(ggml_is_contiguous(ph));
7659
+ GGML_ASSERT(ph->type == GGML_TYPE_F32);
7660
+ GGML_ASSERT(pw->type == GGML_TYPE_F32);
7661
+ GGML_ASSERT(pw->ne[3] == a->ne[2]);
7662
+ GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
7663
+ GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
7664
+
7665
+ bool is_node = false;
7666
+
7667
+ if (!inplace && (a->grad || pw->grad || ph->grad)) {
7668
+ is_node = true;
7669
+ }
7670
+
7671
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7672
+ ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
7673
+
7674
+ result->op = GGML_OP_ADD_REL_POS;
7675
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7676
+ result->src[0] = a;
7677
+ result->src[1] = pw;
7678
+ result->src[2] = ph;
7679
+
7680
+ return result;
7681
+ }
7682
+
7683
+
7684
+ struct ggml_tensor * ggml_add_rel_pos(
7685
+ struct ggml_context * ctx,
7686
+ struct ggml_tensor * a,
7687
+ struct ggml_tensor * pw,
7688
+ struct ggml_tensor * ph) {
7689
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
7690
+ }
7691
+
7692
+ struct ggml_tensor * ggml_add_rel_pos_inplace(
7693
+ struct ggml_context * ctx,
7694
+ struct ggml_tensor * a,
7695
+ struct ggml_tensor * pw,
7696
+ struct ggml_tensor * ph) {
7697
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
7698
+ }
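
ggml_get_rel_pos gathers, for every (query, key) pair, the embedding row at the shifted relative offset (q - k) + (k_size - 1), and ggml_add_rel_pos then folds the decomposed height/width terms into the attention logits (the SAM-style scheme referenced by the links in the kernels below). A small sketch of just the index computation, following the pos expression used in ggml_compute_forward_get_rel_pos_f16 further down; the window size is a toy value:

#include <stdio.h>

int main(void) {
    // toy window: 4 query positions x 4 key positions
    const int qh = 4, kh = 4;
    const int table_rows = 2*(qh > kh ? qh : kh) - 1; // 2*max(qh,kh) - 1 embedding rows

    // pos = (kh - i_key - 1) + i_query, i.e. the relative offset shifted into [0, table_rows)
    printf("relative-position table has %d rows\n", table_rows);
    for (int iq = 0; iq < qh; ++iq) {
        for (int ik = 0; ik < kh; ++ik) {
            const int pos = (kh - ik - 1) + iq;
            printf("%d ", pos); // row of the embedding table used for this (q, k) pair
        }
        printf("\n");
    }
    return 0;
}
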
7699
+
7233
7700
  // gmml_unary
7234
7701
 
7235
7702
  static struct ggml_tensor * ggml_unary_impl(
@@ -7745,7 +8212,7 @@ static void ggml_compute_forward_dup_same_cont(
7745
8212
  memcpy(
7746
8213
  ((char *) dst->data + ie0*nb0),
7747
8214
  ((char *) src0->data + ie0*nb00),
7748
- (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
8215
+ (ie1 - ie0) * ggml_type_size(src0->type));
7749
8216
  }
7750
8217
 
7751
8218
  }
@@ -7779,7 +8246,7 @@ static void ggml_compute_forward_dup_f16(
7779
8246
 
7780
8247
  if (src0->type == dst->type &&
7781
8248
  ne00 == ne0 &&
7782
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
8249
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
7783
8250
  // copy by rows
7784
8251
  const size_t rs = ne00*nb00;
7785
8252
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8304,7 @@ static void ggml_compute_forward_dup_f16(
7837
8304
  float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
7838
8305
 
7839
8306
  size_t id = 0;
7840
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
8307
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
7841
8308
  char * dst_ptr = (char *) dst->data;
7842
8309
 
7843
8310
  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8517,7 @@ static void ggml_compute_forward_dup_f32(
8050
8517
 
8051
8518
  if (src0->type == dst->type &&
8052
8519
  ne00 == ne0 &&
8053
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
8520
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
8054
8521
  // copy by rows
8055
8522
  const size_t rs = ne00*nb00;
8056
8523
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8556,7 @@ static void ggml_compute_forward_dup_f32(
8089
8556
  ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
8090
8557
 
8091
8558
  size_t id = 0;
8092
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
8559
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
8093
8560
  char * dst_ptr = (char *) dst->data;
8094
8561
 
8095
8562
  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8968,7 @@ static void ggml_compute_forward_add_q_f32(
8501
8968
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8502
8969
 
8503
8970
  // we don't support permuted src0 or src1
8504
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
8971
+ GGML_ASSERT(nb00 == ggml_type_size(type));
8505
8972
  GGML_ASSERT(nb10 == sizeof(float));
8506
8973
 
8507
8974
  // dst cannot be transposed or permuted
@@ -8775,7 +9242,7 @@ static void ggml_compute_forward_add1_q_f32(
8775
9242
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8776
9243
 
8777
9244
  // we don't support permuted src0
8778
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
9245
+ GGML_ASSERT(nb00 == ggml_type_size(type));
8779
9246
 
8780
9247
  // dst cannot be transposed or permuted
8781
9248
  GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9604,8 @@ static void ggml_compute_forward_mul(
9137
9604
  const struct ggml_tensor * src0,
9138
9605
  const struct ggml_tensor * src1,
9139
9606
  struct ggml_tensor * dst) {
9607
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
9608
+
9140
9609
  switch (src0->type) {
9141
9610
  case GGML_TYPE_F32:
9142
9611
  {
@@ -9179,6 +9648,8 @@ static void ggml_compute_forward_div_f32(
9179
9648
 
9180
9649
 
9181
9650
  #ifdef GGML_USE_ACCELERATE
9651
+ UNUSED(ggml_vec_div_f32);
9652
+
9182
9653
  vDSP_vdiv(
9183
9654
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
9184
9655
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -9731,6 +10202,72 @@ static void ggml_compute_forward_repeat_back(
9731
10202
  }
9732
10203
  }
9733
10204
 
10205
+ // ggml_compute_forward_concat
10206
+
10207
+ static void ggml_compute_forward_concat_f32(
10208
+ const struct ggml_compute_params * params,
10209
+ const struct ggml_tensor * src0,
10210
+ const struct ggml_tensor * src1,
10211
+ struct ggml_tensor * dst) {
10212
+
10213
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10214
+ return;
10215
+ }
10216
+
10217
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
10218
+
10219
+ const int ith = params->ith;
10220
+
10221
+ GGML_TENSOR_BINARY_OP_LOCALS;
10222
+
10223
+ // TODO: support for transposed / permuted tensors
10224
+ GGML_ASSERT(nb0 == sizeof(float));
10225
+ GGML_ASSERT(nb00 == sizeof(float));
10226
+ GGML_ASSERT(nb10 == sizeof(float));
10227
+
10228
+ for (int i3 = 0; i3 < ne3; i3++) {
10229
+ for (int i2 = ith; i2 < ne2; i2++) {
10230
+ if (i2 < ne02) { // src0
10231
+ for (int i1 = 0; i1 < ne1; i1++) {
10232
+ for (int i0 = 0; i0 < ne0; i0++) {
10233
+ const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
10234
+
10235
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
10236
+ *y = *x;
10237
+ }
10238
+ }
10239
+ } // src1
10240
+ else {
10241
+ for (int i1 = 0; i1 < ne1; i1++) {
10242
+ for (int i0 = 0; i0 < ne0; i0++) {
10243
+ const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
10244
+
10245
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
10246
+ *y = *x;
10247
+ }
10248
+ }
10249
+ }
10250
+ }
10251
+ }
10252
+ }
10253
+
10254
+ static void ggml_compute_forward_concat(
10255
+ const struct ggml_compute_params* params,
10256
+ const struct ggml_tensor* src0,
10257
+ const struct ggml_tensor* src1,
10258
+ struct ggml_tensor* dst) {
10259
+ switch (src0->type) {
10260
+ case GGML_TYPE_F32:
10261
+ {
10262
+ ggml_compute_forward_concat_f32(params, src0, src1, dst);
10263
+ } break;
10264
+ default:
10265
+ {
10266
+ GGML_ASSERT(false);
10267
+ } break;
10268
+ }
10269
+ }
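
The kernel above stacks src0 and src1 along the third dimension: destination slices with i2 < ne02 copy from src0, the remaining ones copy from src1 at i2 - ne02. A compact sketch of that index split on contiguous float arrays — concat_dim2 and the toy shapes are illustrative, not part of ggml:

#include <stdio.h>

// concatenate two (ne0 x ne1 x *) blocks along the third dimension
static void concat_dim2(const float *a, int ne02,
                        const float *b, int ne12,
                        float *dst, int ne0, int ne1) {
    const int ne2 = ne02 + ne12;
    for (int i2 = 0; i2 < ne2; i2++) {
        for (int i1 = 0; i1 < ne1; i1++) {
            for (int i0 = 0; i0 < ne0; i0++) {
                const float *src = (i2 < ne02)
                    ? &a[(i2        *ne1 + i1)*ne0 + i0]   // first ne02 slices come from src0
                    : &b[((i2 - ne02)*ne1 + i1)*ne0 + i0]; // the rest come from src1
                dst[(i2*ne1 + i1)*ne0 + i0] = *src;
            }
        }
    }
}

int main(void) {
    float a[2*2*1] = { 1, 2, 3, 4 };                 // ne0=2, ne1=2, ne02=1
    float b[2*2*2] = { 5, 6, 7, 8, 9, 10, 11, 12 };  // ne12=2
    float dst[2*2*3];
    concat_dim2(a, 1, b, 2, dst, 2, 2);
    for (int i = 0; i < 12; i++) printf("%g ", dst[i]);
    printf("\n"); // 1 2 3 4 5 6 7 8 9 10 11 12
    return 0;
}
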
10270
+
9734
10271
  // ggml_compute_forward_abs
9735
10272
 
9736
10273
  static void ggml_compute_forward_abs_f32(
@@ -10285,7 +10822,8 @@ static void ggml_compute_forward_norm_f32(
10285
10822
 
10286
10823
  GGML_TENSOR_UNARY_OP_LOCALS;
10287
10824
 
10288
- const float eps = 1e-5f; // TODO: make this a parameter
10825
+ float eps;
10826
+ memcpy(&eps, dst->op_params, sizeof(float));
10289
10827
 
10290
10828
  // TODO: optimize
10291
10829
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10872,8 @@ static void ggml_compute_forward_norm(
10334
10872
  }
10335
10873
  }
10336
10874
 
10875
+ // ggml_compute_forward_group_rms_norm
10876
+
10337
10877
  static void ggml_compute_forward_rms_norm_f32(
10338
10878
  const struct ggml_compute_params * params,
10339
10879
  const struct ggml_tensor * src0,
@@ -10398,7 +10938,6 @@ static void ggml_compute_forward_rms_norm(
10398
10938
  }
10399
10939
  }
10400
10940
 
10401
-
10402
10941
  static void ggml_compute_forward_rms_norm_back_f32(
10403
10942
  const struct ggml_compute_params * params,
10404
10943
  const struct ggml_tensor * src0,
@@ -10417,7 +10956,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
10417
10956
 
10418
10957
  GGML_TENSOR_BINARY_OP_LOCALS;
10419
10958
 
10420
- const float eps = 1e-6f; // TODO: make this a parameter
10959
+ float eps;
10960
+ memcpy(&eps, dst->op_params, sizeof(float));
10421
10961
 
10422
10962
  // TODO: optimize
10423
10963
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10572,54 +11112,144 @@ static void ggml_compute_forward_rms_norm_back(
10572
11112
  }
10573
11113
  }
10574
11114
 
10575
- // ggml_compute_forward_mul_mat
10576
-
10577
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10578
- // helper function to determine if it is better to use BLAS or not
10579
- // for large matrices, BLAS is faster
10580
- static bool ggml_compute_forward_mul_mat_use_blas(
10581
- const struct ggml_tensor * src0,
10582
- const struct ggml_tensor * src1,
10583
- struct ggml_tensor * dst) {
10584
- //const int64_t ne00 = src0->ne[0];
10585
- //const int64_t ne01 = src0->ne[1];
10586
-
10587
- const int64_t ne10 = src1->ne[0];
10588
-
10589
- const int64_t ne0 = dst->ne[0];
10590
- const int64_t ne1 = dst->ne[1];
11115
+ // ggml_compute_forward_group_norm
10591
11116
 
10592
- // TODO: find the optimal values for these
10593
- if (ggml_is_contiguous(src0) &&
10594
- ggml_is_contiguous(src1) &&
10595
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
11117
+ static void ggml_compute_forward_group_norm_f32(
11118
+ const struct ggml_compute_params * params,
11119
+ const struct ggml_tensor * src0,
11120
+ struct ggml_tensor * dst) {
11121
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
10596
11122
 
10597
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
10598
- return true;
11123
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11124
+ return;
10599
11125
  }
10600
11126
 
10601
- return false;
10602
- }
10603
- #endif
10604
-
10605
- static void ggml_compute_forward_mul_mat(
10606
- const struct ggml_compute_params * params,
10607
- const struct ggml_tensor * src0,
10608
- const struct ggml_tensor * src1,
10609
- struct ggml_tensor * dst) {
10610
- int64_t t0 = ggml_perf_time_us();
10611
- UNUSED(t0);
10612
-
10613
- GGML_TENSOR_BINARY_OP_LOCALS;
11127
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
10614
11128
 
10615
11129
  const int ith = params->ith;
10616
11130
  const int nth = params->nth;
10617
11131
 
10618
- const enum ggml_type type = src0->type;
11132
+ GGML_TENSOR_UNARY_OP_LOCALS;
10619
11133
 
10620
- const bool src1_cont = ggml_is_contiguous(src1);
11134
+ const float eps = 1e-6f; // TODO: make this a parameter
10621
11135
 
10622
- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
11136
+ // TODO: optimize
11137
+
11138
+ int n_channels = src0->ne[2];
11139
+ int n_groups = dst->op_params[0];
11140
+ int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
11141
+ for (int i = ith; i < n_groups; i+=nth) {
11142
+ int start = i * n_channels_per_group;
11143
+ int end = start + n_channels_per_group;
11144
+ if (end > n_channels) {
11145
+ end = n_channels;
11146
+ }
11147
+ int step = end - start;
11148
+
11149
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
11150
+ ggml_float sum = 0.0;
11151
+ for (int64_t i02 = start; i02 < end; i02++) {
11152
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11153
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
11154
+
11155
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11156
+ sum += (ggml_float)x[i00];
11157
+ }
11158
+ }
11159
+ }
11160
+ float mean = sum / (ne00 * ne01 * step);
11161
+ ggml_float sum2 = 0.0;
11162
+
11163
+ for (int64_t i02 = start; i02 < end; i02++) {
11164
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11165
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
11166
+
11167
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
11168
+
11169
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11170
+ float v = x[i00] - mean;
11171
+ y[i00] = v;
11172
+ sum2 += (ggml_float)(v * v);
11173
+ }
11174
+ }
11175
+ }
11176
+ float variance = sum2 / (ne00 * ne01 * step);
11177
+ const float scale = 1.0f / sqrtf(variance + eps);
11178
+
11179
+ for (int64_t i02 = start; i02 < end; i02++) {
11180
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11181
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
11182
+ ggml_vec_scale_f32(ne00, y, scale);
11183
+ }
11184
+ }
11185
+ }
11186
+ }
11187
+ }
11188
+
11189
+ static void ggml_compute_forward_group_norm(
11190
+ const struct ggml_compute_params * params,
11191
+ const struct ggml_tensor * src0,
11192
+ struct ggml_tensor * dst) {
11193
+ switch (src0->type) {
11194
+ case GGML_TYPE_F32:
11195
+ {
11196
+ ggml_compute_forward_group_norm_f32(params, src0, dst);
11197
+ } break;
11198
+ default:
11199
+ {
11200
+ GGML_ASSERT(false);
11201
+ } break;
11202
+ }
11203
+ }
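
ggml_compute_forward_group_norm_f32 makes two passes per channel group: accumulate the mean, write out the centered values while accumulating the variance, then rescale by 1/sqrt(variance + eps). A minimal single-group sketch of the same two-pass computation on a flat array (eps matches the hard-coded 1e-6 above; the data is made up):

#include <stdio.h>
#include <math.h>

// normalize n values in place to zero mean / unit variance, as one "group"
static void group_norm(float *x, int n, float eps) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum += x[i];
    const float mean = (float)(sum / n);

    double sum2 = 0.0;
    for (int i = 0; i < n; i++) {
        const float v = x[i] - mean;
        x[i] = v;                    // first pass writes the centered values...
        sum2 += (double)v*v;
    }
    const float variance = (float)(sum2 / n);
    const float scale    = 1.0f / sqrtf(variance + eps);

    for (int i = 0; i < n; i++) x[i] *= scale; // ...second pass rescales them
}

int main(void) {
    float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    group_norm(x, 8, 1e-6f);
    for (int i = 0; i < 8; i++) printf("%.3f ", x[i]);
    printf("\n"); // roughly -1.528 ... 1.528: zero mean, unit variance
    return 0;
}
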
11204
+
11205
+ // ggml_compute_forward_mul_mat
11206
+
11207
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
11208
+ // helper function to determine if it is better to use BLAS or not
11209
+ // for large matrices, BLAS is faster
11210
+ static bool ggml_compute_forward_mul_mat_use_blas(
11211
+ const struct ggml_tensor * src0,
11212
+ const struct ggml_tensor * src1,
11213
+ struct ggml_tensor * dst) {
11214
+ //const int64_t ne00 = src0->ne[0];
11215
+ //const int64_t ne01 = src0->ne[1];
11216
+
11217
+ const int64_t ne10 = src1->ne[0];
11218
+
11219
+ const int64_t ne0 = dst->ne[0];
11220
+ const int64_t ne1 = dst->ne[1];
11221
+
11222
+ // TODO: find the optimal values for these
11223
+ if (ggml_is_contiguous(src0) &&
11224
+ ggml_is_contiguous(src1) &&
11225
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
11226
+
11227
+ /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
11228
+ return true;
11229
+ }
11230
+
11231
+ return false;
11232
+ }
11233
+ #endif
11234
+
11235
+ static void ggml_compute_forward_mul_mat(
11236
+ const struct ggml_compute_params * params,
11237
+ const struct ggml_tensor * src0,
11238
+ const struct ggml_tensor * src1,
11239
+ struct ggml_tensor * dst) {
11240
+ int64_t t0 = ggml_perf_time_us();
11241
+ UNUSED(t0);
11242
+
11243
+ GGML_TENSOR_BINARY_OP_LOCALS;
11244
+
11245
+ const int ith = params->ith;
11246
+ const int nth = params->nth;
11247
+
11248
+ const enum ggml_type type = src0->type;
11249
+
11250
+ const bool src1_cont = ggml_is_contiguous(src1);
11251
+
11252
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
10623
11253
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
10624
11254
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
10625
11255
 
@@ -10629,7 +11259,7 @@ static void ggml_compute_forward_mul_mat(
10629
11259
  GGML_ASSERT(ne3 == ne13);
10630
11260
 
10631
11261
  // we don't support permuted src0 or src1
10632
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
11262
+ GGML_ASSERT(nb00 == ggml_type_size(type));
10633
11263
  GGML_ASSERT(nb10 == sizeof(float));
10634
11264
 
10635
11265
  // dst cannot be transposed or permuted
@@ -10638,6 +11268,10 @@ static void ggml_compute_forward_mul_mat(
10638
11268
  GGML_ASSERT(nb1 <= nb2);
10639
11269
  GGML_ASSERT(nb2 <= nb3);
10640
11270
 
11271
+ // broadcast factors
11272
+ const int64_t r2 = ne12/ne02;
11273
+ const int64_t r3 = ne13/ne03;
11274
+
10641
11275
  // nb01 >= nb00 - src0 is not transposed
10642
11276
  // compute by src0 rows
10643
11277
 
@@ -10657,11 +11291,6 @@ static void ggml_compute_forward_mul_mat(
10657
11291
 
10658
11292
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10659
11293
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
10660
- // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
10661
- // ref: https://github.com/ggerganov/ggml/pull/224
10662
- GGML_ASSERT(ne02 == ne12);
10663
- GGML_ASSERT(ne03 == ne13);
10664
-
10665
11294
  if (params->ith != 0) {
10666
11295
  return;
10667
11296
  }
@@ -10674,12 +11303,16 @@ static void ggml_compute_forward_mul_mat(
10674
11303
  return;
10675
11304
  }
10676
11305
 
10677
- for (int64_t i03 = 0; i03 < ne03; i03++) {
10678
- for (int64_t i02 = 0; i02 < ne02; i02++) {
10679
- const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
10680
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
11306
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
11307
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
11308
+ // broadcast src0 into src1 across 2nd,3rd dimension
11309
+ const int64_t i03 = i13/r3;
11310
+ const int64_t i02 = i12/r2;
10681
11311
 
10682
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
11312
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
11313
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
11314
+
11315
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
10683
11316
 
10684
11317
  if (type != GGML_TYPE_F32) {
10685
11318
  float * const wdata = params->wdata;
@@ -10687,7 +11320,7 @@ static void ggml_compute_forward_mul_mat(
10687
11320
 
10688
11321
  size_t id = 0;
10689
11322
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
10690
- to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
11323
+ to_float((const char *) x + i01*nb01, wdata + id, ne00);
10691
11324
  id += ne00;
10692
11325
  }
10693
11326
 
@@ -10712,7 +11345,7 @@ static void ggml_compute_forward_mul_mat(
10712
11345
  if (params->type == GGML_TASK_INIT) {
10713
11346
  if (src1->type != vec_dot_type) {
10714
11347
  char * wdata = params->wdata;
10715
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11348
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
10716
11349
 
10717
11350
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
10718
11351
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +11365,7 @@ static void ggml_compute_forward_mul_mat(
10732
11365
  }
10733
11366
 
10734
11367
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10735
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11368
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
10736
11369
 
10737
11370
  const int64_t nr0 = ne01; // src0 rows
10738
11371
  const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -10767,10 +11400,6 @@ static void ggml_compute_forward_mul_mat(
10767
11400
  assert(ne12 % ne02 == 0);
10768
11401
  assert(ne13 % ne03 == 0);
10769
11402
 
10770
- // broadcast factors
10771
- const int64_t r2 = ne12/ne02;
10772
- const int64_t r3 = ne13/ne03;
10773
-
10774
11403
  // block-tiling attempt
10775
11404
  const int64_t blck_0 = 16;
10776
11405
  const int64_t blck_1 = 16;
@@ -11205,7 +11834,7 @@ static void ggml_compute_forward_get_rows_q(
11205
11834
 
11206
11835
  assert( dst->ne[0] == nc);
11207
11836
  assert( dst->ne[1] == nr);
11208
- assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
11837
+ assert(src0->nb[0] == ggml_type_size(type));
11209
11838
 
11210
11839
  for (int i = 0; i < nr; ++i) {
11211
11840
  const int r = ((int32_t *) src1->data)[i];
@@ -11506,8 +12135,8 @@ static void ggml_compute_forward_diag_mask_f32(
11506
12135
  const int ith = params->ith;
11507
12136
  const int nth = params->nth;
11508
12137
 
11509
- const int n_past = ((int32_t *) dst->op_params)[0];
11510
- const bool inplace = (bool)((int32_t *) dst->op_params)[1];
12138
+ const int n_past = ((int32_t *) dst->op_params)[0];
12139
+ const bool inplace = src0->data == dst->data;
11511
12140
 
11512
12141
  GGML_ASSERT(n_past >= 0);
11513
12142
 
@@ -11718,6 +12347,7 @@ static void ggml_compute_forward_soft_max_back_f32(
11718
12347
  // dx = J * dy
11719
12348
  // dxk = sum_i(Jki * dyi)
11720
12349
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
12350
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
11721
12351
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
11722
12352
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
11723
12353
  // dxk = -yk * dot(y, dy) + yk*dyk
@@ -11926,7 +12556,6 @@ static void ggml_compute_forward_alibi(
11926
12556
  }
11927
12557
  }
11928
12558
 
11929
-
11930
12559
  // ggml_compute_forward_clamp
11931
12560
 
11932
12561
  static void ggml_compute_forward_clamp_f32(
@@ -12015,12 +12644,18 @@ static void ggml_compute_forward_rope_f32(
12015
12644
  float freq_base;
12016
12645
  float freq_scale;
12017
12646
 
12647
+ // these two only relevant for xPos RoPE:
12648
+ float xpos_base;
12649
+ bool xpos_down;
12650
+
12018
12651
  const int n_past = ((int32_t *) dst->op_params)[0];
12019
12652
  const int n_dims = ((int32_t *) dst->op_params)[1];
12020
12653
  const int mode = ((int32_t *) dst->op_params)[2];
12021
12654
  const int n_ctx = ((int32_t *) dst->op_params)[3];
12022
12655
  memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
12023
12656
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12657
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
12658
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
12024
12659
 
12025
12660
  assert(n_past >= 0);
12026
12661
 
@@ -12092,6 +12727,9 @@ static void ggml_compute_forward_rope_f32(
12092
12727
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12093
12728
  const float cos_theta = cosf(theta);
12094
12729
  const float sin_theta = sinf(theta);
12730
+ // zeta scaling for xPos only:
12731
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
12732
+ if (xpos_down) zeta = 1.0f / zeta;
12095
12733
 
12096
12734
  theta *= theta_scale;
12097
12735
 
@@ -12101,11 +12739,11 @@ static void ggml_compute_forward_rope_f32(
12101
12739
  const float x0 = src[0];
12102
12740
  const float x1 = src[1];
12103
12741
 
12104
- dst_data[0] = x0*cos_theta - x1*sin_theta;
12105
- dst_data[1] = x0*sin_theta + x1*cos_theta;
12742
+ dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
12743
+ dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
12106
12744
  }
12107
12745
  } else {
12108
- // TODO: this is probably wrong, but I can't figure it out ..
12746
+ // TODO: this might be wrong for ne0 != n_dims - need double check
12109
12747
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
12110
12748
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
12111
12749
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12234,7 +12872,7 @@ static void ggml_compute_forward_rope_f16(
12234
12872
  dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12235
12873
  }
12236
12874
  } else {
12237
- // TODO: this is probably wrong, but I can't figure it out ..
12875
+ // TODO: this might be wrong for ne0 != n_dims - need double check
12238
12876
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
12239
12877
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
12240
12878
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12296,9 +12934,21 @@ static void ggml_compute_forward_rope_back_f32(
12296
12934
  // dx = rope_back(dy, src1)
12297
12935
  // src0 is dy, src1 contains options
12298
12936
 
12937
+ float freq_base;
12938
+ float freq_scale;
12939
+
12940
+ // these two only relevant for xPos RoPE:
12941
+ float xpos_base;
12942
+ bool xpos_down;
12943
+
12299
12944
  const int n_past = ((int32_t *) dst->op_params)[0];
12300
12945
  const int n_dims = ((int32_t *) dst->op_params)[1];
12301
12946
  const int mode = ((int32_t *) dst->op_params)[2];
12947
+ const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
12948
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
12949
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12950
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
12951
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
12302
12952
 
12303
12953
  assert(n_past >= 0);
12304
12954
 
@@ -12324,7 +12974,7 @@ static void ggml_compute_forward_rope_back_f32(
12324
12974
  // row index used to determine which thread to use
12325
12975
  int ir = 0;
12326
12976
 
12327
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
12977
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
12328
12978
 
12329
12979
  const bool is_neox = mode & 2;
12330
12980
 
@@ -12335,12 +12985,15 @@ static void ggml_compute_forward_rope_back_f32(
12335
12985
  if (ir++ < ir0) continue;
12336
12986
  if (ir > ir1) break;
12337
12987
 
12338
- float theta = (float)p;
12988
+ float theta = freq_scale * (float)p;
12339
12989
 
12340
12990
  if (!is_neox) {
12341
12991
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12342
12992
  const float cos_theta = cosf(theta);
12343
12993
  const float sin_theta = sinf(theta);
12994
+ // zeta scaling for xPos only:
12995
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
12996
+ if (xpos_down) zeta = 1.0f / zeta;
12344
12997
 
12345
12998
  theta *= theta_scale;
12346
12999
 
@@ -12350,8 +13003,8 @@ static void ggml_compute_forward_rope_back_f32(
12350
13003
  const float dy0 = dy[0];
12351
13004
  const float dy1 = dy[1];
12352
13005
 
12353
- dx[0] = dy0*cos_theta + dy1*sin_theta;
12354
- dx[1] = - dy0*sin_theta + dy1*cos_theta;
13006
+ dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
13007
+ dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
12355
13008
  }
12356
13009
  } else {
12357
13010
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
@@ -13044,6 +13697,106 @@ static void ggml_compute_forward_conv_2d(
13044
13697
  }
13045
13698
  }
13046
13699
 
13700
+ // ggml_compute_forward_conv_transpose_2d
13701
+
13702
+ static void ggml_compute_forward_conv_transpose_2d(
13703
+ const struct ggml_compute_params * params,
13704
+ const struct ggml_tensor * src0,
13705
+ const struct ggml_tensor * src1,
13706
+ struct ggml_tensor * dst) {
13707
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13708
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13709
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13710
+
13711
+ int64_t t0 = ggml_perf_time_us();
13712
+ UNUSED(t0);
13713
+
13714
+ GGML_TENSOR_BINARY_OP_LOCALS;
13715
+
13716
+ const int ith = params->ith;
13717
+ const int nth = params->nth;
13718
+
13719
+ const int nk = ne00*ne01*ne02*ne03;
13720
+
13721
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13722
+ GGML_ASSERT(nb10 == sizeof(float));
13723
+
13724
+ if (params->type == GGML_TASK_INIT) {
13725
+ memset(params->wdata, 0, params->wsize);
13726
+
13727
+ // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
13728
+ {
13729
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13730
+
13731
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
13732
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
13733
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
13734
+ ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
13735
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
13736
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
13737
+ dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
13738
+ }
13739
+ }
13740
+ }
13741
+ }
13742
+ }
13743
+
13744
+ // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
13745
+ {
13746
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
13747
+ for (int i12 = 0; i12 < ne12; i12++) {
13748
+ for (int i11 = 0; i11 < ne11; i11++) {
13749
+ const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
13750
+ ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
13751
+ for (int i10 = 0; i10 < ne10; i10++) {
13752
+ dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
13753
+ }
13754
+ }
13755
+ }
13756
+ }
13757
+
13758
+ return;
13759
+ }
13760
+
13761
+ if (params->type == GGML_TASK_FINALIZE) {
13762
+ return;
13763
+ }
13764
+
13765
+ const int32_t stride = ggml_get_op_params_i32(dst, 0);
13766
+
13767
+ // total patches in dst
13768
+ const int np = ne2;
13769
+
13770
+ // patches per thread
13771
+ const int dp = (np + nth - 1)/nth;
13772
+
13773
+ // patch range for this thread
13774
+ const int ip0 = dp*ith;
13775
+ const int ip1 = MIN(ip0 + dp, np);
13776
+
13777
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13778
+ ggml_fp16_t * const wdata_src = wdata + nk;
13779
+
13780
+ for (int i2 = ip0; i2 < ip1; i2++) { // Cout
13781
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13782
+ ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
13783
+ for (int i11 = 0; i11 < ne11; i11++) {
13784
+ for (int i10 = 0; i10 < ne10; i10++) {
13785
+ const int i1n = i11*ne10*ne12 + i10*ne12;
13786
+ for (int i01 = 0; i01 < ne01; i01++) {
13787
+ for (int i00 = 0; i00 < ne00; i00++) {
13788
+ float v = 0;
13789
+ ggml_vec_dot_f16(ne03, &v,
13790
+ wdata_src + i1n,
13791
+ wdata_kernel + i01*ne00*ne03 + i00*ne03);
13792
+ dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
13793
+ }
13794
+ }
13795
+ }
13796
+ }
13797
+ }
13798
+ }
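
The kernel above scatters each input pixel into a stride-spaced window of the output and accumulates overlaps (after permuting the operands into fp16 work buffers). A naive single-channel 2-D sketch of the same scatter-add in plain float, with no padding as in the _p0 variant; the helper name and shapes are toy values:

#include <stdio.h>

// naive 2-D transposed convolution, one input channel -> one output channel:
// out[iy*stride + ky][ix*stride + kx] += in[iy][ix] * kernel[ky][kx]
static void conv_transpose_2d_naive(const float *in, int ih, int iw,
                                    const float *k, int kh, int kw,
                                    int stride, float *out, int oh, int ow) {
    for (int i = 0; i < oh*ow; i++) out[i] = 0.0f;
    for (int iy = 0; iy < ih; iy++)
        for (int ix = 0; ix < iw; ix++)
            for (int ky = 0; ky < kh; ky++)
                for (int kx = 0; kx < kw; kx++)
                    out[(iy*stride + ky)*ow + ix*stride + kx] += in[iy*iw + ix]*k[ky*kw + kx];
}

int main(void) {
    const float in[2*2] = { 1, 2,
                            3, 4 };
    const float k[2*2]  = { 1, 1,
                            1, 1 };
    const int stride = 2;
    const int oh = (2 - 1)*stride + 2; // 4, from the output-size rule above
    const int ow = oh;

    float out[4*4];
    conv_transpose_2d_naive(in, 2, 2, k, 2, 2, stride, out, oh, ow);
    for (int y = 0; y < oh; y++) {
        for (int x = 0; x < ow; x++) printf("%4.0f", out[y*ow + x]);
        printf("\n"); // each input pixel expands into a 2x2 block of its own value
    }
    return 0;
}
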
13799
+
13047
13800
  // ggml_compute_forward_pool_1d_sk_p0
13048
13801
 
13049
13802
  static void ggml_compute_forward_pool_1d_sk_p0(
@@ -13202,6 +13955,60 @@ static void ggml_compute_forward_pool_2d(
13202
13955
  ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
13203
13956
  }
13204
13957
 
13958
+ // ggml_compute_forward_upscale
13959
+
13960
+ static void ggml_compute_forward_upscale_f32(
13961
+ const struct ggml_compute_params * params,
13962
+ const struct ggml_tensor * src0,
13963
+ struct ggml_tensor * dst) {
13964
+
13965
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13966
+ return;
13967
+ }
13968
+
13969
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13970
+
13971
+ const int ith = params->ith;
13972
+
13973
+ GGML_TENSOR_UNARY_OP_LOCALS;
13974
+
13975
+ const int scale_factor = dst->op_params[0];
13976
+
13977
+ // TODO: optimize
13978
+
13979
+ for (int i03 = 0; i03 < ne03; i03++) {
13980
+ for (int i02 = ith; i02 < ne02; i02++) {
13981
+ for (int m = 0; m < dst->ne[1]; m++) {
13982
+ int i01 = m / scale_factor;
13983
+ for (int n = 0; n < dst->ne[0]; n++) {
13984
+ int i00 = n / scale_factor;
13985
+
13986
+ const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
13987
+
13988
+ float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
13989
+
13990
+ *y = *x;
13991
+ }
13992
+ }
13993
+ }
13994
+ }
13995
+ }
13996
+
13997
+ static void ggml_compute_forward_upscale(
13998
+ const struct ggml_compute_params * params,
13999
+ const struct ggml_tensor * src0,
14000
+ struct ggml_tensor * dst) {
14001
+ switch (src0->type) {
14002
+ case GGML_TYPE_F32:
14003
+ {
14004
+ ggml_compute_forward_upscale_f32(params, src0, dst);
14005
+ } break;
14006
+ default:
14007
+ {
14008
+ GGML_ASSERT(false);
14009
+ } break;
14010
+ }
14011
+ }
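
ggml_compute_forward_upscale_f32 is nearest-neighbour upscaling: output pixel (n, m) reads input pixel (n/scale, m/scale) by integer division. A tiny 2-D sketch of that indexing with a toy image and scale factor:

#include <stdio.h>

int main(void) {
    const int w = 2, h = 2, scale = 2;
    const float src[2*2] = { 1, 2,
                             3, 4 };
    float dst[4*4];

    // nearest-neighbour: every output coordinate maps back via integer division
    for (int m = 0; m < h*scale; m++) {
        const int i1 = m / scale;
        for (int n = 0; n < w*scale; n++) {
            const int i0 = n / scale;
            dst[m*(w*scale) + n] = src[i1*w + i0];
        }
    }

    for (int m = 0; m < h*scale; m++) {
        for (int n = 0; n < w*scale; n++) printf("%3.0f", dst[m*(w*scale) + n]);
        printf("\n");
    }
    return 0;
}
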
13205
14012
 
13206
14013
  // ggml_compute_forward_flash_attn
13207
14014
 
@@ -13331,7 +14138,7 @@ static void ggml_compute_forward_flash_attn_f32(
13331
14138
  vvexpf(S, S, &Mup);
13332
14139
  ggml_vec_sum_f32(Mup, &sum, S);
13333
14140
  #else
13334
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14141
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
13335
14142
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13336
14143
 
13337
14144
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13341,9 +14148,13 @@ static void ggml_compute_forward_flash_attn_f32(
13341
14148
  if (SS[j] == -INFINITY) {
13342
14149
  SS[j] = 0.0f;
13343
14150
  } else {
14151
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14152
+ const float val = expf(SS[j] - max);
14153
+ #else
13344
14154
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
13345
14155
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13346
14156
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14157
+ #endif
13347
14158
  sump[j] += (ggml_float)val;
13348
14159
  SS[j] = val;
13349
14160
  }
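
The new GGML_FLASH_ATTN_EXP_FP16 guard picks between plain expf and the precomputed table_exp_f16, which ggml indexes by the fp16 bit pattern of (x - max). A simplified standalone analogue of that trade-off, using a coarse fixed-grid table instead of real fp16 bits — the table size and range are invented purely to show the idea:

#include <stdio.h>
#include <math.h>

// toy lookup table: exp() precomputed on a fixed grid over [-16, 0]
// (ggml instead indexes a 65536-entry table by the fp16 bit pattern of the argument)
#define TABLE_SIZE 4096
static float table_exp[TABLE_SIZE];

static void init_table(void) {
    for (int i = 0; i < TABLE_SIZE; i++) {
        const float x = -16.0f + 16.0f*i/(TABLE_SIZE - 1);
        table_exp[i] = expf(x);
    }
}

static float exp_lookup(float x) {
    if (x <= -16.0f) return 0.0f;
    if (x >=   0.0f) return 1.0f;
    const int i = (int)((x + 16.0f)/16.0f*(TABLE_SIZE - 1) + 0.5f);
    return table_exp[i];
}

int main(void) {
    init_table();
    // softmax arguments are always <= 0 after subtracting the row max
    for (float x = -8.0f; x <= 0.0f; x += 2.0f) {
        printf("x=%5.1f  expf=%.6f  lookup=%.6f\n", x, expf(x), exp_lookup(x));
    }
    return 0;
}
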
@@ -13921,7 +14732,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
13921
14732
  vvexpf(SM, SM, &Mup);
13922
14733
  ggml_vec_sum_f32(Mup, &sum, SM);
13923
14734
  #else
13924
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14735
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
13925
14736
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13926
14737
 
13927
14738
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13932,9 +14743,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
13932
14743
  if (SR[j] == -INFINITY) {
13933
14744
  SW[j] = 0.0f;
13934
14745
  } else {
14746
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14747
+ const float val = expf(SR[j] - max);
14748
+ #else
13935
14749
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
13936
14750
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13937
14751
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14752
+ #endif
13938
14753
  sump[j] += (ggml_float)val;
13939
14754
  SW[j] = val;
13940
14755
  }
@@ -14327,38 +15142,169 @@ static void ggml_compute_forward_unary(
14327
15142
  }
14328
15143
  }
14329
15144
 
14330
- // ggml_compute_forward_map_unary
15145
+ // ggml_compute_forward_get_rel_pos
14331
15146
 
14332
- static void ggml_compute_forward_map_unary_f32(
15147
+ static void ggml_compute_forward_get_rel_pos_f16(
14333
15148
  const struct ggml_compute_params * params,
14334
15149
  const struct ggml_tensor * src0,
14335
- struct ggml_tensor * dst,
14336
- const ggml_unary_op_f32_t fun) {
14337
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
14338
-
15150
+ struct ggml_tensor * dst) {
14339
15151
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14340
15152
  return;
14341
15153
  }
14342
15154
 
14343
- const int n = ggml_nrows(src0);
14344
- const int nc = src0->ne[0];
15155
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
14345
15156
 
14346
- assert( dst->nb[0] == sizeof(float));
14347
- assert(src0->nb[0] == sizeof(float));
15157
+ GGML_TENSOR_UNARY_OP_LOCALS;
14348
15158
 
14349
- for (int i = 0; i < n; i++) {
14350
- fun(nc,
14351
- (float *) ((char *) dst->data + i*( dst->nb[1])),
14352
- (float *) ((char *) src0->data + i*(src0->nb[1])));
15159
+ const int64_t w = ne1;
15160
+
15161
+ ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
15162
+ ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
15163
+
15164
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
15165
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
15166
+ const int64_t pos = (w - i1 - 1) + i2;
15167
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
15168
+ dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
15169
+ }
15170
+ }
14353
15171
  }
14354
15172
  }
14355
15173
 
14356
-
14357
- static void ggml_compute_forward_map_unary(
15174
+ static void ggml_compute_forward_get_rel_pos(
14358
15175
  const struct ggml_compute_params * params,
14359
15176
  const struct ggml_tensor * src0,
14360
- struct ggml_tensor * dst,
14361
- const ggml_unary_op_f32_t fun) {
15177
+ struct ggml_tensor * dst) {
15178
+ switch (src0->type) {
15179
+ case GGML_TYPE_F16:
15180
+ {
15181
+ ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
15182
+ } break;
15183
+ default:
15184
+ {
15185
+ GGML_ASSERT(false);
15186
+ } break;
15187
+ }
15188
+ }
15189
+
15190
+ // ggml_compute_forward_add_rel_pos
15191
+
15192
+ static void ggml_compute_forward_add_rel_pos_f32(
15193
+ const struct ggml_compute_params * params,
15194
+ const struct ggml_tensor * src0,
15195
+ const struct ggml_tensor * src1,
15196
+ const struct ggml_tensor * src2,
15197
+ struct ggml_tensor * dst) {
15198
+
15199
+ const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
15200
+ if (!inplace && params->type == GGML_TASK_INIT) {
15201
+ memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
15202
+ return;
15203
+ }
15204
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15205
+ return;
15206
+ }
15207
+
15208
+ int64_t t0 = ggml_perf_time_us();
15209
+ UNUSED(t0);
15210
+
15211
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
15212
+
15213
+ float * src1_data = (float *) src1->data;
15214
+ float * src2_data = (float *) src2->data;
15215
+ float * dst_data = (float *) dst->data;
15216
+
15217
+ const int64_t ne10 = src1->ne[0];
15218
+ const int64_t ne11 = src1->ne[1];
15219
+ const int64_t ne12 = src1->ne[2];
15220
+ const int64_t ne13 = src1->ne[3];
15221
+
15222
+ const int ith = params->ith;
15223
+ const int nth = params->nth;
15224
+
15225
+ // total patches in dst
15226
+ const int np = ne13;
15227
+
15228
+ // patches per thread
15229
+ const int dp = (np + nth - 1)/nth;
15230
+
15231
+ // patch range for this thread
15232
+ const int ip0 = dp*ith;
15233
+ const int ip1 = MIN(ip0 + dp, np);
15234
+
15235
+
15236
+ for (int64_t i13 = ip0; i13 < ip1; ++i13) {
15237
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
15238
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
15239
+ const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
15240
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
15241
+ const int64_t jp0 = jp1 + i10;
15242
+ const float src1_e = src1_data[jp0];
15243
+ const float src2_e = src2_data[jp0];
15244
+
15245
+ const int64_t jdh = jp0 * ne10;
15246
+ const int64_t jdw = jdh - (ne10 - 1) * i10;
15247
+
15248
+ for (int64_t j = 0; j < ne10; ++j) {
15249
+ dst_data[jdh + j ] += src2_e;
15250
+ dst_data[jdw + j*ne10] += src1_e;
15251
+ }
15252
+ }
15253
+ }
15254
+ }
15255
+ }
15256
+ }
15257
+
15258
+ static void ggml_compute_forward_add_rel_pos(
15259
+ const struct ggml_compute_params * params,
15260
+ const struct ggml_tensor * src0,
15261
+ const struct ggml_tensor * src1,
15262
+ const struct ggml_tensor * src2,
15263
+ struct ggml_tensor * dst) {
15264
+ switch (src0->type) {
15265
+ case GGML_TYPE_F32:
15266
+ {
15267
+ ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
15268
+ } break;
15269
+ default:
15270
+ {
15271
+ GGML_ASSERT(false);
15272
+ } break;
15273
+ }
15274
+ }
15275
+
15276
+ // ggml_compute_forward_map_unary
15277
+
15278
+ static void ggml_compute_forward_map_unary_f32(
15279
+ const struct ggml_compute_params * params,
15280
+ const struct ggml_tensor * src0,
15281
+ struct ggml_tensor * dst,
15282
+ const ggml_unary_op_f32_t fun) {
15283
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
15284
+
15285
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15286
+ return;
15287
+ }
15288
+
15289
+ const int n = ggml_nrows(src0);
15290
+ const int nc = src0->ne[0];
15291
+
15292
+ assert( dst->nb[0] == sizeof(float));
15293
+ assert(src0->nb[0] == sizeof(float));
15294
+
15295
+ for (int i = 0; i < n; i++) {
15296
+ fun(nc,
15297
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
15298
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
15299
+ }
15300
+ }
15301
+
15302
+
15303
+ static void ggml_compute_forward_map_unary(
15304
+ const struct ggml_compute_params * params,
15305
+ const struct ggml_tensor * src0,
15306
+ struct ggml_tensor * dst,
15307
+ const ggml_unary_op_f32_t fun) {
14362
15308
  switch (src0->type) {
14363
15309
  case GGML_TYPE_F32:
14364
15310
  {
@@ -14541,6 +15487,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14541
15487
  const int nc = src0->ne[0];
14542
15488
  const int nr = ggml_nrows(src0);
14543
15489
 
15490
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15491
+
14544
15492
  if (params->type == GGML_TASK_INIT) {
14545
15493
  if (ith == 0) {
14546
15494
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -14552,7 +15500,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14552
15500
  if (ith == 0) {
14553
15501
  float * dp = (float *) dst->data;
14554
15502
  ggml_vec_sum_f32(nth, dp, sums);
14555
- dp[0] *= -1.0f;
15503
+ dp[0] *= -1.0f / (float) nr;
14556
15504
  }
14557
15505
  return;
14558
15506
  }
@@ -14569,7 +15517,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14569
15517
  for (int i1 = ir0; i1 < ir1; i1++) {
14570
15518
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
14571
15519
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
14572
- float * st = (float *) params->wdata + nth + ith*nc;
15520
+ float * st = ((float *) params->wdata) + nth + ith*nc;
14573
15521
 
14574
15522
  #ifndef NDEBUG
14575
15523
  for (int i = 0; i < nc; ++i) {
@@ -14584,15 +15532,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14584
15532
  float max = -INFINITY;
14585
15533
  ggml_vec_max_f32(nc, &max, s0);
14586
15534
 
14587
- uint16_t scvt;
15535
+ uint16_t scvt; UNUSED(scvt);
14588
15536
  for (int i = 0; i < nc; i++) {
14589
15537
  if (s0[i] == -INFINITY) {
14590
15538
  st[i] = 0.0f;
14591
15539
  } else {
14592
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15540
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15541
+ const float s = s0[i] - max;
15542
+ const float val = expf(s);
15543
+ #else
14593
15544
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
14594
15545
  memcpy(&scvt, &s, sizeof(scvt));
14595
15546
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15547
+ #endif
14596
15548
  sum += (ggml_float)val;
14597
15549
  st[i] = val;
14598
15550
  }
@@ -14608,7 +15560,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14608
15560
  ggml_vec_log_f32(nc, st, st);
14609
15561
  ggml_vec_mul_f32(nc, st, st, s1);
14610
15562
 
14611
- ggml_vec_sum_f32(nc, sums + ith, st);
15563
+ float st_sum = 0;
15564
+ ggml_vec_sum_f32(nc, &st_sum, st);
15565
+ sums[ith] += st_sum;
14612
15566
 
14613
15567
  #ifndef NDEBUG
14614
15568
  for (int i = 0; i < nc; ++i) {
@@ -14658,7 +15612,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14658
15612
  return;
14659
15613
  }
14660
15614
 
14661
- const float eps = 1e-9f;
15615
+ const double eps = 1e-9;
14662
15616
 
14663
15617
  // TODO: handle transposed/permuted matrices
14664
15618
  const int64_t nc = src0->ne[0];
@@ -14677,7 +15631,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14677
15631
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
14678
15632
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
14679
15633
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
14680
- float * sm = (float *) params->wdata + ith*nc;
14681
15634
 
14682
15635
  #ifndef NDEBUG
14683
15636
  for (int i = 0; i < nc; ++i) {
@@ -14686,54 +15639,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14686
15639
  assert(!isnan(s1[i]));
14687
15640
  }
14688
15641
  #endif
14689
- // step by step explanation:
14690
- {
14691
- //float * sums = (float *) params->wdata;
14692
-
14693
- // forward pass with annotated gradients from backward pass
14694
- // (built by going in reverse operation order, adding to gradients of current operation args)
14695
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
14696
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
14697
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
14698
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
14699
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
14700
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
14701
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
14702
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
14703
-
14704
- // substitute into grad[st1], because we can reuse softmax_back from this point on
14705
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
14706
- // postorder:
14707
- // grad[st1] := softmax(s0)
14708
- // grad[st1] := grad[st1]*(1.0 - eps)
14709
- // grad[st1] := grad[st1] + eps
14710
- // grad[st1] := s1 / grad[st1]
14711
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
14712
-
14713
- // src0 gradients by going through softmax_back
14714
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
14715
- // from softmax_back:
14716
- // dxk = yk * (dyk - dot(y, dy))
14717
- // dot_y_dy := dot(y, dy)
14718
- // dx := dy
14719
- // dx := dx - dot_y_dy
14720
- // dx := dx * y
14721
- // postorder:
14722
- // dot_st1_dst1 := dot(st1, grad[st1])
14723
- // grad[s0] := grad[st1]
14724
- // grad[s0] := grad[s0] - dot_st1_dst1
14725
- // grad[s0] := grad[s0] * st1
14726
-
14727
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
14728
- // sm := softmax(s0)
14729
- // grad[s0] := sm*(1.0 - eps)
14730
- // grad[s0] := grad[s0] + eps
14731
- // grad[s0] := s1 / grad[s0]
14732
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
14733
- // dot_st1_dst1 := dot(sm, grad[s0])
14734
- // grad[s0] := grad[s0] - dot_st1_dst1
14735
- // grad[s0] := grad[s0] * sm
14736
- }
14737
15642
 
14738
15643
  // soft_max
14739
15644
  ggml_float sum = 0.0;
@@ -14741,39 +15646,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14741
15646
  float max = -INFINITY;
14742
15647
  ggml_vec_max_f32(nc, &max, s0);
14743
15648
 
14744
- uint16_t scvt;
15649
+ uint16_t scvt; UNUSED(scvt);
14745
15650
  for (int i = 0; i < nc; i++) {
14746
15651
  if (s0[i] == -INFINITY) {
14747
- sm[i] = 0.0f;
15652
+ ds0[i] = 0.0f;
14748
15653
  } else {
14749
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15654
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15655
+ const float s = s0[i] - max;
15656
+ const float val = expf(s);
15657
+ #else
14750
15658
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
14751
15659
  memcpy(&scvt, &s, sizeof(scvt));
14752
15660
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15661
+ #endif
14753
15662
  sum += (ggml_float)val;
14754
- sm[i] = val;
15663
+ ds0[i] = val;
14755
15664
  }
14756
15665
  }
14757
15666
 
14758
15667
  assert(sum > 0.0);
14759
- sum = 1.0/sum;
15668
+ sum = (1.0 - eps)/sum;
14760
15669
  }
14761
15670
 
14762
- float dot_st1_dst1 = 0;
14763
- ggml_vec_scale_f32(nc, sm, sum);
14764
- ggml_vec_cpy_f32 (nc, ds0, sm);
14765
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
14766
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
14767
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
14768
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
14769
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
14770
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
14771
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
15671
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
15672
+ ggml_vec_scale_f32(nc, ds0, sum);
15673
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
15674
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
15675
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
15676
+
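
The rewritten backward pass uses the closed form grad(src0) = (softmax(src0) - src1) * d / nr (plus the small eps smoothing), matching the 1/nr scaling now applied to the forward loss. A standalone check of that formula against central finite differences of the mean cross-entropy — one row, toy values, eps omitted:

#include <stdio.h>
#include <math.h>

#define N 4

// mean cross-entropy of one row: -sum_c target_c * log(softmax(logits)_c) / nr
static float cross_entropy(const float *logits, const float *target, int nr) {
    float max = logits[0];
    for (int i = 1; i < N; i++) if (logits[i] > max) max = logits[i];
    float sum = 0.0f;
    for (int i = 0; i < N; i++) sum += expf(logits[i] - max);
    float loss = 0.0f;
    for (int i = 0; i < N; i++) loss -= target[i]*(logits[i] - max - logf(sum));
    return loss/nr;
}

int main(void) {
    const float logits[N] = { 0.2f, -1.3f, 2.0f, 0.0f };
    const float target[N] = { 0.1f,  0.1f, 0.7f, 0.1f }; // sums to 1
    const int nr = 1; // pretend this is the only row in the batch

    // softmax(logits)
    float max = logits[0], sum = 0.0f, y[N];
    for (int i = 1; i < N; i++) if (logits[i] > max) max = logits[i];
    for (int i = 0; i < N; i++) { y[i] = expf(logits[i] - max); sum += y[i]; }
    for (int i = 0; i < N; i++) y[i] /= sum;

    const float h = 1e-3f;
    for (int k = 0; k < N; k++) {
        const float analytic = (y[k] - target[k])/nr; // (softmax - target) * d / nr, with d = 1

        float lp[N], lm[N];
        for (int i = 0; i < N; i++) { lp[i] = logits[i]; lm[i] = logits[i]; }
        lp[k] += h; lm[k] -= h;
        const float numeric = (cross_entropy(lp, target, nr) - cross_entropy(lm, target, nr))/(2.0f*h);

        printf("k=%d analytic=% .6f numeric=% .6f\n", k, analytic, numeric);
    }
    return 0;
}
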
14772
15677
 
14773
15678
  #ifndef NDEBUG
14774
15679
  for (int i = 0; i < nc; ++i) {
14775
- assert(!isnan(sm[i]));
14776
- assert(!isinf(sm[i]));
14777
15680
  assert(!isnan(ds0[i]));
14778
15681
  assert(!isinf(ds0[i]));
14779
15682
  }
@@ -14879,6 +15782,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14879
15782
  {
14880
15783
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14881
15784
  } break;
15785
+ case GGML_OP_CONCAT:
15786
+ {
15787
+ ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
15788
+ } break;
14882
15789
  case GGML_OP_SILU_BACK:
14883
15790
  {
14884
15791
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15802,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14895
15802
  {
14896
15803
  ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
14897
15804
  } break;
15805
+ case GGML_OP_GROUP_NORM:
15806
+ {
15807
+ ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
15808
+ } break;
14898
15809
  case GGML_OP_MUL_MAT:
14899
15810
  {
14900
15811
  ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15898,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14987
15898
  {
14988
15899
  ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14989
15900
  } break;
15901
+ case GGML_OP_CONV_TRANSPOSE_2D:
15902
+ {
15903
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
15904
+ } break;
14990
15905
  case GGML_OP_POOL_1D:
14991
15906
  {
14992
15907
  ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15910,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14995
15910
  {
14996
15911
  ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
14997
15912
  } break;
15913
+ case GGML_OP_UPSCALE:
15914
+ {
15915
+ ggml_compute_forward_upscale(params, tensor->src[0], tensor);
15916
+ } break;
14998
15917
  case GGML_OP_FLASH_ATTN:
14999
15918
  {
15000
15919
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15944,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15025
15944
  {
15026
15945
  ggml_compute_forward_unary(params, tensor->src[0], tensor);
15027
15946
  } break;
15947
+ case GGML_OP_GET_REL_POS:
15948
+ {
15949
+ ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
15950
+ } break;
15951
+ case GGML_OP_ADD_REL_POS:
15952
+ {
15953
+ ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15954
+ } break;
15028
15955
  case GGML_OP_MAP_UNARY:
15029
15956
  {
15030
15957
  ggml_unary_op_f32_t fun;
@@ -15288,6 +16215,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15288
16215
  inplace);
15289
16216
  }
15290
16217
  } break;
16218
+ case GGML_OP_CONCAT:
16219
+ {
16220
+ GGML_ASSERT(false); // TODO: implement
16221
+ } break;
15291
16222
  case GGML_OP_SILU_BACK:
15292
16223
  {
15293
16224
  GGML_ASSERT(false); // TODO: not implemented
@@ -15300,9 +16231,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15300
16231
  {
15301
16232
  // necessary for llama
15302
16233
  if (src0->grad) {
16234
+ float eps;
16235
+ memcpy(&eps, tensor->op_params, sizeof(float));
16236
+
15303
16237
  src0->grad = ggml_add_impl(ctx,
15304
16238
  src0->grad,
15305
- ggml_rms_norm_back(ctx, src0, tensor->grad),
16239
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
15306
16240
  inplace);
15307
16241
  }
15308
16242
  } break;
@@ -15310,6 +16244,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15310
16244
  {
15311
16245
  GGML_ASSERT(false); // TODO: not implemented
15312
16246
  } break;
16247
+ case GGML_OP_GROUP_NORM:
16248
+ {
16249
+ GGML_ASSERT(false); // TODO: not implemented
16250
+ } break;
15313
16251
  case GGML_OP_MUL_MAT:
15314
16252
  {
15315
16253
  // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16522,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15584
16522
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15585
16523
  const int mode = ((int32_t *) tensor->op_params)[2];
15586
16524
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16525
+ float freq_base;
16526
+ float freq_scale;
16527
+ float xpos_base;
16528
+ bool xpos_down;
16529
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16530
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16531
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16532
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16533
+
15587
16534
  src0->grad = ggml_add_impl(ctx,
15588
16535
  src0->grad,
15589
16536
  ggml_rope_back(ctx,
@@ -15591,7 +16538,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15591
16538
  n_past,
15592
16539
  n_dims,
15593
16540
  mode,
15594
- n_ctx),
16541
+ n_ctx,
16542
+ freq_base,
16543
+ freq_scale,
16544
+ xpos_base,
16545
+ xpos_down),
15595
16546
  inplace);
15596
16547
  }
15597
16548
  } break;
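Note: the ROPE (and, below, ROPE_BACK) backward cases now recover the extra hyper-parameters from op_params, a small int32 array, reading the float and bool values back bit-for-bit with memcpy rather than casts. A short sketch of the packing side this layout assumes (illustration only; the helper is hypothetical):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    // slots 0..3 hold n_past, n_dims, mode, n_ctx; slots 4..7 carry the new extras
    static void rope_pack_extra(int32_t * op_params,
                                float freq_base, float freq_scale,
                                float xpos_base, bool xpos_down) {
        memcpy(op_params + 4, &freq_base,  sizeof(float));
        memcpy(op_params + 5, &freq_scale, sizeof(float));
        memcpy(op_params + 6, &xpos_base,  sizeof(float));
        memcpy(op_params + 7, &xpos_down,  sizeof(bool));
    }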
@@ -15602,14 +16553,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15602
16553
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15603
16554
  const int mode = ((int32_t *) tensor->op_params)[2];
15604
16555
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16556
+ float freq_base;
16557
+ float freq_scale;
16558
+ float xpos_base;
16559
+ bool xpos_down;
16560
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16561
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16562
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16563
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16564
+
15605
16565
  src0->grad = ggml_add_impl(ctx,
15606
16566
  src0->grad,
15607
- ggml_rope(ctx,
16567
+ ggml_rope_impl(ctx,
15608
16568
  tensor->grad,
15609
16569
  n_past,
15610
16570
  n_dims,
15611
16571
  mode,
15612
- n_ctx),
16572
+ n_ctx,
16573
+ freq_base,
16574
+ freq_scale,
16575
+ xpos_base,
16576
+ xpos_down,
16577
+ false),
15613
16578
  inplace);
15614
16579
  }
15615
16580
  } break;
@@ -15629,6 +16594,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15629
16594
  {
15630
16595
  GGML_ASSERT(false); // TODO: not implemented
15631
16596
  } break;
16597
+ case GGML_OP_CONV_TRANSPOSE_2D:
16598
+ {
16599
+ GGML_ASSERT(false); // TODO: not implemented
16600
+ } break;
15632
16601
  case GGML_OP_POOL_1D:
15633
16602
  {
15634
16603
  GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16606,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15637
16606
  {
15638
16607
  GGML_ASSERT(false); // TODO: not implemented
15639
16608
  } break;
16609
+ case GGML_OP_UPSCALE:
16610
+ {
16611
+ GGML_ASSERT(false); // TODO: not implemented
16612
+ } break;
15640
16613
  case GGML_OP_FLASH_ATTN:
15641
16614
  {
15642
16615
  struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16851,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15878
16851
  GGML_ASSERT(false);
15879
16852
  }
15880
16853
  } break;
16854
+ case GGML_OP_GET_REL_POS:
16855
+ case GGML_OP_ADD_REL_POS:
15881
16856
  case GGML_OP_MAP_UNARY:
15882
16857
  case GGML_OP_MAP_BINARY:
15883
16858
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16029,9 +17004,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16029
17004
  return result;
16030
17005
  }
16031
17006
 
16032
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
16033
- struct ggml_cgraph result = *gf;
16034
-
17007
+ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
16035
17008
  GGML_ASSERT(gf->n_nodes > 0);
16036
17009
 
16037
17010
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16055,15 +17028,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16055
17028
  }
16056
17029
  }
16057
17030
 
16058
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
17031
+ for (int i = 0; i < gf->n_nodes; i++) {
16059
17032
  struct ggml_tensor * node = gf->nodes[i];
16060
17033
 
16061
17034
  if (node->is_param) {
16062
17035
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16063
- ggml_build_forward_expand(&result, node->grad);
17036
+ ggml_build_forward_expand(gb, node->grad);
16064
17037
  }
16065
17038
  }
17039
+ }
16066
17040
 
17041
+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
17042
+ struct ggml_cgraph result = *gf;
17043
+ ggml_build_backward_expand(ctx, gf, &result, keep);
16067
17044
  return result;
16068
17045
  }
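Note: ggml_build_backward is now a thin wrapper over the new ggml_build_backward_expand, which writes the gradient nodes into a caller-owned graph. A hypothetical caller, sketched against the API shown in this hunk (illustration only):

    // gf/gb are caller-owned; keep == true detaches the gradient nodes from gf
    static void build_graphs(struct ggml_context * ctx, struct ggml_tensor * loss,
                             struct ggml_cgraph * gf, struct ggml_cgraph * gb) {
        *gf = ggml_build_forward(loss);
        *gb = *gf;                                      // backward graph starts as a copy of the forward graph
        ggml_build_backward_expand(ctx, gf, gb, true);
    }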
16069
17046
 
@@ -16382,7 +17359,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16382
17359
 
16383
17360
  size_t cur = 0;
16384
17361
  if (ggml_is_quantized(node->type)) {
16385
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
17362
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16386
17363
  }
16387
17364
 
16388
17365
  work_size = MAX(work_size, cur);
@@ -16395,7 +17372,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16395
17372
  size_t cur = 0;
16396
17373
 
16397
17374
  if (ggml_is_quantized(node->src[0]->type)) {
16398
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
17375
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16399
17376
  }
16400
17377
 
16401
17378
  work_size = MAX(work_size, cur);
@@ -16407,7 +17384,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16407
17384
  size_t cur = 0;
16408
17385
 
16409
17386
  if (ggml_is_quantized(node->src[0]->type)) {
16410
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
17387
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16411
17388
  }
16412
17389
 
16413
17390
  work_size = MAX(work_size, cur);
@@ -16454,9 +17431,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16454
17431
  case GGML_OP_NORM:
16455
17432
  case GGML_OP_RMS_NORM:
16456
17433
  case GGML_OP_RMS_NORM_BACK:
17434
+ case GGML_OP_GROUP_NORM:
16457
17435
  {
16458
17436
  n_tasks = n_threads;
16459
17437
  } break;
17438
+ case GGML_OP_CONCAT:
16460
17439
  case GGML_OP_MUL_MAT:
16461
17440
  case GGML_OP_OUT_PROD:
16462
17441
  {
@@ -16490,12 +17469,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16490
17469
  // the threads are still spinning
16491
17470
  if (node->src[0]->type != GGML_TYPE_F32) {
16492
17471
  // here we need memory just for single 2D matrix from src0
16493
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
17472
+ cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
16494
17473
  }
16495
17474
  } else
16496
17475
  #endif
16497
17476
  if (node->src[1]->type != vec_dot_type) {
16498
- cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
17477
+ cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16499
17478
  } else {
16500
17479
  cur = 0;
16501
17480
  }
@@ -16524,6 +17503,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16524
17503
  case GGML_OP_SOFT_MAX_BACK:
16525
17504
  case GGML_OP_ROPE:
16526
17505
  case GGML_OP_ROPE_BACK:
17506
+ case GGML_OP_ADD_REL_POS:
16527
17507
  {
16528
17508
  n_tasks = n_threads;
16529
17509
  } break;
@@ -16598,6 +17578,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16598
17578
  GGML_ASSERT(false);
16599
17579
  }
16600
17580
 
17581
+ work_size = MAX(work_size, cur);
17582
+ } break;
17583
+ case GGML_OP_CONV_TRANSPOSE_2D:
17584
+ {
17585
+ n_tasks = n_threads;
17586
+
17587
+ const int64_t ne00 = node->src[0]->ne[0]; // W
17588
+ const int64_t ne01 = node->src[0]->ne[1]; // H
17589
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
17590
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
17591
+
17592
+ const int64_t ne10 = node->src[1]->ne[0]; // W
17593
+ const int64_t ne11 = node->src[1]->ne[1]; // H
17594
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
17595
+
17596
+ size_t cur = 0;
17597
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
17598
+ cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
17599
+
16601
17600
  work_size = MAX(work_size, cur);
16602
17601
  } break;
16603
17602
  case GGML_OP_POOL_1D:
@@ -16605,6 +17604,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16605
17604
  {
16606
17605
  n_tasks = 1;
16607
17606
  } break;
17607
+ case GGML_OP_UPSCALE:
17608
+ {
17609
+ n_tasks = n_threads;
17610
+ } break;
16608
17611
  case GGML_OP_FLASH_ATTN:
16609
17612
  {
16610
17613
  n_tasks = n_threads;
@@ -16666,6 +17669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16666
17669
  } break;
16667
17670
  case GGML_OP_WIN_PART:
16668
17671
  case GGML_OP_WIN_UNPART:
17672
+ case GGML_OP_GET_REL_POS:
16669
17673
  case GGML_OP_MAP_UNARY:
16670
17674
  case GGML_OP_MAP_BINARY:
16671
17675
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16712,10 +17716,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16712
17716
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16713
17717
  {
16714
17718
  n_tasks = n_threads;
16715
-
16716
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
16717
-
16718
- work_size = MAX(work_size, cur);
16719
17719
  } break;
16720
17720
  case GGML_OP_NONE:
16721
17721
  {
@@ -16783,8 +17783,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16783
17783
 
16784
17784
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
16785
17785
  GGML_ASSERT(rc == 0);
17786
+ UNUSED(rc);
16786
17787
  }
16787
17788
  }
17789
+
16788
17790
  workers[0].ith = 0;
16789
17791
  workers[0].shared = &state_shared;
16790
17792
 
@@ -16900,7 +17902,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16900
17902
  // compute size of intermediate results
16901
17903
  // TODO: does not take into account scratch buffers !!!!
16902
17904
  for (int i = 0; i < cgraph->n_nodes; ++i) {
16903
- size_eval += ggml_nbytes(cgraph->nodes[i]);
17905
+ size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
16904
17906
  }
16905
17907
 
16906
17908
  // print
@@ -17591,14 +18593,16 @@ static enum ggml_opt_result ggml_opt_adam(
17591
18593
  struct ggml_opt_params params,
17592
18594
  struct ggml_tensor * f,
17593
18595
  struct ggml_cgraph * gf,
17594
- struct ggml_cgraph * gb) {
18596
+ struct ggml_cgraph * gb,
18597
+ ggml_opt_callback callback,
18598
+ void * callback_data) {
17595
18599
  GGML_ASSERT(ggml_is_scalar(f));
17596
18600
 
17597
18601
  // these will store the parameters we want to optimize
17598
18602
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
17599
18603
 
17600
18604
  int np = 0;
17601
- int nx = 0;
18605
+ int64_t nx = 0;
17602
18606
  for (int i = 0; i < gf->n_nodes; ++i) {
17603
18607
  if (gf->nodes[i]->is_param) {
17604
18608
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -17617,31 +18621,32 @@ static enum ggml_opt_result ggml_opt_adam(
17617
18621
  }
17618
18622
 
17619
18623
  // constants
17620
- const float sched = params.adam.sched;
17621
- const float decay = params.adam.decay * sched;
17622
- const float alpha = params.adam.alpha * sched;
18624
+ float sched = params.adam.sched;
18625
+ const float alpha = params.adam.alpha;
18626
+ const float decay = params.adam.decay * alpha;
17623
18627
  const float beta1 = params.adam.beta1;
17624
18628
  const float beta2 = params.adam.beta2;
17625
18629
  const float eps = params.adam.eps;
18630
+ const float gclip = params.adam.gclip;
18631
+ const int decay_min_ndim = params.adam.decay_min_ndim;
17626
18632
 
17627
- float * x = opt->adam.x->data; // view of the parameters
17628
- float * g1 = opt->adam.g1->data; // gradient
17629
- float * g2 = opt->adam.g2->data; // gradient squared
17630
18633
  float * m = opt->adam.m->data; // first moment
17631
18634
  float * v = opt->adam.v->data; // second moment
17632
- float * mh = opt->adam.mh->data; // first moment hat
17633
- float * vh = opt->adam.vh->data; // second moment hat
17634
18635
 
17635
18636
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
17636
18637
 
17637
- // update view
17638
- ggml_opt_get_params(np, ps, x);
18638
+ if (callback) {
18639
+ callback(callback_data, &sched);
18640
+ }
17639
18641
 
17640
18642
  // compute the function value
17641
18643
  ggml_graph_reset (gf);
17642
18644
  ggml_set_f32 (f->grad, 1.0f);
17643
18645
 
17644
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18646
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18647
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18648
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18649
+ ggml_graph_compute(gb, &cplan);
17645
18650
 
17646
18651
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
17647
18652
  opt->adam.fx_best = opt->adam.fx_prev;
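Note: the optimizer now builds an explicit plan and work buffer once and reuses them for every ggml_graph_compute call, instead of going through ggml_graph_compute_with_ctx on each step. A self-contained sketch of the same pattern, with a heap allocation standing in for the context-owned work buffer used in the patch (illustration only):

    #include <stdint.h>
    #include <stdlib.h>

    static void compute_once(struct ggml_cgraph * gb, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gb, n_threads);
        uint8_t * work = cplan.work_size > 0 ? (uint8_t *) malloc(cplan.work_size) : NULL;
        cplan.work_data = work;        // the patch points this at a GGML_OBJECT_WORK_BUFFER inside the ctx instead
        ggml_graph_compute(gb, &cplan);
        free(work);
    }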
@@ -17649,6 +18654,9 @@ static enum ggml_opt_result ggml_opt_adam(
17649
18654
  pf[opt->iter % params.past] = opt->adam.fx_prev;
17650
18655
  }
17651
18656
 
18657
+ opt->loss_before = opt->adam.fx_prev;
18658
+ opt->loss_after = opt->adam.fx_prev;
18659
+
17652
18660
  // initialize
17653
18661
  if (opt->just_initialized) {
17654
18662
  opt->adam.n_no_improvement = 0;
@@ -17681,50 +18689,55 @@ static enum ggml_opt_result ggml_opt_adam(
17681
18689
  UNUSED(t_start_cpu);
17682
18690
 
17683
18691
  {
17684
- // update the gradient
17685
- ggml_opt_get_grad(np, ps, g1);
17686
-
17687
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
17688
- ggml_vec_scale_f32(nx, m, beta1);
17689
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
17690
-
17691
- // g2 = g1^2
17692
- ggml_vec_sqr_f32 (nx, g2, g1);
17693
-
17694
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
17695
- ggml_vec_scale_f32(nx, v, beta2);
17696
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
17697
-
17698
- // m^hat = m_t / (1 - beta1^t)
17699
- // v^hat = v_t / (1 - beta2^t)
17700
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
17701
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
17702
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
17703
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
17704
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
17705
- ggml_vec_cpy_f32 (nx, mh, m);
17706
- ggml_vec_cpy_f32 (nx, vh, v);
17707
-
17708
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
17709
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
17710
-
17711
- ggml_vec_sqrt_f32 (nx, vh, vh);
17712
- ggml_vec_acc1_f32 (nx, vh, eps);
17713
-
17714
- ggml_vec_div_f32 (nx, mh, mh, vh);
17715
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
17716
- ggml_vec_sub_f32 (nx, x, x, mh);
18692
+ float gnorm = 1.0f;
18693
+ if (gclip > 0.0f) {
18694
+ // gradient clipping
18695
+ ggml_float sum = 0.0;
18696
+ for (int p = 0; p < np; ++p) {
18697
+ const int64_t ne = ggml_nelements(ps[p]);
18698
+ for (int64_t j = 0; j < ne; ++j) {
18699
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
18700
+ sum += (ggml_float)(g*g);
18701
+ }
18702
+ }
18703
+ ggml_float norm = sqrt(sum);
18704
+ if (norm > (ggml_float) gclip) {
18705
+ gnorm = (float) ((ggml_float) gclip / norm);
18706
+ }
18707
+ }
18708
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
18709
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
18710
+ int64_t i = 0;
18711
+ for (int p = 0; p < np; ++p) {
18712
+ const int64_t ne = ggml_nelements(ps[p]);
18713
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
18714
+ for (int64_t j = 0; j < ne; ++j) {
18715
+ float x = ggml_get_f32_1d(ps[p], j);
18716
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
18717
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
18718
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
18719
+ float mh = m[i]*beta1h;
18720
+ float vh = v[i]*beta2h;
18721
+ vh = sqrtf(vh) + eps;
18722
+ x = x*(1.0f - p_decay) - mh/vh;
18723
+ ggml_set_f32_1d(ps[p], j, x);
18724
+ ++i;
18725
+ }
18726
+ }
18727
+ }
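Note: the vectorized Adam update (and its x/g1/g2/mh/vh scratch tensors) is replaced above by a per-element scalar loop that folds in global-norm gradient clipping, bias correction, and decoupled weight decay. The per-element update, pulled out into a free-standing sketch (illustration only; beta1h/beta2h are the precomputed bias-correction factors from the code above, gnorm the clipping factor):

    #include <math.h>

    static float adam_step(float x, float g, float * m, float * v, float gnorm,
                           float beta1, float beta2, float beta1h, float beta2h,
                           float eps, float p_decay) {
        g *= gnorm;                              // apply the global-norm clipping factor
        *m = *m*beta1 + g*(1.0f - beta1);        // first moment
        *v = *v*beta2 + g*g*(1.0f - beta2);      // second moment
        const float mh = *m*beta1h;              // bias-corrected and already scaled by alpha*sched
        const float vh = sqrtf(*v*beta2h) + eps;
        return x*(1.0f - p_decay) - mh/vh;       // decoupled weight decay, then the Adam step
    }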
17717
18728
 
17718
- // update the parameters
17719
- ggml_opt_set_params(np, ps, x);
18729
+ if (callback) {
18730
+ callback(callback_data, &sched);
17720
18731
  }
17721
18732
 
17722
18733
  ggml_graph_reset (gf);
17723
18734
  ggml_set_f32 (f->grad, 1.0f);
17724
18735
 
17725
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18736
+ ggml_graph_compute(gb, &cplan);
17726
18737
 
17727
18738
  const float fx = ggml_get_f32_1d(f, 0);
18739
+ opt->loss_after = fx;
18740
+
17728
18741
 
17729
18742
  // check convergence
17730
18743
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
@@ -17793,7 +18806,6 @@ struct ggml_lbfgs_iteration_data {
17793
18806
  };
17794
18807
 
17795
18808
  static enum ggml_opt_result linesearch_backtracking(
17796
- struct ggml_context * ctx,
17797
18809
  const struct ggml_opt_params * params,
17798
18810
  int nx,
17799
18811
  float * x,
@@ -17805,8 +18817,11 @@ static enum ggml_opt_result linesearch_backtracking(
17805
18817
  struct ggml_tensor * f,
17806
18818
  struct ggml_cgraph * gf,
17807
18819
  struct ggml_cgraph * gb,
18820
+ struct ggml_cplan * cplan,
17808
18821
  const int np,
17809
- struct ggml_tensor * ps[]) {
18822
+ struct ggml_tensor * ps[],
18823
+ ggml_opt_callback callback,
18824
+ void * callback_data) {
17810
18825
  int count = 0;
17811
18826
 
17812
18827
  float width = 0.0f;
@@ -17835,6 +18850,12 @@ static enum ggml_opt_result linesearch_backtracking(
17835
18850
  dgtest = params->lbfgs.ftol*dginit;
17836
18851
 
17837
18852
  while (true) {
18853
+ if (callback) {
18854
+ // L-BFGS does not support a learning rate -> ignore the learning schedule
18855
+ float sched = 0;
18856
+ callback(callback_data, &sched);
18857
+ }
18858
+
17838
18859
  ggml_vec_cpy_f32(nx, x, xp);
17839
18860
  ggml_vec_mad_f32(nx, x, d, *step);
17840
18861
 
@@ -17845,7 +18866,7 @@ static enum ggml_opt_result linesearch_backtracking(
17845
18866
  ggml_graph_reset (gf);
17846
18867
  ggml_set_f32 (f->grad, 1.0f);
17847
18868
 
17848
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
18869
+ ggml_graph_compute(gb, cplan);
17849
18870
 
17850
18871
  ggml_opt_get_grad(np, ps, g);
17851
18872
 
@@ -17905,7 +18926,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17905
18926
  struct ggml_opt_params params,
17906
18927
  struct ggml_tensor * f,
17907
18928
  struct ggml_cgraph * gf,
17908
- struct ggml_cgraph * gb) {
18929
+ struct ggml_cgraph * gb,
18930
+ ggml_opt_callback callback,
18931
+ void * callback_data) {
17909
18932
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
17910
18933
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
17911
18934
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -17937,6 +18960,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17937
18960
  opt->iter = iter;
17938
18961
  }
17939
18962
 
18963
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18964
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18965
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18966
+
17940
18967
  float * x = opt->lbfgs.x->data; // current parameters
17941
18968
  float * xp = opt->lbfgs.xp->data; // previous parameters
17942
18969
  float * g = opt->lbfgs.g->data; // current gradient
@@ -17958,6 +18985,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17958
18985
  float * lm_s = opt->lbfgs.lms->data;
17959
18986
  float * lm_y = opt->lbfgs.lmy->data;
17960
18987
 
18988
+ if (callback) {
18989
+ // L-BFGS does not support a learning rate -> ignore the learning schedule
18990
+ float sched = 0;
18991
+ callback(callback_data, &sched);
18992
+ }
18993
+
17961
18994
  // evaluate the function value and its gradient
17962
18995
  {
17963
18996
  ggml_opt_set_params(np, ps, x);
@@ -17965,11 +18998,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17965
18998
  ggml_graph_reset (gf);
17966
18999
  ggml_set_f32 (f->grad, 1.0f);
17967
19000
 
17968
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
19001
+ ggml_graph_compute(gb, &cplan);
17969
19002
 
17970
19003
  ggml_opt_get_grad(np, ps, g);
17971
19004
 
17972
19005
  fx = ggml_get_f32_1d(f, 0);
19006
+
19007
+ opt->loss_before = fx;
19008
+ opt->loss_after = fx;
17973
19009
  }
17974
19010
 
17975
19011
  // search direction = -gradient
@@ -18024,7 +19060,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18024
19060
  ggml_vec_cpy_f32(nx, xp, x);
18025
19061
  ggml_vec_cpy_f32(nx, gp, g);
18026
19062
 
18027
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
19063
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
18028
19064
 
18029
19065
  if (ls < 0) {
18030
19066
  // linesearch failed - go back to the previous point and return
@@ -18034,6 +19070,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18034
19070
  return ls;
18035
19071
  }
18036
19072
 
19073
+ opt->loss_after = fx;
19074
+
18037
19075
  ggml_vec_norm_f32(nx, &xnorm, x);
18038
19076
  ggml_vec_norm_f32(nx, &gnorm, g);
18039
19077
 
@@ -18091,7 +19129,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18091
19129
  // ys = y^t \cdot s -> 1 / \rho.
18092
19130
  // yy = y^t \cdot y.
18093
19131
  //
18094
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
19132
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
18095
19133
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
18096
19134
 
18097
19135
  lm_ys[end[0]] = ys;
@@ -18154,13 +19192,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18154
19192
  .adam = {
18155
19193
  .n_iter = 10000,
18156
19194
  .sched = 1.000f,
18157
- .decay = 0.001f,
19195
+ .decay = 0.0f,
19196
+ .decay_min_ndim = 2,
18158
19197
  .alpha = 0.001f,
18159
19198
  .beta1 = 0.9f,
18160
19199
  .beta2 = 0.999f,
18161
19200
  .eps = 1e-8f,
18162
19201
  .eps_f = 1e-5f,
18163
19202
  .eps_g = 1e-3f,
19203
+ .gclip = 0.0f,
18164
19204
  },
18165
19205
  };
18166
19206
  } break;
@@ -18210,23 +19250,13 @@ GGML_API void ggml_opt_init(
18210
19250
  switch (opt->params.type) {
18211
19251
  case GGML_OPT_ADAM:
18212
19252
  {
18213
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18214
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18215
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18216
19253
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18217
19254
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18218
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18219
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18220
19255
  opt->adam.pf = params.past > 0
18221
19256
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
18222
19257
  : NULL;
18223
- ggml_set_zero(opt->adam.x);
18224
- ggml_set_zero(opt->adam.g1);
18225
- ggml_set_zero(opt->adam.g2);
18226
19258
  ggml_set_zero(opt->adam.m);
18227
19259
  ggml_set_zero(opt->adam.v);
18228
- ggml_set_zero(opt->adam.mh);
18229
- ggml_set_zero(opt->adam.vh);
18230
19260
  if (opt->adam.pf) {
18231
19261
  ggml_set_zero(opt->adam.pf);
18232
19262
  }
@@ -18301,8 +19331,8 @@ enum ggml_opt_result ggml_opt_resume(
18301
19331
  struct ggml_tensor * f) {
18302
19332
 
18303
19333
  // build forward + backward compute graphs
18304
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
18305
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
19334
+ struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
19335
+ struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18306
19336
 
18307
19337
  struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18308
19338
  struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
@@ -18310,7 +19340,7 @@ enum ggml_opt_result ggml_opt_resume(
18310
19340
  *gf = ggml_build_forward (f);
18311
19341
  *gb = ggml_build_backward(ctx, gf, true);
18312
19342
 
18313
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
19343
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
18314
19344
  }
18315
19345
 
18316
19346
  enum ggml_opt_result ggml_opt_resume_g(
@@ -18318,7 +19348,9 @@ enum ggml_opt_result ggml_opt_resume_g(
18318
19348
  struct ggml_opt_context * opt,
18319
19349
  struct ggml_tensor * f,
18320
19350
  struct ggml_cgraph * gf,
18321
- struct ggml_cgraph * gb) {
19351
+ struct ggml_cgraph * gb,
19352
+ ggml_opt_callback callback,
19353
+ void * callback_data) {
18322
19354
 
18323
19355
  // build forward + backward compute graphs
18324
19356
  enum ggml_opt_result result = GGML_OPT_OK;
@@ -18326,11 +19358,11 @@ enum ggml_opt_result ggml_opt_resume_g(
18326
19358
  switch (opt->params.type) {
18327
19359
  case GGML_OPT_ADAM:
18328
19360
  {
18329
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
19361
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
18330
19362
  } break;
18331
19363
  case GGML_OPT_LBFGS:
18332
19364
  {
18333
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
19365
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
18334
19366
  } break;
18335
19367
  }
18336
19368
 
@@ -18561,64 +19593,1164 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18561
19593
 
18562
19594
  ////////////////////////////////////////////////////////////////////////////////
18563
19595
 
18564
- int ggml_cpu_has_avx(void) {
18565
- #if defined(__AVX__)
18566
- return 1;
18567
- #else
18568
- return 0;
18569
- #endif
18570
- }
19596
+ struct gguf_str {
19597
+ uint64_t n; // GGUFv2
19598
+ char * data;
19599
+ };
18571
19600
 
18572
- int ggml_cpu_has_avx2(void) {
18573
- #if defined(__AVX2__)
18574
- return 1;
18575
- #else
18576
- return 0;
18577
- #endif
18578
- }
19601
+ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
19602
+ [GGUF_TYPE_UINT8] = sizeof(uint8_t),
19603
+ [GGUF_TYPE_INT8] = sizeof(int8_t),
19604
+ [GGUF_TYPE_UINT16] = sizeof(uint16_t),
19605
+ [GGUF_TYPE_INT16] = sizeof(int16_t),
19606
+ [GGUF_TYPE_UINT32] = sizeof(uint32_t),
19607
+ [GGUF_TYPE_INT32] = sizeof(int32_t),
19608
+ [GGUF_TYPE_FLOAT32] = sizeof(float),
19609
+ [GGUF_TYPE_BOOL] = sizeof(bool),
19610
+ [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
19611
+ [GGUF_TYPE_UINT64] = sizeof(uint64_t),
19612
+ [GGUF_TYPE_INT64] = sizeof(int64_t),
19613
+ [GGUF_TYPE_FLOAT64] = sizeof(double),
19614
+ [GGUF_TYPE_ARRAY] = 0, // undefined
19615
+ };
19616
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19617
+
19618
+ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19619
+ [GGUF_TYPE_UINT8] = "u8",
19620
+ [GGUF_TYPE_INT8] = "i8",
19621
+ [GGUF_TYPE_UINT16] = "u16",
19622
+ [GGUF_TYPE_INT16] = "i16",
19623
+ [GGUF_TYPE_UINT32] = "u32",
19624
+ [GGUF_TYPE_INT32] = "i32",
19625
+ [GGUF_TYPE_FLOAT32] = "f32",
19626
+ [GGUF_TYPE_BOOL] = "bool",
19627
+ [GGUF_TYPE_STRING] = "str",
19628
+ [GGUF_TYPE_ARRAY] = "arr",
19629
+ [GGUF_TYPE_UINT64] = "u64",
19630
+ [GGUF_TYPE_INT64] = "i64",
19631
+ [GGUF_TYPE_FLOAT64] = "f64",
19632
+ };
19633
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19634
+
19635
+ union gguf_value {
19636
+ uint8_t uint8;
19637
+ int8_t int8;
19638
+ uint16_t uint16;
19639
+ int16_t int16;
19640
+ uint32_t uint32;
19641
+ int32_t int32;
19642
+ float float32;
19643
+ uint64_t uint64;
19644
+ int64_t int64;
19645
+ double float64;
19646
+ bool bool_;
19647
+
19648
+ struct gguf_str str;
19649
+
19650
+ struct {
19651
+ enum gguf_type type;
19652
+
19653
+ uint64_t n; // GGUFv2
19654
+ void * data;
19655
+ } arr;
19656
+ };
18579
19657
 
18580
- int ggml_cpu_has_avx512(void) {
18581
- #if defined(__AVX512F__)
18582
- return 1;
18583
- #else
18584
- return 0;
18585
- #endif
18586
- }
19658
+ struct gguf_kv {
19659
+ struct gguf_str key;
18587
19660
 
18588
- int ggml_cpu_has_avx512_vbmi(void) {
18589
- #if defined(__AVX512VBMI__)
18590
- return 1;
18591
- #else
18592
- return 0;
18593
- #endif
18594
- }
19661
+ enum gguf_type type;
19662
+ union gguf_value value;
19663
+ };
18595
19664
 
18596
- int ggml_cpu_has_avx512_vnni(void) {
18597
- #if defined(__AVX512VNNI__)
18598
- return 1;
18599
- #else
18600
- return 0;
18601
- #endif
18602
- }
19665
+ struct gguf_header {
19666
+ uint32_t magic;
19667
+ uint32_t version;
19668
+ uint64_t n_tensors; // GGUFv2
19669
+ uint64_t n_kv; // GGUFv2
19670
+ };
18603
19671
 
18604
- int ggml_cpu_has_fma(void) {
18605
- #if defined(__FMA__)
18606
- return 1;
18607
- #else
18608
- return 0;
18609
- #endif
18610
- }
19672
+ struct gguf_tensor_info {
19673
+ struct gguf_str name;
18611
19674
 
18612
- int ggml_cpu_has_neon(void) {
18613
- #if defined(__ARM_NEON)
18614
- return 1;
18615
- #else
18616
- return 0;
18617
- #endif
18618
- }
19675
+ uint32_t n_dims;
19676
+ uint64_t ne[GGML_MAX_DIMS];
18619
19677
 
18620
- int ggml_cpu_has_arm_fma(void) {
18621
- #if defined(__ARM_FEATURE_FMA)
19678
+ enum ggml_type type;
19679
+
19680
+ uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
19681
+
19682
+ // for writing API
19683
+ const void * data;
19684
+ size_t size;
19685
+ };
19686
+
19687
+ struct gguf_context {
19688
+ struct gguf_header header;
19689
+
19690
+ struct gguf_kv * kv;
19691
+ struct gguf_tensor_info * infos;
19692
+
19693
+ size_t alignment;
19694
+ size_t offset; // offset of `data` from beginning of file
19695
+ size_t size; // size of `data` in bytes
19696
+
19697
+ //uint8_t * padding;
19698
+ void * data;
19699
+ };
19700
+
19701
+ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
19702
+ const size_t n = fread(dst, 1, size, file);
19703
+ *offset += n;
19704
+ return n == size;
19705
+ }
19706
+
19707
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19708
+ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
19709
+ p->n = 0;
19710
+ p->data = NULL;
19711
+
19712
+ bool ok = true;
19713
+
19714
+ ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19715
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19716
+
19717
+ return ok;
19718
+ }
19719
+
19720
+ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
19721
+ p->n = 0;
19722
+ p->data = NULL;
19723
+
19724
+ bool ok = true;
19725
+
19726
+ uint32_t n = 0;
19727
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
19728
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19729
+
19730
+ return ok;
19731
+ }
19732
+
19733
+ struct gguf_context * gguf_init_empty(void) {
19734
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19735
+
19736
+ ctx->header.magic = GGUF_MAGIC;
19737
+ ctx->header.version = GGUF_VERSION;
19738
+ ctx->header.n_tensors = 0;
19739
+ ctx->header.n_kv = 0;
19740
+
19741
+ ctx->kv = NULL;
19742
+ ctx->infos = NULL;
19743
+
19744
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19745
+ ctx->offset = 0;
19746
+ ctx->size = 0;
19747
+
19748
+ ctx->data = NULL;
19749
+
19750
+ return ctx;
19751
+ }
19752
+
19753
+ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
19754
+ FILE * file = fopen(fname, "rb");
19755
+ if (!file) {
19756
+ return NULL;
19757
+ }
19758
+
19759
+ // offset from start of file
19760
+ size_t offset = 0;
19761
+
19762
+ uint32_t magic = 0;
19763
+
19764
+ // check the magic before making allocations
19765
+ {
19766
+ gguf_fread_el(file, &magic, sizeof(magic), &offset);
19767
+
19768
+ if (magic != GGUF_MAGIC) {
19769
+ fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
19770
+ fclose(file);
19771
+ return NULL;
19772
+ }
19773
+ }
19774
+
19775
+ bool ok = true;
19776
+
19777
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19778
+
19779
+ // read the header
19780
+ {
19781
+ ctx->header.magic = magic;
19782
+
19783
+ ctx->kv = NULL;
19784
+ ctx->infos = NULL;
19785
+ ctx->data = NULL;
19786
+
19787
+ ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
19788
+
19789
+ if (ctx->header.version == 1) {
19790
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19791
+ uint32_t n_tensors = 0;
19792
+ uint32_t n_kv = 0;
19793
+
19794
+ ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
19795
+ ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
19796
+
19797
+ ctx->header.n_tensors = n_tensors;
19798
+ ctx->header.n_kv = n_kv;
19799
+ } else {
19800
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19801
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19802
+ }
19803
+
19804
+ if (!ok) {
19805
+ fprintf(stderr, "%s: failed to read header\n", __func__);
19806
+ fclose(file);
19807
+ gguf_free(ctx);
19808
+ return NULL;
19809
+ }
19810
+ }
19811
+
19812
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19813
+ bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
19814
+ if (ctx->header.version == 1) {
19815
+ gguf_fread_str = gguf_fread_str_v1;
19816
+ }
19817
+
19818
+ // read the kv pairs
19819
+ {
19820
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19821
+
19822
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19823
+ struct gguf_kv * kv = &ctx->kv[i];
19824
+
19825
+ //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
19826
+
19827
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
19828
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19829
+
19830
+ //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
19831
+
19832
+ switch (kv->type) {
19833
+ case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
19834
+ case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
19835
+ case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
19836
+ case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
19837
+ case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
19838
+ case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
19839
+ case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
19840
+ case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
19841
+ case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
19842
+ case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
19843
+ case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
19844
+ case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
19845
+ case GGUF_TYPE_ARRAY:
19846
+ {
19847
+ ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19848
+
19849
+ if (ctx->header.version == 1) {
19850
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19851
+ uint32_t n = 0;
19852
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
19853
+ kv->value.arr.n = n;
19854
+ } else {
19855
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19856
+ }
19857
+
19858
+ switch (kv->value.arr.type) {
19859
+ case GGUF_TYPE_UINT8:
19860
+ case GGUF_TYPE_INT8:
19861
+ case GGUF_TYPE_UINT16:
19862
+ case GGUF_TYPE_INT16:
19863
+ case GGUF_TYPE_UINT32:
19864
+ case GGUF_TYPE_INT32:
19865
+ case GGUF_TYPE_FLOAT32:
19866
+ case GGUF_TYPE_UINT64:
19867
+ case GGUF_TYPE_INT64:
19868
+ case GGUF_TYPE_FLOAT64:
19869
+ case GGUF_TYPE_BOOL:
19870
+ {
19871
+ kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
19872
+ ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
19873
+ } break;
19874
+ case GGUF_TYPE_STRING:
19875
+ {
19876
+ kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
19877
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
19878
+ ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
19879
+ }
19880
+ } break;
19881
+ case GGUF_TYPE_ARRAY:
19882
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
19883
+ };
19884
+ } break;
19885
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
19886
+ };
19887
+
19888
+ if (!ok) {
19889
+ break;
19890
+ }
19891
+ }
19892
+
19893
+ if (!ok) {
19894
+ fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
19895
+ fclose(file);
19896
+ gguf_free(ctx);
19897
+ return NULL;
19898
+ }
19899
+ }
19900
+
19901
+ // read the tensor infos
19902
+ {
19903
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19904
+
19905
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19906
+ struct gguf_tensor_info * info = &ctx->infos[i];
19907
+
19908
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
19909
+ info->ne[j] = 1;
19910
+ }
19911
+
19912
+ ok = ok && gguf_fread_str(file, &info->name, &offset);
19913
+ ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19914
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
19915
+ if (ctx->header.version == 1) {
19916
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19917
+ uint32_t t = 0;
19918
+ ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
19919
+ info->ne[j] = t;
19920
+ } else {
19921
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19922
+ }
19923
+ }
19924
+ ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19925
+ ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
19926
+
19927
+ if (!ok) {
19928
+ fprintf(stderr, "%s: failed to read tensor info\n", __func__);
19929
+ fclose(file);
19930
+ gguf_free(ctx);
19931
+ return NULL;
19932
+ }
19933
+ }
19934
+ }
19935
+
19936
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19937
+
19938
+ int alignment_idx = gguf_find_key(ctx, "general.alignment");
19939
+ if (alignment_idx != -1) {
19940
+ ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
19941
+ }
19942
+
19943
+ // we require the data section to be aligned, so take into account any padding
19944
+ {
19945
+ const size_t offset_pad = offset % ctx->alignment;
19946
+
19947
+ if (offset_pad != 0) {
19948
+ offset += ctx->alignment - offset_pad;
19949
+ fseek(file, offset, SEEK_SET);
19950
+ }
19951
+ }
19952
+
19953
+ // store the current file offset - this is where the data section starts
19954
+ ctx->offset = offset;
19955
+
19956
+ // compute the total size of the data section, taking into account the alignment
19957
+ {
19958
+ ctx->size = 0;
19959
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19960
+ struct gguf_tensor_info * info = &ctx->infos[i];
19961
+
19962
+ const int64_t ne =
19963
+ (int64_t) info->ne[0] *
19964
+ (int64_t) info->ne[1] *
19965
+ (int64_t) info->ne[2] *
19966
+ (int64_t) info->ne[3];
19967
+
19968
+ if (ne % ggml_blck_size(info->type) != 0) {
19969
+ fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
19970
+ __func__, info->name.data, ne, ggml_blck_size(info->type));
19971
+ fclose(file);
19972
+ gguf_free(ctx);
19973
+ return NULL;
19974
+ }
19975
+
19976
+ const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
19977
+
19978
+ ctx->size += GGML_PAD(size_cur, ctx->alignment);
19979
+ }
19980
+ }
19981
+
19982
+ // load the tensor data only if requested
19983
+ if (params.ctx != NULL) {
19984
+ // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
19985
+ // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
19986
+ // the ggml_tensor structs to the appropriate locations in the binary blob
19987
+
19988
+ // compute the exact size needed for the new ggml_context
19989
+ const size_t mem_size =
19990
+ params.no_alloc ?
19991
+ (ctx->header.n_tensors )*ggml_tensor_overhead() :
19992
+ (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
19993
+
19994
+ struct ggml_init_params pdata = {
19995
+ .mem_size = mem_size,
19996
+ .mem_buffer = NULL,
19997
+ .no_alloc = params.no_alloc,
19998
+ };
19999
+
20000
+ *params.ctx = ggml_init(pdata);
20001
+
20002
+ struct ggml_context * ctx_data = *params.ctx;
20003
+
20004
+ struct ggml_tensor * data = NULL;
20005
+
20006
+ if (params.no_alloc == false) {
20007
+ data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
20008
+
20009
+ ok = ok && data != NULL;
20010
+
20011
+ // read the binary blob with the tensor data
20012
+ ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
20013
+
20014
+ if (!ok) {
20015
+ fprintf(stderr, "%s: failed to read tensor data\n", __func__);
20016
+ fclose(file);
20017
+ ggml_free(ctx_data);
20018
+ gguf_free(ctx);
20019
+ return NULL;
20020
+ }
20021
+
20022
+ ctx->data = data->data;
20023
+ }
20024
+
20025
+ ggml_set_no_alloc(ctx_data, true);
20026
+
20027
+ // create the tensors
20028
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20029
+ const int64_t ne[GGML_MAX_DIMS] = {
20030
+ ctx->infos[i].ne[0],
20031
+ ctx->infos[i].ne[1],
20032
+ ctx->infos[i].ne[2],
20033
+ ctx->infos[i].ne[3],
20034
+ };
20035
+
20036
+ struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
20037
+
20038
+ ok = ok && cur != NULL;
20039
+
20040
+ ggml_set_name(cur, ctx->infos[i].name.data);
20041
+
20042
+ if (!ok) {
20043
+ break;
20044
+ }
20045
+
20046
+ // point the data member to the appropriate location in the binary blob using the tensor infos
20047
+ if (params.no_alloc == false) {
20048
+ //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
20049
+ cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
20050
+ }
20051
+ }
20052
+
20053
+ if (!ok) {
20054
+ fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
20055
+ fclose(file);
20056
+ ggml_free(ctx_data);
20057
+ gguf_free(ctx);
20058
+ return NULL;
20059
+ }
20060
+
20061
+ ggml_set_no_alloc(ctx_data, params.no_alloc);
20062
+ }
20063
+
20064
+ fclose(file);
20065
+
20066
+ return ctx;
20067
+ }
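Note: a hypothetical caller of the reader added above, loading both the metadata and the tensor data into a fresh ggml context (illustration only, not part of the patch):

    #include <stdio.h>

    static void dump_gguf(const char * fname) {
        struct ggml_context * ctx_data = NULL;
        struct gguf_init_params params = { .no_alloc = false, .ctx = &ctx_data };

        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (!ctx) { fprintf(stderr, "failed to load %s\n", fname); return; }

        printf("version: %d, tensors: %d, kv pairs: %d\n",
               gguf_get_version(ctx), gguf_get_n_tensors(ctx), gguf_get_n_kv(ctx));

        gguf_free(ctx);
        ggml_free(ctx_data);
    }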
20068
+
20069
+ void gguf_free(struct gguf_context * ctx) {
20070
+ if (ctx == NULL) {
20071
+ return;
20072
+ }
20073
+
20074
+ if (ctx->kv) {
20075
+ // free string memory - not great..
20076
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
20077
+ struct gguf_kv * kv = &ctx->kv[i];
20078
+
20079
+ if (kv->key.data) {
20080
+ free(kv->key.data);
20081
+ }
20082
+
20083
+ if (kv->type == GGUF_TYPE_STRING) {
20084
+ if (kv->value.str.data) {
20085
+ free(kv->value.str.data);
20086
+ }
20087
+ }
20088
+
20089
+ if (kv->type == GGUF_TYPE_ARRAY) {
20090
+ if (kv->value.arr.data) {
20091
+ if (kv->value.arr.type == GGUF_TYPE_STRING) {
20092
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
20093
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
20094
+ if (str->data) {
20095
+ free(str->data);
20096
+ }
20097
+ }
20098
+ }
20099
+ free(kv->value.arr.data);
20100
+ }
20101
+ }
20102
+ }
20103
+
20104
+ free(ctx->kv);
20105
+ }
20106
+
20107
+ if (ctx->infos) {
20108
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20109
+ struct gguf_tensor_info * info = &ctx->infos[i];
20110
+
20111
+ if (info->name.data) {
20112
+ free(info->name.data);
20113
+ }
20114
+ }
20115
+
20116
+ free(ctx->infos);
20117
+ }
20118
+
20119
+ GGML_ALIGNED_FREE(ctx);
20120
+ }
20121
+
20122
+ const char * gguf_type_name(enum gguf_type type) {
20123
+ return GGUF_TYPE_NAME[type];
20124
+ }
20125
+
20126
+ int gguf_get_version(struct gguf_context * ctx) {
20127
+ return ctx->header.version;
20128
+ }
20129
+
20130
+ size_t gguf_get_alignment(struct gguf_context * ctx) {
20131
+ return ctx->alignment;
20132
+ }
20133
+
20134
+ size_t gguf_get_data_offset(struct gguf_context * ctx) {
20135
+ return ctx->offset;
20136
+ }
20137
+
20138
+ void * gguf_get_data(struct gguf_context * ctx) {
20139
+ return ctx->data;
20140
+ }
20141
+
20142
+ int gguf_get_n_kv(struct gguf_context * ctx) {
20143
+ return ctx->header.n_kv;
20144
+ }
20145
+
20146
+ int gguf_find_key(struct gguf_context * ctx, const char * key) {
20147
+ // return -1 if key not found
20148
+ int keyfound = -1;
20149
+
20150
+ const int n_kv = gguf_get_n_kv(ctx);
20151
+
20152
+ for (int i = 0; i < n_kv; ++i) {
20153
+ if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
20154
+ keyfound = i;
20155
+ break;
20156
+ }
20157
+ }
20158
+
20159
+ return keyfound;
20160
+ }
20161
+
20162
+ const char * gguf_get_key(struct gguf_context * ctx, int i) {
20163
+ return ctx->kv[i].key.data;
20164
+ }
20165
+
20166
+ enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
20167
+ return ctx->kv[i].type;
20168
+ }
20169
+
20170
+ enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
20171
+ return ctx->kv[i].value.arr.type;
20172
+ }
20173
+
20174
+ const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
20175
+ return ctx->kv[i].value.arr.data;
20176
+ }
20177
+
20178
+ const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
20179
+ struct gguf_kv * kv = &ctx->kv[key_id];
20180
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
20181
+ return str->data;
20182
+ }
20183
+
20184
+ int gguf_get_arr_n(struct gguf_context * ctx, int i) {
20185
+ return ctx->kv[i].value.arr.n;
20186
+ }
20187
+
20188
+ uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
20189
+ return ctx->kv[i].value.uint8;
20190
+ }
20191
+
20192
+ int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
20193
+ return ctx->kv[i].value.int8;
20194
+ }
20195
+
20196
+ uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
20197
+ return ctx->kv[i].value.uint16;
20198
+ }
20199
+
20200
+ int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
20201
+ return ctx->kv[i].value.int16;
20202
+ }
20203
+
20204
+ uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
20205
+ return ctx->kv[i].value.uint32;
20206
+ }
20207
+
20208
+ int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
20209
+ return ctx->kv[i].value.int32;
20210
+ }
20211
+
20212
+ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
20213
+ return ctx->kv[i].value.float32;
20214
+ }
20215
+
20216
+ uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
20217
+ return ctx->kv[i].value.uint64;
20218
+ }
20219
+
20220
+ int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
20221
+ return ctx->kv[i].value.int64;
20222
+ }
20223
+
20224
+ double gguf_get_val_f64(struct gguf_context * ctx, int i) {
20225
+ return ctx->kv[i].value.float64;
20226
+ }
20227
+
20228
+ bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
20229
+ return ctx->kv[i].value.bool_;
20230
+ }
20231
+
20232
+ const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
20233
+ return ctx->kv[i].value.str.data;
20234
+ }
20235
+
20236
+ int gguf_get_n_tensors(struct gguf_context * ctx) {
20237
+ return ctx->header.n_tensors;
20238
+ }
20239
+
20240
+ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
20241
+ // return -1 if tensor not found
20242
+ int tensorfound = -1;
20243
+
20244
+ const int n_tensors = gguf_get_n_tensors(ctx);
20245
+
20246
+ for (int i = 0; i < n_tensors; ++i) {
20247
+ if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
20248
+ tensorfound = i;
20249
+ break;
20250
+ }
20251
+ }
20252
+
20253
+ return tensorfound;
20254
+ }
20255
+
20256
+ size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
20257
+ return ctx->infos[i].offset;
20258
+ }
20259
+
20260
+ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
20261
+ return ctx->infos[i].name.data;
20262
+ }
20263
+
20264
+ // returns the index
20265
+ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
20266
+ const int idx = gguf_find_key(ctx, key);
20267
+ if (idx >= 0) {
20268
+ return idx;
20269
+ }
20270
+
20271
+ const int n_kv = gguf_get_n_kv(ctx);
20272
+
20273
+ ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
20274
+ ctx->kv[n_kv].key.n = strlen(key);
20275
+ ctx->kv[n_kv].key.data = strdup(key);
20276
+ ctx->header.n_kv++;
20277
+
20278
+ return n_kv;
20279
+ }
20280
+
20281
+ void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
20282
+ const int idx = gguf_get_or_add_key(ctx, key);
20283
+
20284
+ ctx->kv[idx].type = GGUF_TYPE_UINT8;
20285
+ ctx->kv[idx].value.uint8 = val;
20286
+ }
20287
+
20288
+ void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
20289
+ const int idx = gguf_get_or_add_key(ctx, key);
20290
+
20291
+ ctx->kv[idx].type = GGUF_TYPE_INT8;
20292
+ ctx->kv[idx].value.int8 = val;
20293
+ }
20294
+
20295
+ void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
20296
+ const int idx = gguf_get_or_add_key(ctx, key);
20297
+
20298
+ ctx->kv[idx].type = GGUF_TYPE_UINT16;
20299
+ ctx->kv[idx].value.uint16 = val;
20300
+ }
20301
+
20302
+ void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
20303
+ const int idx = gguf_get_or_add_key(ctx, key);
20304
+
20305
+ ctx->kv[idx].type = GGUF_TYPE_INT16;
20306
+ ctx->kv[idx].value.int16 = val;
20307
+ }
20308
+
20309
+ void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
20310
+ const int idx = gguf_get_or_add_key(ctx, key);
20311
+
20312
+ ctx->kv[idx].type = GGUF_TYPE_UINT32;
20313
+ ctx->kv[idx].value.uint32 = val;
20314
+ }
20315
+
20316
+ void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
20317
+ const int idx = gguf_get_or_add_key(ctx, key);
20318
+
20319
+ ctx->kv[idx].type = GGUF_TYPE_INT32;
20320
+ ctx->kv[idx].value.int32 = val;
20321
+ }
20322
+
20323
+ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
20324
+ const int idx = gguf_get_or_add_key(ctx, key);
20325
+
20326
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
20327
+ ctx->kv[idx].value.float32 = val;
20328
+ }
20329
+
20330
+ void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
20331
+ const int idx = gguf_get_or_add_key(ctx, key);
20332
+
20333
+ ctx->kv[idx].type = GGUF_TYPE_UINT64;
20334
+ ctx->kv[idx].value.uint64 = val;
20335
+ }
20336
+
20337
+ void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
20338
+ const int idx = gguf_get_or_add_key(ctx, key);
20339
+
20340
+ ctx->kv[idx].type = GGUF_TYPE_INT64;
20341
+ ctx->kv[idx].value.int64 = val;
20342
+ }
20343
+
20344
+ void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
20345
+ const int idx = gguf_get_or_add_key(ctx, key);
20346
+
20347
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
20348
+ ctx->kv[idx].value.float64 = val;
20349
+ }
20350
+
20351
+ void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
20352
+ const int idx = gguf_get_or_add_key(ctx, key);
20353
+
20354
+ ctx->kv[idx].type = GGUF_TYPE_BOOL;
20355
+ ctx->kv[idx].value.bool_ = val;
20356
+ }
20357
+
20358
+ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
20359
+ const int idx = gguf_get_or_add_key(ctx, key);
20360
+
20361
+ ctx->kv[idx].type = GGUF_TYPE_STRING;
20362
+ ctx->kv[idx].value.str.n = strlen(val);
20363
+ ctx->kv[idx].value.str.data = strdup(val);
20364
+ }
20365
+
20366
+ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
20367
+ const int idx = gguf_get_or_add_key(ctx, key);
20368
+
20369
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20370
+ ctx->kv[idx].value.arr.type = type;
20371
+ ctx->kv[idx].value.arr.n = n;
20372
+ ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
20373
+ memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
20374
+ }
20375
+
20376
+ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
20377
+ const int idx = gguf_get_or_add_key(ctx, key);
20378
+
20379
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20380
+ ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
20381
+ ctx->kv[idx].value.arr.n = n;
20382
+ ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20383
+ for (int i = 0; i < n; i++) {
20384
+ struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
20385
+ str->n = strlen(data[i]);
20386
+ str->data = strdup(data[i]);
20387
+ }
20388
+ }
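// [Editor's sketch, not part of the upstream diff] The array setters: raw element
// arrays are copied with memcpy, string arrays are deep-copied element by element
// via strdup. Key names below are hypothetical.
static void gguf_example_set_arrays(struct gguf_context * gctx) {
    const float  scales[4] = { 1.0f, 0.5f, 0.25f, 0.125f };
    const char * tokens[3] = { "<s>", "</s>", "<unk>" };

    gguf_set_arr_data(gctx, "example.scales", GGUF_TYPE_FLOAT32, scales, 4);
    gguf_set_arr_str (gctx, "example.tokens", tokens, 3);
}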
20389
+
20390
+ // set or add KV pairs from another context
20391
+ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
20392
+ for (uint32_t i = 0; i < src->header.n_kv; i++) {
20393
+ switch (src->kv[i].type) {
20394
+ case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
20395
+ case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
20396
+ case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
20397
+ case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
20398
+ case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
20399
+ case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
20400
+ case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
20401
+ case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
20402
+ case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
20403
+ case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
20404
+ case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
20405
+ case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
20406
+ case GGUF_TYPE_ARRAY:
20407
+ {
20408
+ if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
20409
+ const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
20410
+ for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
20411
+ data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
20412
+ }
20413
+ gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
20414
+ free(data);
20415
+ } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
20416
+ GGML_ASSERT(false && "nested arrays not supported");
20417
+ } else {
20418
+ gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
20419
+ }
20420
+ } break;
20421
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20422
+ }
20423
+ }
20424
+ }
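// [Editor's sketch, not part of the upstream diff] gguf_set_kv() is useful when
// rewriting a file: load an existing GGUF for its metadata only, copy every KV
// pair into a fresh context, then override or append keys before writing.
// Assumes gguf_init_from_file and struct gguf_init_params from ggml.h;
// "in.gguf" is a placeholder path. Because the setters deep-copy strings and
// arrays, the source context can be freed immediately afterwards.
static struct gguf_context * gguf_example_clone_kv(void) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,   // metadata only - do not load tensor data
        /*.ctx      =*/ NULL,
    };

    struct gguf_context * src = gguf_init_from_file("in.gguf", params);
    if (src == NULL) {
        return NULL;
    }

    struct gguf_context * dst = gguf_init_empty();

    gguf_set_kv(dst, src);                              // copy all KV pairs from src
    gguf_set_val_str(dst, "general.name", "patched");   // then override/extend

    gguf_free(src);
    return dst;
}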
20425
+
20426
+ void gguf_add_tensor(
20427
+ struct gguf_context * ctx,
20428
+ const struct ggml_tensor * tensor) {
20429
+ const int idx = ctx->header.n_tensors;
20430
+ ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
20431
+
20432
+ ctx->infos[idx].name.n = strlen(tensor->name);
20433
+ ctx->infos[idx].name.data = strdup(tensor->name);
20434
+
20435
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
20436
+ ctx->infos[idx].ne[i] = 1;
20437
+ }
20438
+
20439
+ ctx->infos[idx].n_dims = tensor->n_dims;
20440
+ for (int i = 0; i < tensor->n_dims; i++) {
20441
+ ctx->infos[idx].ne[i] = tensor->ne[i];
20442
+ }
20443
+
20444
+ ctx->infos[idx].type = tensor->type;
20445
+ ctx->infos[idx].offset = 0;
20446
+ ctx->infos[idx].data = tensor->data;
20447
+ ctx->infos[idx].size = ggml_nbytes(tensor);
20448
+
20449
+ if (ctx->header.n_tensors > 0) {
20450
+ ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
20451
+ }
20452
+
20453
+ ctx->header.n_tensors++;
20454
+ }
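// [Editor's sketch, not part of the upstream diff] Registering a tensor: the info
// entry records the tensor's name, shape, type, data pointer and size; its offset
// is the previous entry's offset plus that entry's size padded to ctx->alignment.
// Assumes the usual ggml tensor API; the 16 MB pool and tensor name are arbitrary.
static void gguf_example_add_tensor(struct gguf_context * gctx) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_set_name(t, "example.weight");
    // ... fill t->data here ...

    gguf_add_tensor(gctx, t);   // name/shape/type/size are copied, the data pointer is kept

    // note: only the pointer is stored, so ctx (and t->data) must stay alive
    // until the file or the meta blob has been written
}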
20455
+
20456
+ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
20457
+ const int idx = gguf_find_tensor(ctx, name);
20458
+ if (idx < 0) {
20459
+ GGML_ASSERT(false && "tensor not found");
20460
+ }
20461
+
20462
+ ctx->infos[idx].type = type;
20463
+ }
20464
+
20465
+ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
20466
+ const int idx = gguf_find_tensor(ctx, name);
20467
+ if (idx < 0) {
20468
+ GGML_ASSERT(false && "tensor not found");
20469
+ }
20470
+
20471
+ ctx->infos[idx].data = data;
20472
+ ctx->infos[idx].size = size;
20473
+
20474
+ // update offsets
20475
+ for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
20476
+ ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
20477
+ }
20478
+ }
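// [Editor's note, not part of the upstream diff] The offset invariant used above
// and in gguf_add_tensor(): with the default 32-byte alignment,
//
//   offset[i] = offset[i-1] + GGML_PAD(size[i-1], alignment)
//
// e.g. tensor sizes of 100, 64 and 20 bytes give offsets 0, 128 (100 padded up
// to 128) and 192 (128 + 64). gguf_set_tensor_data() re-runs this recurrence for
// every tensor after the one whose size changed.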
20479
+
20480
+ //static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
20481
+ // fwrite(&val->n, sizeof(val->n), 1, file);
20482
+ // fwrite(val->data, sizeof(char), val->n, file);
20483
+ //}
20484
+ //
20485
+ //static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
20486
+ // fwrite(val, sizeof(char), size, file);
20487
+ //}
20488
+
20489
+ struct gguf_buf {
20490
+ void * data;
20491
+ size_t size;
20492
+ size_t offset;
20493
+ };
20494
+
20495
+ static struct gguf_buf gguf_buf_init(size_t size) {
20496
+ struct gguf_buf buf = {
20497
+ /*buf.data =*/ size == 0 ? NULL : malloc(size),
20498
+ /*buf.size =*/ size,
20499
+ /*buf.offset =*/ 0,
20500
+ };
20501
+
20502
+ return buf;
20503
+ }
20504
+
20505
+ static void gguf_buf_free(struct gguf_buf buf) {
20506
+ if (buf.data) {
20507
+ free(buf.data);
20508
+ }
20509
+ }
20510
+
20511
+ static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
20512
+ if (buf->offset + size > buf->size) {
20513
+ buf->size = 1.5*(buf->offset + size);
20514
+ if (buf->data) {
20515
+ buf->data = realloc(buf->data, buf->size);
20516
+ }
20517
+ }
20518
+ }
20519
+
20520
+ static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
20521
+ gguf_buf_grow(buf, sizeof(val->n) + val->n);
20522
+
20523
+ if (buf->data) {
20524
+ memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
20525
+ }
20526
+ buf->offset += sizeof(val->n);
20527
+
20528
+ if (buf->data) {
20529
+ memcpy((char *) buf->data + buf->offset, val->data, val->n);
20530
+ }
20531
+ buf->offset += val->n;
20532
+ }
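// [Editor's note, not part of the upstream diff] Strings are serialized as a
// length prefix (sizeof(val->n) bytes, the gguf_str length field) followed by
// exactly that many raw bytes - no terminating NUL is written, so readers must
// rely on the stored length.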
20533
+
20534
+ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
20535
+ gguf_buf_grow(buf, el_size);
20536
+
20537
+ if (buf->data) {
20538
+ memcpy((char *) buf->data + buf->offset, val, el_size);
20539
+ }
20540
+ buf->offset += el_size;
20541
+ }
20542
+
20543
+ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
20544
+ // write header
20545
+ gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
20546
+ gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
20547
+ gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
20548
+ gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
20549
+
20550
+ // write key-value pairs
20551
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
20552
+ struct gguf_kv * kv = &ctx->kv[i];
20553
+
20554
+ gguf_bwrite_str(buf, &kv->key);
20555
+ gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
20556
+
20557
+ switch (kv->type) {
20558
+ case GGUF_TYPE_UINT8: gguf_bwrite_el (buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
20559
+ case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
20560
+ case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
20561
+ case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
20562
+ case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
20563
+ case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
20564
+ case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
20565
+ case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
20566
+ case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
20567
+ case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
20568
+ case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
20569
+ case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
20570
+ case GGUF_TYPE_ARRAY:
20571
+ {
20572
+ gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
20573
+ gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
20574
+
20575
+ switch (kv->value.arr.type) {
20576
+ case GGUF_TYPE_UINT8:
20577
+ case GGUF_TYPE_INT8:
20578
+ case GGUF_TYPE_UINT16:
20579
+ case GGUF_TYPE_INT16:
20580
+ case GGUF_TYPE_UINT32:
20581
+ case GGUF_TYPE_INT32:
20582
+ case GGUF_TYPE_FLOAT32:
20583
+ case GGUF_TYPE_UINT64:
20584
+ case GGUF_TYPE_INT64:
20585
+ case GGUF_TYPE_FLOAT64:
20586
+ case GGUF_TYPE_BOOL:
20587
+ {
20588
+ gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
20589
+ } break;
20590
+ case GGUF_TYPE_STRING:
20591
+ {
20592
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
20593
+ gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
20594
+ }
20595
+ } break;
20596
+ case GGUF_TYPE_ARRAY:
20597
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20598
+ };
20599
+ } break;
20600
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
20601
+ };
20602
+ }
20603
+
20604
+ // write tensor infos
20605
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20606
+ struct gguf_tensor_info * info = &ctx->infos[i];
20607
+
20608
+ gguf_bwrite_str(buf, &info->name);
20609
+ gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
20610
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
20611
+ gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
20612
+ }
20613
+ gguf_bwrite_el(buf, &info->type, sizeof(info->type));
20614
+ gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
20615
+ }
20616
+
20617
+ // we require the data section to be aligned, so take into account any padding
20618
+ {
20619
+ const size_t offset = buf->offset;
20620
+ const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
20621
+
20622
+ if (offset_pad != offset) {
20623
+ uint8_t pad = 0;
20624
+ for (size_t i = 0; i < offset_pad - offset; ++i) {
20625
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
20626
+ }
20627
+ }
20628
+ }
20629
+
20630
+ if (only_meta) {
20631
+ return;
20632
+ }
20633
+
20634
+ size_t offset = 0;
20635
+
20636
+ // write tensor data
20637
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20638
+ struct gguf_tensor_info * info = &ctx->infos[i];
20639
+
20640
+ const size_t size = info->size;
20641
+ const size_t size_pad = GGML_PAD(size, ctx->alignment);
20642
+
20643
+ gguf_bwrite_el(buf, info->data, size);
20644
+
20645
+ if (size_pad != size) {
20646
+ uint8_t pad = 0;
20647
+ for (size_t j = 0; j < size_pad - size; ++j) {
20648
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
20649
+ }
20650
+ }
20651
+
20652
+ GGML_ASSERT(offset == info->offset);
20653
+
20654
+ offset += size_pad;
20655
+ }
20656
+ }
20657
+
20658
+ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
20659
+ FILE * file = fopen(fname, "wb");
20660
+ if (!file) {
20661
+ GGML_ASSERT(false && "failed to open file for writing");
20662
+ }
20663
+
20664
+ struct gguf_buf buf = gguf_buf_init(16*1024);
20665
+
20666
+ gguf_write_to_buf(ctx, &buf, only_meta);
20667
+
20668
+ fwrite(buf.data, 1, buf.offset, file);
20669
+
20670
+ gguf_buf_free(buf);
20671
+
20672
+ fclose(file);
20673
+ }
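// [Editor's sketch, not part of the upstream diff] End-to-end: build a context,
// attach metadata and a tensor, then serialize. "out.gguf" and the key name are
// placeholders; error handling is omitted for brevity.
static void gguf_example_write(struct ggml_tensor * t) {
    struct gguf_context * gctx = gguf_init_empty();

    gguf_set_val_str(gctx, "general.architecture", "example");
    gguf_add_tensor (gctx, t);

    // only_meta = false: header + KV pairs + tensor infos + padded tensor data
    gguf_write_to_file(gctx, "out.gguf", false);

    gguf_free(gctx);
}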
20674
+
20675
+ size_t gguf_get_meta_size(struct gguf_context * ctx) {
20676
+ // no allocs - only compute size
20677
+ struct gguf_buf buf = gguf_buf_init(0);
20678
+
20679
+ gguf_write_to_buf(ctx, &buf, true);
20680
+
20681
+ return buf.offset;
20682
+ }
20683
+
20684
+ void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
20685
+ struct gguf_buf buf = gguf_buf_init(16*1024);
20686
+
20687
+ gguf_write_to_buf(ctx, &buf, true);
20688
+
20689
+ memcpy(data, buf.data, buf.offset);
20690
+
20691
+ gguf_buf_free(buf);
20692
+ }
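// [Editor's sketch, not part of the upstream diff] Typical two-step use of the
// meta helpers: size the metadata blob first (the zero-sized gguf_buf makes the
// write pass a dry run), then copy it into a caller-owned buffer.
static void * gguf_example_meta_blob(struct gguf_context * gctx, size_t * size_out) {
    const size_t size = gguf_get_meta_size(gctx);   // first pass: count bytes only
    void * data = malloc(size);

    if (data != NULL) {
        gguf_get_meta_data(gctx, data);             // second pass: fill the buffer
    }

    *size_out = size;
    return data;
}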
20693
+
20694
+ ////////////////////////////////////////////////////////////////////////////////
20695
+
20696
+ int ggml_cpu_has_avx(void) {
20697
+ #if defined(__AVX__)
20698
+ return 1;
20699
+ #else
20700
+ return 0;
20701
+ #endif
20702
+ }
20703
+
20704
+ int ggml_cpu_has_avx2(void) {
20705
+ #if defined(__AVX2__)
20706
+ return 1;
20707
+ #else
20708
+ return 0;
20709
+ #endif
20710
+ }
20711
+
20712
+ int ggml_cpu_has_avx512(void) {
20713
+ #if defined(__AVX512F__)
20714
+ return 1;
20715
+ #else
20716
+ return 0;
20717
+ #endif
20718
+ }
20719
+
20720
+ int ggml_cpu_has_avx512_vbmi(void) {
20721
+ #if defined(__AVX512VBMI__)
20722
+ return 1;
20723
+ #else
20724
+ return 0;
20725
+ #endif
20726
+ }
20727
+
20728
+ int ggml_cpu_has_avx512_vnni(void) {
20729
+ #if defined(__AVX512VNNI__)
20730
+ return 1;
20731
+ #else
20732
+ return 0;
20733
+ #endif
20734
+ }
20735
+
20736
+ int ggml_cpu_has_fma(void) {
20737
+ #if defined(__FMA__)
20738
+ return 1;
20739
+ #else
20740
+ return 0;
20741
+ #endif
20742
+ }
20743
+
20744
+ int ggml_cpu_has_neon(void) {
20745
+ #if defined(__ARM_NEON)
20746
+ return 1;
20747
+ #else
20748
+ return 0;
20749
+ #endif
20750
+ }
20751
+
20752
+ int ggml_cpu_has_arm_fma(void) {
20753
+ #if defined(__ARM_FEATURE_FMA)
18622
20754
  return 1;
18623
20755
  #else
18624
20756
  return 0;
@@ -18685,6 +20817,14 @@ int ggml_cpu_has_sse3(void) {
18685
20817
  #endif
18686
20818
  }
18687
20819
 
20820
+ int ggml_cpu_has_ssse3(void) {
20821
+ #if defined(__SSSE3__)
20822
+ return 1;
20823
+ #else
20824
+ return 0;
20825
+ #endif
20826
+ }
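// [Editor's sketch, not part of the upstream diff] These helpers report
// compile-time CPU features (preprocessor checks, not runtime CPUID), so a
// quick capability dump is just a series of calls:
static void ggml_example_print_features(void) {
    printf("AVX = %d, AVX2 = %d, FMA = %d, NEON = %d, SSSE3 = %d\n",
           ggml_cpu_has_avx(), ggml_cpu_has_avx2(), ggml_cpu_has_fma(),
           ggml_cpu_has_neon(), ggml_cpu_has_ssse3());
}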
20827
+
18688
20828
  int ggml_cpu_has_vsx(void) {
18689
20829
  #if defined(__POWER9_VECTOR__)
18690
20830
  return 1;