llama_cpp 0.3.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
  #define GGML_GELU_FP16
  #define GGML_GELU_QUICK_FP16
  #define GGML_SILU_FP16
+ // #define GGML_CROSS_ENTROPY_EXP_FP16
+ // #define GGML_FLASH_ATTN_EXP_FP16

  #define GGML_SOFT_MAX_UNROLL 4
  #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +159,6 @@ typedef void * thread_ret_t;
  //#define GGML_SOFT_MAX_ACCELERATE
  #endif

- #if UINTPTR_MAX == 0xFFFFFFFF
- #define GGML_MEM_ALIGN 4
- #else
- #define GGML_MEM_ALIGN 16
- #endif
-
  //
  // logging
  //
@@ -192,8 +188,8 @@ typedef void * thread_ret_t;
  //

  #if defined(_MSC_VER) || defined(__MINGW32__)
- #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
- #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
+ #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
+ #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
  #else
  inline static void * ggml_aligned_malloc(size_t size) {
  void * aligned_memory = NULL;
@@ -213,14 +209,13 @@ inline static void * ggml_aligned_malloc(size_t size) {
  error_desc = "insufficient memory";
  break;
  }
- GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
- __func__, error_desc, size/(1024.0*1024.0));
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
  return NULL;
  }
  return aligned_memory;
  }
- #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
- #define GGML_ALIGNED_FREE(ptr) free(ptr)
+ #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+ #define GGML_ALIGNED_FREE(ptr) free(ptr)
  #endif

  #define UNUSED GGML_UNUSED
@@ -306,6 +301,10 @@ typedef double ggml_float;
  #endif
  #endif

+ #ifdef __riscv_v_intrinsic
+ #include <riscv_vector.h>
+ #endif
+
  #ifdef __F16C__

  #ifdef _MSC_VER
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);

  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+ [GGML_TYPE_I8] = {
+ .type_name = "i8",
+ .blck_size = 1,
+ .type_size = sizeof(int8_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_I16] = {
+ .type_name = "i16",
+ .blck_size = 1,
+ .type_size = sizeof(int16_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_I32] = {
+ .type_name = "i32",
+ .blck_size = 1,
+ .type_size = sizeof(int32_t),
+ .is_quantized = false,
+ },
  [GGML_TYPE_F32] = {
+ .type_name = "f32",
+ .blck_size = 1,
+ .type_size = sizeof(float),
+ .is_quantized = false,
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
  .vec_dot_type = GGML_TYPE_F32,
  },
  [GGML_TYPE_F16] = {
+ .type_name = "f16",
+ .blck_size = 1,
+ .type_size = sizeof(ggml_fp16_t),
+ .is_quantized = false,
  .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
  .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
  .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_F16,
  },
  [GGML_TYPE_Q4_0] = {
+ .type_name = "q4_0",
+ .blck_size = QK4_0,
+ .type_size = sizeof(block_q4_0),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q4_0,
  .from_float = quantize_row_q4_0,
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q4_1] = {
+ .type_name = "q4_1",
+ .blck_size = QK4_1,
+ .type_size = sizeof(block_q4_1),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q4_1,
  .from_float = quantize_row_q4_1,
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  [GGML_TYPE_Q5_0] = {
+ .type_name = "q5_0",
+ .blck_size = QK5_0,
+ .type_size = sizeof(block_q5_0),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q5_0,
  .from_float = quantize_row_q5_0,
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q5_1] = {
+ .type_name = "q5_1",
+ .blck_size = QK5_1,
+ .type_size = sizeof(block_q5_1),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q5_1,
  .from_float = quantize_row_q5_1,
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  [GGML_TYPE_Q8_0] = {
+ .type_name = "q8_0",
+ .blck_size = QK8_0,
+ .type_size = sizeof(block_q8_0),
+ .is_quantized = true,
  .to_float = dequantize_row_q8_0,
  .from_float = quantize_row_q8_0,
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q8_1] = {
+ .type_name = "q8_1",
+ .blck_size = QK8_1,
+ .type_size = sizeof(block_q8_1),
+ .is_quantized = true,
  .from_float = quantize_row_q8_1,
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  #ifdef GGML_USE_K_QUANTS
  [GGML_TYPE_Q2_K] = {
+ .type_name = "q2_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q2_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q2_K,
  .from_float = quantize_row_q2_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q3_K] = {
+ .type_name = "q3_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q3_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q3_K,
  .from_float = quantize_row_q3_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q4_K] = {
+ .type_name = "q4_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q4_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q4_K,
  .from_float = quantize_row_q4_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q5_K] = {
+ .type_name = "q5_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q5_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q5_K,
  .from_float = quantize_row_q5_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q6_K] = {
+ .type_name = "q6_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q6_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q6_K,
  .from_float = quantize_row_q6_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q8_K] = {
+ .type_name = "q8_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q8_K),
+ .is_quantized = true,
  .from_float = quantize_row_q8_K,
  }
  #endif
  };

  // For internal test use
- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
- GGML_ASSERT(i < GGML_TYPE_COUNT);
- return type_traits[i];
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+ GGML_ASSERT(type < GGML_TYPE_COUNT);
+ return type_traits[type];
  }


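Note: the type_traits entries above consolidate the per-type metadata (block size, type size, name, quantization flag) that the four standalone tables removed later in this diff used to carry. A minimal sketch, not part of the package diff, of how callers can reach that metadata through the public accessors that now read from type_traits (ggml_blck_size, ggml_type_size, ggml_type_name, ggml_is_quantized); example_row_bytes and example_print_type are illustrative names:

#include <stdio.h>
#include <stdint.h>
#include "ggml.h"  // assumed: the package's public header declaring the accessors

// bytes occupied by one row of n elements of the given type
// (assumes n is a multiple of the type's block size)
static size_t example_row_bytes(enum ggml_type type, int64_t n) {
    return (size_t)(n / ggml_blck_size(type)) * ggml_type_size(type);
}

static void example_print_type(enum ggml_type type) {
    printf("%s: blck_size=%d type_size=%zu quantized=%d\n",
           ggml_type_name(type), ggml_blck_size(type),
           ggml_type_size(type), (int) ggml_is_quantized(type));
}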
@@ -2363,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);

  const block_q4_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;
@@ -2372,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q4_0 * restrict x0 = &x[i + 0];
  const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2550,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  }

  // Main loop
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 2; i < nb; i+=2) {
  _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2607,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  }

  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
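Note: every new __riscv_v_intrinsic branch in these dot-product routines follows the same pattern: load the packed nibbles, split them into low and high halves, widen-multiply against the int8 block of the second operand, reduce to a scalar, then scale by the per-block deltas. A scalar sketch, not part of the package diff, of the per-block contribution that the q4_0 x q8_0 branch above vectorizes (block_q4_0, block_q8_0, QK4_0 and GGML_FP16_TO_FP32 are internal to ggml.c; this mirrors the existing scalar fallback):

// contribution of one q4_0/q8_0 block pair to the dot product
static float example_q4_0_q8_0_block(const block_q4_0 * x, const block_q8_0 * y) {
    int sumi = 0;
    for (int j = 0; j < QK4_0/2; ++j) {
        const int v0 = (x->qs[j] & 0x0F) - 8;  // low nibble, recentered to [-8, 7]
        const int v1 = (x->qs[j] >> 4)   - 8;  // high nibble
        sumi += v0*y->qs[j] + v1*y->qs[j + QK4_0/2];
    }
    return sumi * GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d);
}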
@@ -2633,7 +2742,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);

  const block_q4_1 * restrict x = vx;
  const block_q8_1 * restrict y = vy;
@@ -2645,6 +2753,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *

  float summs = 0;

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q4_1 * restrict x0 = &x[i + 0];
  const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2733,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc) + summs;
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -2759,7 +2900,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);
  assert(qk == QK5_0);

  const block_q5_0 * restrict x = vx;
@@ -2775,6 +2915,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  uint64_t tmp0[4];
  uint64_t tmp1[4];

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q5_0 * restrict x0 = &x[i];
  const block_q5_0 * restrict x1 = &x[i + 1];
@@ -2967,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc);
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ uint32_t qh;
+
+ // These temp values are for masking and shift operations
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+ // temporary registers
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
+ vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
+ vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
+ vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
+
+ // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+
+ // ((qh & (1u << (j + 16))) >> (j + 12));
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
+ vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+
+ // narrowing
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+ // load
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -2999,7 +3210,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);
  assert(qk == QK5_1);

  const block_q5_1 * restrict x = vx;
@@ -3018,6 +3228,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  uint64_t tmp0[4];
  uint64_t tmp1[4];

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q5_1 * restrict x0 = &x[i];
  const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3223,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc) + summs;
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ uint32_t qh;
+
+ // These temp values are for shift operations
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+ // temporary registers
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
+ vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
+
+ // load qh
+ vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+
+ // ((qh >> (j + 0)) << 4) & 0x10;
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+
+ // ((qh >> (j + 12)) ) & 0x10;
+ vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+
+ // narrowing
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+ // load
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -3255,7 +3532,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);

  const block_q8_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;
@@ -3264,6 +3540,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q8_0 * restrict x0 = &x[i + 0];
  const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3334,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc);
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+ size_t vl = __riscv_vsetvl_e8m1(qk);
+
+ for (int i = 0; i < nb; i++) {
+ // load elements
+ vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+ vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -3481,9 +3778,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

- static const float GELU_COEF_A = 0.044715f;
- static const float GELU_QUICK_COEF = -1.702f;
- static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ static const float GELU_COEF_A = 0.044715f;
+ static const float GELU_QUICK_COEF = -1.702f;
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

  inline static float ggml_gelu_f32(float x) {
  return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -3652,95 +3949,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
  // data types
  //

- static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = 1,
- [GGML_TYPE_F16] = 1,
- [GGML_TYPE_Q4_0] = QK4_0,
- [GGML_TYPE_Q4_1] = QK4_1,
- [GGML_TYPE_Q5_0] = QK5_0,
- [GGML_TYPE_Q5_1] = QK5_1,
- [GGML_TYPE_Q8_0] = QK8_0,
- [GGML_TYPE_Q8_1] = QK8_1,
- #ifdef GGML_USE_K_QUANTS
- [GGML_TYPE_Q2_K] = QK_K,
- [GGML_TYPE_Q3_K] = QK_K,
- [GGML_TYPE_Q4_K] = QK_K,
- [GGML_TYPE_Q5_K] = QK_K,
- [GGML_TYPE_Q6_K] = QK_K,
- [GGML_TYPE_Q8_K] = QK_K,
- #endif
- [GGML_TYPE_I8] = 1,
- [GGML_TYPE_I16] = 1,
- [GGML_TYPE_I32] = 1,
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
- static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = sizeof(float),
- [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
- [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
- [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
- [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
- [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
- [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
- [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
- #ifdef GGML_USE_K_QUANTS
- [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
- [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
- [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
- [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
- [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
- [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
- #endif
- [GGML_TYPE_I8] = sizeof(int8_t),
- [GGML_TYPE_I16] = sizeof(int16_t),
- [GGML_TYPE_I32] = sizeof(int32_t),
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
- static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = "f32",
- [GGML_TYPE_F16] = "f16",
- [GGML_TYPE_Q4_0] = "q4_0",
- [GGML_TYPE_Q4_1] = "q4_1",
- [GGML_TYPE_Q5_0] = "q5_0",
- [GGML_TYPE_Q5_1] = "q5_1",
- [GGML_TYPE_Q8_0] = "q8_0",
- [GGML_TYPE_Q8_1] = "q8_1",
- [GGML_TYPE_Q2_K] = "q2_K",
- [GGML_TYPE_Q3_K] = "q3_K",
- [GGML_TYPE_Q4_K] = "q4_K",
- [GGML_TYPE_Q5_K] = "q5_K",
- [GGML_TYPE_Q6_K] = "q6_K",
- [GGML_TYPE_Q8_K] = "q8_K",
- [GGML_TYPE_I8] = "i8",
- [GGML_TYPE_I16] = "i16",
- [GGML_TYPE_I32] = "i32",
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
- static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = false,
- [GGML_TYPE_F16] = false,
- [GGML_TYPE_Q4_0] = true,
- [GGML_TYPE_Q4_1] = true,
- [GGML_TYPE_Q5_0] = true,
- [GGML_TYPE_Q5_1] = true,
- [GGML_TYPE_Q8_0] = true,
- [GGML_TYPE_Q8_1] = true,
- [GGML_TYPE_Q2_K] = true,
- [GGML_TYPE_Q3_K] = true,
- [GGML_TYPE_Q4_K] = true,
- [GGML_TYPE_Q5_K] = true,
- [GGML_TYPE_Q6_K] = true,
- [GGML_TYPE_Q8_K] = true,
- [GGML_TYPE_I8] = false,
- [GGML_TYPE_I16] = false,
- [GGML_TYPE_I32] = false,
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
  static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "NONE",

@@ -3760,10 +3968,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "ARGMAX",
  "REPEAT",
  "REPEAT_BACK",
+ "CONCAT",
  "SILU_BACK",
  "NORM",
  "RMS_NORM",
  "RMS_NORM_BACK",
+ "GROUP_NORM",

  "MUL_MAT",
  "OUT_PROD",
@@ -3789,20 +3999,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CLAMP",
  "CONV_1D",
  "CONV_2D",
+ "CONV_TRANSPOSE_2D",
  "POOL_1D",
  "POOL_2D",
+ "UPSCALE",

  "FLASH_ATTN",
  "FLASH_FF",
  "FLASH_ATTN_BACK",
  "WIN_PART",
  "WIN_UNPART",
+ "GET_REL_POS",
+ "ADD_REL_POS",

  "UNARY",

  "MAP_UNARY",
  "MAP_BINARY",

+ "MAP_CUSTOM1_F32",
+ "MAP_CUSTOM2_F32",
+ "MAP_CUSTOM3_F32",
+
  "MAP_CUSTOM1",
  "MAP_CUSTOM2",
  "MAP_CUSTOM3",
@@ -3811,7 +4029,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -3832,10 +4050,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "argmax(x)",
  "repeat(x)",
  "repeat_back(x)",
+ "concat(x, y)",
  "silu_back(x)",
  "norm(x)",
  "rms_norm(x)",
  "rms_norm_back(x)",
+ "group_norm(x)",

  "X*Y",
  "X*Y",
@@ -3861,20 +4081,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "clamp(x)",
  "conv_1d(x)",
  "conv_2d(x)",
+ "conv_transpose_2d(x)",
  "pool_1d(x)",
  "pool_2d(x)",
+ "upscale(x)",

  "flash_attn(x)",
  "flash_ff(x)",
  "flash_attn_back(x)",
  "win_part(x)",
  "win_unpart(x)",
+ "get_rel_pos(x)",
+ "add_rel_pos(x)",

  "unary(x)",

  "f(x)",
  "f(x,y)",

+ "custom_f32(x)",
+ "custom_f32(x,y)",
+ "custom_f32(x,y,z)",
+
  "custom(x)",
  "custom(x,y)",
  "custom(x,y,z)",
@@ -3883,7 +4111,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -3913,8 +4141,10 @@ static void ggml_setup_op_has_task_pass(void) {
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
  p[GGML_OP_CONV_1D ] = true;
  p[GGML_OP_CONV_2D ] = true;
+ p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ p[GGML_OP_ADD_REL_POS ] = true;
  }

  { // FINALIZE
@@ -4101,38 +4331,41 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  }

  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
- // this should handle cases where the tensor is not contiguous in memory
- // probaby just:
- //
- // return tensor->ne[3]*tensor->nb[3]
- //
- // is enough, but just in case, adding the second part
+ size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ }
+ return nbytes;
+ }

- return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
+ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+ return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
  }

  size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

- return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+ return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
  }

  int ggml_blck_size(enum ggml_type type) {
- return GGML_BLCK_SIZE[type];
+ return type_traits[type].blck_size;
  }

  size_t ggml_type_size(enum ggml_type type) {
- return GGML_TYPE_SIZE[type];
+ return type_traits[type].type_size;
  }

  float ggml_type_sizef(enum ggml_type type) {
- return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+ return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
  }

  const char * ggml_type_name(enum ggml_type type) {
- return GGML_TYPE_NAME[type];
+ return type_traits[type].type_name;
+ }
+
+ bool ggml_is_quantized(enum ggml_type type) {
+ return type_traits[type].is_quantized;
  }

  const char * ggml_op_name(enum ggml_op op) {
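Note: ggml_nbytes() above now walks the actual strides instead of padding to GGML_MEM_ALIGN; the padded size is now a separate ggml_nbytes_pad(). A worked sketch, not part of the package diff, of the same formula for a contiguous 4x3 f32 tensor:

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

// ne = {4, 3, 1, 1} elements, nb = {4, 16, 48, 48} bytes, blck_size(F32) == 1
static size_t example_nbytes_4x3_f32(void) {
    const int64_t ne[4] = { 4, 3, 1, 1 };
    const size_t  nb[4] = { 4, 16, 48, 48 };
    size_t nbytes = (size_t) ne[0]*nb[0]/1;      // 16 bytes for the first row
    for (int i = 1; i < 4; ++i) {
        nbytes += (size_t)(ne[i] - 1)*nb[i];     // + 2*16 bytes for the remaining rows
    }
    assert(nbytes == 48);                        // 12 floats, as expected
    return nbytes;
}

For a strided view the (ne[i] - 1)*nb[i] terms count only the bytes reachable from the view, which the bounds check added to ggml_new_tensor_impl() further down relies on.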
@@ -4144,7 +4377,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
  }

  size_t ggml_element_size(const struct ggml_tensor * tensor) {
- return GGML_TYPE_SIZE[tensor->type];
+ return ggml_type_size(tensor->type);
  }

  static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4415,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
  (t0->ne[3] == t1->ne[3]);
  }

- bool ggml_is_quantized(enum ggml_type type) {
- return GGML_IS_QUANTIZED[type];
- }
-
  enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  enum ggml_type wtype = GGML_TYPE_COUNT;

@@ -4223,8 +4452,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
+ tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
@@ -4233,7 +4462,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
@@ -4248,7 +4477,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
@@ -4560,36 +4789,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  enum ggml_type type,
  int n_dims,
  const int64_t * ne,
- void * data) {
+ struct ggml_tensor * view_src,
+ size_t view_offs) {

  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

- size_t data_size = 0;
+ // find the base tensor and absolute offset
+ if (view_src != NULL && view_src->view_src != NULL) {
+ view_offs += view_src->view_offs;
+ view_src = view_src->view_src;
+ }
+
+ size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+ for (int i = 1; i < n_dims; i++) {
+ data_size *= ne[i];
+ }
+
+ GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));

- if (data == NULL && !ctx->no_alloc) {
- data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
- for (int i = 1; i < n_dims; i++) {
- data_size *= ne[i];
- }
+ void * data = view_src != NULL ? view_src->data : NULL;
+ if (data != NULL) {
+ data = (char *) data + view_offs;
  }

- if (ctx->scratch.data != NULL && data == NULL) {
- // allocate tensor data in the scratch buffer
- if (ctx->scratch.offs + data_size > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
- __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
- assert(false);
- return NULL;
- }
+ size_t obj_alloc_size = 0;

- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+ if (view_src == NULL && ctx->no_alloc == false) {
+ if (ctx->scratch.data != NULL) {
+ // allocate tensor data in the scratch buffer
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+ assert(false);
+ return NULL;
+ }

- ctx->scratch.offs += data_size;
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;

- data_size = 0;
+ ctx->scratch.offs += data_size;
+ } else {
+ // allocate tensor data in the context's memory pool
+ obj_alloc_size = data_size;
+ }
  }

- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);

  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here

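Note: ggml_new_tensor_impl() above now takes a (view_src, view_offs) pair instead of a raw data pointer, and the loop at the top collapses chains of views onto the base tensor. A minimal sketch, not part of the package diff, of the resulting invariant (tensor sizes are illustrative):

#include "ggml.h"  // assumed public header

static void example_view_chain(struct ggml_context * ctx) {
    struct ggml_tensor * base = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * v1   = ggml_view_1d(ctx, base, 512, 256*sizeof(float));
    struct ggml_tensor * v2   = ggml_view_1d(ctx, v1,   128,  64*sizeof(float));
    // v2->view_src == base (not v1), v2->view_offs == (256 + 64)*sizeof(float),
    // and v2->data == (char *) base->data + v2->view_offs.
    (void) v2;
}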
@@ -4609,7 +4853,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  /*.perf_runs =*/ 0,
  /*.perf_cycles =*/ 0,
  /*.perf_time_us =*/ 0,
- /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
+ /*.view_src =*/ view_src,
+ /*.view_offs =*/ view_offs,
+ /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
  /*.name =*/ { 0 },
  /*.extra =*/ NULL,
  /*.padding =*/ { 0 },
@@ -4622,8 +4868,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  result->ne[i] = ne[i];
  }

- result->nb[0] = GGML_TYPE_SIZE[type];
- result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+ result->nb[0] = ggml_type_size(type);
+ result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
  for (int i = 2; i < GGML_MAX_DIMS; i++) {
  result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
  }
@@ -4633,28 +4879,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  return result;
  }

- static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
- GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
- assert(params_size <= GGML_MAX_OP_PARAMS);
- memcpy(tensor->op_params, params, params_size);
- }
-
- static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
- return ((const int32_t *)(tensor->op_params))[i];
- }
-
- static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
- ((int32_t *)(tensor->op_params))[i] = value;
- }
-
  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
  enum ggml_type type,
  int n_dims,
  const int64_t * ne) {
- return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
+ return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
  }

  struct ggml_tensor * ggml_new_tensor_1d(
@@ -4719,7 +4949,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
  }

  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
+ return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+ }
+
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+ assert(params_size <= GGML_MAX_OP_PARAMS);
+ memcpy(tensor->op_params, params, params_size);
+ }
+
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ return ((const int32_t *)(tensor->op_params))[i];
+ }
+
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ ((int32_t *)(tensor->op_params))[i] = value;
  }

  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5005,14 +5251,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *

  struct ggml_tensor * ggml_view_tensor(
  struct ggml_context * ctx,
- const struct ggml_tensor * src) {
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+ struct ggml_tensor * src) {
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
  ggml_format_name(result, "%s (view)", src->name);

- result->nb[0] = src->nb[0];
- result->nb[1] = src->nb[1];
- result->nb[2] = src->nb[2];
- result->nb[3] = src->nb[3];
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ result->nb[i] = src->nb[i];
+ }

  return result;
  }
@@ -5545,10 +5790,6 @@ struct ggml_tensor * ggml_repeat(
  is_node = true;
  }

- if (ggml_are_same_shape(a, b) && !is_node) {
- return a;
- }
-
  struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);

  result->op = GGML_OP_REPEAT;
@@ -5587,6 +5828,30 @@ struct ggml_tensor * ggml_repeat_back(
  return result;
  }

+ // ggml_concat
+
+ struct ggml_tensor * ggml_concat(
+ struct ggml_context* ctx,
+ struct ggml_tensor* a,
+ struct ggml_tensor* b) {
+ GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+ result->op = GGML_OP_CONCAT;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+
+ return result;
+ }
+
  // ggml_abs

  struct ggml_tensor * ggml_abs(
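Note: the new ggml_concat() stacks its inputs along ne[2]; the other three dimensions must match (see the GGML_ASSERT above). A usage sketch, not part of the package diff:

#include "ggml.h"  // assumed public header

// concatenate two feature maps along the channel dimension (ne[2])
static struct ggml_tensor * example_concat_channels(struct ggml_context * ctx,
                                                    struct ggml_tensor * a,
                                                    struct ggml_tensor * b) {
    // requires a->ne[0] == b->ne[0], a->ne[1] == b->ne[1], a->ne[3] == b->ne[3]
    return ggml_concat(ctx, a, b);   // result->ne[2] == a->ne[2] + b->ne[2]
}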
@@ -5755,6 +6020,7 @@ struct ggml_tensor * ggml_silu_back(
  static struct ggml_tensor * ggml_norm_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
+ float eps,
  bool inplace) {
  bool is_node = false;

@@ -5765,7 +6031,7 @@ static struct ggml_tensor * ggml_norm_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

- // TODO: maybe store epsilon here?
+ ggml_set_op_params(result, &eps, sizeof(eps));

  result->op = GGML_OP_NORM;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +6042,20 @@ static struct ggml_tensor * ggml_norm_impl(

  struct ggml_tensor * ggml_norm(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_norm_impl(ctx, a, false);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_norm_impl(ctx, a, eps, false);
  }

  struct ggml_tensor * ggml_norm_inplace(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_norm_impl(ctx, a, true);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_norm_impl(ctx, a, eps, true);
  }

+ // ggml_rms_norm
+
  static struct ggml_tensor * ggml_rms_norm_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -5822,10 +6092,13 @@ struct ggml_tensor * ggml_rms_norm_inplace(
  return ggml_rms_norm_impl(ctx, a, eps, true);
  }

+ // ggml_rms_norm_back
+
  struct ggml_tensor * ggml_rms_norm_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b) {
+ struct ggml_tensor * b,
+ float eps) {
  bool is_node = false;

  if (a->grad) {
@@ -5835,6 +6108,8 @@ struct ggml_tensor * ggml_rms_norm_back(

  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

+ ggml_set_op_params(result, &eps, sizeof(eps));
+
  result->op = GGML_OP_RMS_NORM_BACK;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
@@ -5843,6 +6118,44 @@ struct ggml_tensor * ggml_rms_norm_back(
  return result;
  }

+ // ggml_group_norm
+
+ static struct ggml_tensor * ggml_group_norm_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups,
+ bool inplace) {
+
+ bool is_node = false;
+ if (!inplace && (a->grad)) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_GROUP_NORM;
+ result->op_params[0] = n_groups;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = NULL; // TODO: maybe store epsilon here?
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_group_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups) {
+ return ggml_group_norm_impl(ctx, a, n_groups, false);
+ }
+
+ struct ggml_tensor * ggml_group_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups) {
+ return ggml_group_norm_impl(ctx, a, n_groups, true);
+ }

  // ggml_mul_mat

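Note: ggml_norm() and ggml_rms_norm_back() now take an explicit eps that is stored via ggml_set_op_params(), and the new ggml_group_norm() keeps its group count in op_params[0]. A sketch, not part of the package diff, of how a compute kernel can read these back (dst is the op's output tensor; the helper names are illustrative):

#include <string.h>
#include <stdint.h>
#include "ggml.h"  // assumed public header

static float example_get_norm_eps(const struct ggml_tensor * dst) {
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));   // written by ggml_norm()/ggml_rms_norm_back()
    return eps;
}

static int32_t example_get_group_count(const struct ggml_tensor * dst) {
    return ((const int32_t *) dst->op_params)[0];  // written by ggml_group_norm()
}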
@@ -6126,7 +6439,7 @@ struct ggml_tensor * ggml_reshape(
  //GGML_ASSERT(false);
  }

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6150,7 +6463,7 @@ struct ggml_tensor * ggml_reshape_1d(
  }

  const int64_t ne[1] = { ne0 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6175,7 +6488,7 @@ struct ggml_tensor * ggml_reshape_2d(
  }

  const int64_t ne[2] = { ne0, ne1 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6201,7 +6514,7 @@ struct ggml_tensor * ggml_reshape_3d(
  }

  const int64_t ne[3] = { ne0, ne1, ne2 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6211,7 +6524,6 @@ struct ggml_tensor * ggml_reshape_3d(
  return result;
  }

-
  struct ggml_tensor * ggml_reshape_4d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -6229,7 +6541,7 @@ struct ggml_tensor * ggml_reshape_4d(
  }

  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6239,46 +6551,40 @@ struct ggml_tensor * ggml_reshape_4d(
  return result;
  }

- // ggml_view_1d
-
- static struct ggml_tensor * ggml_view_tensor_offset(
+ static struct ggml_tensor * ggml_view_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_dims,
  const int64_t * ne,
  size_t offset) {
- // don't calculate an offset from an unallocated tensor
- void * data = NULL;
- if (a->data != NULL) {
- data = (char *) a->data + offset;
- }

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }

+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
  ggml_format_name(result, "%s (view)", a->name);

  ggml_set_op_params(result, &offset, sizeof(offset));

+ result->op = GGML_OP_VIEW;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
  return result;
  }

+ // ggml_view_1d
+
  struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int64_t ne0,
  size_t offset) {

- bool is_node = false;
-
- if (a->grad) {
- is_node = true;
- }
-
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
-
- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);

  return result;
  }
@@ -6293,24 +6599,14 @@ struct ggml_tensor * ggml_view_2d(
  size_t nb1,
  size_t offset) {

- bool is_node = false;
+ const int64_t ne[2] = { ne0, ne1 };

- if (a->grad) {
- is_node = true;
- }
-
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
-
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = result->nb[1]*ne1;
  result->nb[3] = result->nb[2];

- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
-
  return result;
  }

@@ -6326,24 +6622,14 @@ struct ggml_tensor * ggml_view_3d(
  size_t nb2,
  size_t offset) {

- bool is_node = false;
-
- if (a->grad) {
- is_node = true;
- }
-
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+ const int64_t ne[3] = { ne0, ne1, ne2 };

- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = result->nb[2]*ne2;

- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
-
  return result;
  }

@@ -6361,24 +6647,14 @@ struct ggml_tensor * ggml_view_4d(
  size_t nb3,
  size_t offset) {

- bool is_node = false;
-
- if (a->grad) {
- is_node = true;
- }
-
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };

- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = nb3;

- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
-
  return result;
  }

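Note: the four ggml_view_*d() builders above now share ggml_view_impl(), which records the byte offset in op_params, tags the node as GGML_OP_VIEW and links it to its source; only the row strides are still set per overload. A usage sketch, not part of the package diff:

#include "ggml.h"  // assumed public header

// view of nrows rows starting at row r0 of a contiguous 2-D tensor t,
// keeping the original row stride t->nb[1]
static struct ggml_tensor * example_view_rows(struct ggml_context * ctx,
                                              struct ggml_tensor * t,
                                              int64_t r0, int64_t nrows) {
    return ggml_view_2d(ctx, t, t->ne[0], nrows, t->nb[1], r0*t->nb[1]);
}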
@@ -6565,7 +6841,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6565
6841
 
6566
6842
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6567
6843
 
6568
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6844
+ int32_t params[] = { n_past };
6569
6845
  ggml_set_op_params(result, params, sizeof(params));
6570
6846
 
6571
6847
  result->op = GGML_OP_DIAG_MASK_INF;
@@ -6582,7 +6858,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
6582
6858
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
6583
6859
  }
6584
6860
 
6585
-
6586
6861
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
6587
6862
  struct ggml_context * ctx,
6588
6863
  struct ggml_tensor * a,
@@ -6605,7 +6880,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6605
6880
 
6606
6881
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6607
6882
 
6608
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6883
+ int32_t params[] = { n_past };
6609
6884
  ggml_set_op_params(result, params, sizeof(params));
6610
6885
 
6611
6886
  result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -6711,6 +6986,8 @@ static struct ggml_tensor * ggml_rope_impl(
6711
6986
  int n_ctx,
6712
6987
  float freq_base,
6713
6988
  float freq_scale,
6989
+ float xpos_base,
6990
+ bool xpos_down,
6714
6991
  bool inplace) {
6715
6992
  GGML_ASSERT(n_past >= 0);
6716
6993
  bool is_node = false;
@@ -6721,9 +6998,11 @@ static struct ggml_tensor * ggml_rope_impl(
6721
6998
 
6722
6999
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6723
7000
 
6724
- int32_t params[6] = { n_past, n_dims, mode, n_ctx };
7001
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
6725
7002
  memcpy(params + 4, &freq_base, sizeof(float));
6726
7003
  memcpy(params + 5, &freq_scale, sizeof(float));
7004
+ memcpy(params + 6, &xpos_base, sizeof(float));
7005
+ memcpy(params + 7, &xpos_down, sizeof(bool));
6727
7006
  ggml_set_op_params(result, params, sizeof(params));
6728
7007
 
6729
7008
  result->op = GGML_OP_ROPE;
@@ -6740,7 +7019,7 @@ struct ggml_tensor * ggml_rope(
6740
7019
  int n_dims,
6741
7020
  int mode,
6742
7021
  int n_ctx) {
6743
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
7022
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
6744
7023
  }
6745
7024
 
6746
7025
  struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +7029,7 @@ struct ggml_tensor * ggml_rope_inplace(
6750
7029
  int n_dims,
6751
7030
  int mode,
6752
7031
  int n_ctx) {
6753
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
7032
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
6754
7033
  }
6755
7034
 
6756
7035
  struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +7041,7 @@ struct ggml_tensor * ggml_rope_custom(
6762
7041
  int n_ctx,
6763
7042
  float freq_base,
6764
7043
  float freq_scale) {
6765
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
7044
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
6766
7045
  }
6767
7046
 
6768
7047
  struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +7053,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
6774
7053
  int n_ctx,
6775
7054
  float freq_base,
6776
7055
  float freq_scale) {
6777
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
7056
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
7057
+ }
7058
+
7059
+ struct ggml_tensor * ggml_rope_xpos_inplace(
7060
+ struct ggml_context * ctx,
7061
+ struct ggml_tensor * a,
7062
+ int n_past,
7063
+ int n_dims,
7064
+ float base,
7065
+ bool down) {
7066
+ return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
6778
7067
  }
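
A minimal usage sketch for the new ggml_rope_xpos_inplace(), which is a fixed-parameter wrapper over ggml_rope_impl (mode 0, n_ctx 0, freq_base 10000, freq_scale 1) with the xPos zeta scaling enabled. The helper name, tensor shape and base value are illustrative assumptions, not taken from this diff:

    // apply xPos-scaled RoPE to a query tensor laid out [head_dim, n_head, n_tokens]
    static struct ggml_tensor * rope_xpos_example(struct ggml_context * ctx) {
        struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 8, 32);
        // rotate all 64 dims starting at position 0; xpos_base = 512, xpos_down = false
        // (the matching key tensor would typically pass down = true, so the query and
        //  key zeta factors cancel except for the relative-distance term)
        return ggml_rope_xpos_inplace(ctx, q, /*n_past=*/ 0, /*n_dims=*/ 64,
                                      /*base=*/ 512.0f, /*down=*/ false);
    }
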
6779
7068
 
6780
7069
  // ggml_rope_back
@@ -6785,7 +7074,11 @@ struct ggml_tensor * ggml_rope_back(
6785
7074
  int n_past,
6786
7075
  int n_dims,
6787
7076
  int mode,
6788
- int n_ctx) {
7077
+ int n_ctx,
7078
+ float freq_base,
7079
+ float freq_scale,
7080
+ float xpos_base,
7081
+ bool xpos_down) {
6789
7082
  GGML_ASSERT(n_past >= 0);
6790
7083
  GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
6791
7084
 
@@ -6797,7 +7090,11 @@ struct ggml_tensor * ggml_rope_back(
6797
7090
 
6798
7091
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
6799
7092
 
6800
- int32_t params[] = { n_past, n_dims, mode, n_ctx };
7093
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
7094
+ memcpy(params + 4, &freq_base, sizeof(float));
7095
+ memcpy(params + 5, &freq_scale, sizeof(float));
7096
+ memcpy(params + 6, &xpos_base, sizeof(float));
7097
+ memcpy(params + 7, &xpos_down, sizeof(bool));
6801
7098
  ggml_set_op_params(result, params, sizeof(params));
6802
7099
 
6803
7100
  result->op = GGML_OP_ROPE_BACK;
@@ -6904,6 +7201,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
6904
7201
  return result;
6905
7202
  }
6906
7203
 
7204
+ // ggml_conv_1d_ph
7205
+
7206
+ struct ggml_tensor* ggml_conv_1d_ph(
7207
+ struct ggml_context * ctx,
7208
+ struct ggml_tensor * a,
7209
+ struct ggml_tensor * b,
7210
+ int s,
7211
+ int d) {
7212
+ return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
7213
+ }
7214
+
6907
7215
  // ggml_conv_2d
6908
7216
 
6909
7217
  struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7252,61 @@ struct ggml_tensor * ggml_conv_2d(
6944
7252
 
6945
7253
  }
6946
7254
 
6947
- // ggml_conv_1d_ph
7255
+ // ggml_conv_2d_sk_p0
6948
7256
 
6949
- struct ggml_tensor * ggml_conv_1d_ph(
7257
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
6950
7258
  struct ggml_context * ctx,
6951
7259
  struct ggml_tensor * a,
6952
- struct ggml_tensor * b,
6953
- int s,
6954
- int d) {
6955
- return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
7260
+ struct ggml_tensor * b) {
7261
+ return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
7262
+ }
7263
+
7264
+ // ggml_conv_2d_s1_ph
7265
+
7266
+ struct ggml_tensor * ggml_conv_2d_s1_ph(
7267
+ struct ggml_context * ctx,
7268
+ struct ggml_tensor * a,
7269
+ struct ggml_tensor * b) {
7270
+ return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
7271
+ }
7272
+
7273
+ // ggml_conv_transpose_2d_p0
7274
+
7275
+ static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
7276
+ return (ins - 1) * s - 2 * p + ks;
6956
7277
  }
6957
7278
 
7279
+ struct ggml_tensor * ggml_conv_transpose_2d_p0(
7280
+ struct ggml_context * ctx,
7281
+ struct ggml_tensor * a,
7282
+ struct ggml_tensor * b,
7283
+ int stride) {
7284
+ GGML_ASSERT(a->ne[3] == b->ne[2]);
7285
+
7286
+ bool is_node = false;
7287
+
7288
+ if (a->grad || b->grad) {
7289
+ GGML_ASSERT(false); // TODO: implement backward
7290
+ is_node = true;
7291
+ }
7292
+
7293
+ const int64_t ne[4] = {
7294
+ ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
7295
+ ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
7296
+ a->ne[2], b->ne[3],
7297
+ };
7298
+
7299
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7300
+
7301
+ ggml_set_op_params_i32(result, 0, stride);
7302
+
7303
+ result->op = GGML_OP_CONV_TRANSPOSE_2D;
7304
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7305
+ result->src[0] = a;
7306
+ result->src[1] = b;
7307
+
7308
+ return result;
7309
+ }
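
The new ggml_conv_transpose_2d_p0() sizes its output with the standard zero-padding transposed-convolution formula, (ins - 1)*s - 2*p + ks with p = 0. A worked shape check with illustrative sizes (not from the diff; assumes #include "ggml.h"):

    // kernel a: Kw x Kh x Cout x Cin = 4 x 4 x 8 x 3 (F16, as the forward pass requires)
    // input  b: W  x H  x Cin        = 16 x 16 x 3   (F32)
    static struct ggml_tensor * conv_transpose_example(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 4, 4, 8, 3);
        struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 16, 3);
        // ggml_calc_conv_transpose_output_size(16, 4, 2, 0) = (16 - 1)*2 + 4 = 34,
        // so the result is a 34 x 34 x 8 F32 tensor with the stride kept in op_params[0]
        return ggml_conv_transpose_2d_p0(ctx, a, b, /*stride=*/ 2);
    }
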
6958
7310
 
6959
7311
  // ggml_pool_*
6960
7312
 
@@ -7032,6 +7384,40 @@ struct ggml_tensor * ggml_pool_2d(
7032
7384
  return result;
7033
7385
  }
7034
7386
 
7387
+ // ggml_upscale
7388
+
7389
+ static struct ggml_tensor * ggml_upscale_impl(
7390
+ struct ggml_context * ctx,
7391
+ struct ggml_tensor * a,
7392
+ int scale_factor) {
7393
+ bool is_node = false;
7394
+
7395
+ if (a->grad) {
7396
+ GGML_ASSERT(false); // TODO: implement backward
7397
+ is_node = true;
7398
+ }
7399
+
7400
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
7401
+ a->ne[0] * scale_factor,
7402
+ a->ne[1] * scale_factor,
7403
+ a->ne[2], a->ne[3]);
7404
+
7405
+ result->op = GGML_OP_UPSCALE;
7406
+ result->op_params[0] = scale_factor;
7407
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7408
+ result->src[0] = a;
7409
+ result->src[1] = NULL;
7410
+
7411
+ return result;
7412
+ }
7413
+
7414
+ struct ggml_tensor * ggml_upscale(
7415
+ struct ggml_context * ctx,
7416
+ struct ggml_tensor * a,
7417
+ int scale_factor) {
7418
+ return ggml_upscale_impl(ctx, a, scale_factor);
7419
+ }
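
ggml_upscale() performs nearest-neighbour upscaling of the first two dimensions by an integer factor (see ggml_compute_forward_upscale_f32 later in this diff). A usage sketch with illustrative sizes:

    static struct ggml_tensor * upscale_example(struct ggml_context * ctx) {
        // 32 x 32 feature map with 4 channels
        struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 32, 32, 4);
        // result is 64 x 64 x 4: every source element is repeated in a 2x2 block
        return ggml_upscale(ctx, x, /*scale_factor=*/ 2);
    }
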
7420
+
7035
7421
  // ggml_flash_attn
7036
7422
 
7037
7423
  struct ggml_tensor * ggml_flash_attn(
@@ -7230,6 +7616,87 @@ struct ggml_tensor * ggml_win_unpart(
7230
7616
  return result;
7231
7617
  }
7232
7618
 
7619
+ // ggml_get_rel_pos
7620
+
7621
+ struct ggml_tensor * ggml_get_rel_pos(
7622
+ struct ggml_context * ctx,
7623
+ struct ggml_tensor * a,
7624
+ int qh,
7625
+ int kh) {
7626
+ GGML_ASSERT(qh == kh);
7627
+ GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
7628
+
7629
+ bool is_node = false;
7630
+
7631
+ if (a->grad) {
7632
+ GGML_ASSERT(false); // TODO: implement backward
7633
+ is_node = true;
7634
+ }
7635
+
7636
+ const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
7637
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
7638
+
7639
+ result->op = GGML_OP_GET_REL_POS;
7640
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7641
+ result->src[0] = a;
7642
+ result->src[1] = NULL;
7643
+
7644
+ return result;
7645
+ }
7646
+
7647
+ // ggml_add_rel_pos
7648
+
7649
+ static struct ggml_tensor * ggml_add_rel_pos_impl(
7650
+ struct ggml_context * ctx,
7651
+ struct ggml_tensor * a,
7652
+ struct ggml_tensor * pw,
7653
+ struct ggml_tensor * ph,
7654
+ bool inplace) {
7655
+ GGML_ASSERT(ggml_are_same_shape(pw, ph));
7656
+ GGML_ASSERT(ggml_is_contiguous(a));
7657
+ GGML_ASSERT(ggml_is_contiguous(pw));
7658
+ GGML_ASSERT(ggml_is_contiguous(ph));
7659
+ GGML_ASSERT(ph->type == GGML_TYPE_F32);
7660
+ GGML_ASSERT(pw->type == GGML_TYPE_F32);
7661
+ GGML_ASSERT(pw->ne[3] == a->ne[2]);
7662
+ GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
7663
+ GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
7664
+
7665
+ bool is_node = false;
7666
+
7667
+ if (!inplace && (a->grad || pw->grad || ph->grad)) {
7668
+ is_node = true;
7669
+ }
7670
+
7671
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7672
+ ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
7673
+
7674
+ result->op = GGML_OP_ADD_REL_POS;
7675
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7676
+ result->src[0] = a;
7677
+ result->src[1] = pw;
7678
+ result->src[2] = ph;
7679
+
7680
+ return result;
7681
+ }
7682
+
7683
+
7684
+ struct ggml_tensor * ggml_add_rel_pos(
7685
+ struct ggml_context * ctx,
7686
+ struct ggml_tensor * a,
7687
+ struct ggml_tensor * pw,
7688
+ struct ggml_tensor * ph) {
7689
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
7690
+ }
7691
+
7692
+ struct ggml_tensor * ggml_add_rel_pos_inplace(
7693
+ struct ggml_context * ctx,
7694
+ struct ggml_tensor * a,
7695
+ struct ggml_tensor * pw,
7696
+ struct ggml_tensor * ph) {
7697
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
7698
+ }
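
The two relative-position ops follow the decomposed relative attention of SAM's image encoder (see the reference linked in the forward passes below): ggml_get_rel_pos() gathers a kh x qh block from a learned table of 2*qh - 1 relative offsets, and ggml_add_rel_pos() then accumulates the projected width/height scores (pw, ph) into the attention tensor, in-place when requested. A shape-level sketch of the gather, with sizes chosen to satisfy its asserts (illustrative only):

    static struct ggml_tensor * rel_pos_example(struct ggml_context * ctx) {
        const int qh = 7, kh = 7;   // query/key window height; the op requires qh == kh
        // learned table of 2*qh - 1 = 13 relative positions, 64 channels each, F16 as required
        struct ggml_tensor * table = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 64, 2*qh - 1);
        return ggml_get_rel_pos(ctx, table, qh, kh);   // -> [64, 7, 7]
    }
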
7699
+
7233
7700
  // gmml_unary
  // ggml_unary
7234
7701
 
7235
7702
  static struct ggml_tensor * ggml_unary_impl(
@@ -7745,7 +8212,7 @@ static void ggml_compute_forward_dup_same_cont(
7745
8212
  memcpy(
7746
8213
  ((char *) dst->data + ie0*nb0),
7747
8214
  ((char *) src0->data + ie0*nb00),
7748
- (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
8215
+ (ie1 - ie0) * ggml_type_size(src0->type));
7749
8216
  }
7750
8217
 
7751
8218
  }
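
Throughout the compute kernels the GGML_TYPE_SIZE/GGML_BLCK_SIZE lookup tables are replaced by the ggml_type_size()/ggml_blck_size() accessors; the row-size arithmetic itself is unchanged. A small worked example of that expression (assumes #include "ggml.h"; the Q4_0 figures assume its standard 18-byte block of 32 elements):

    // bytes needed for one row of ne elements of a (possibly quantized) type
    static size_t row_size_example(enum ggml_type type, int64_t ne) {
        return ne*ggml_type_size(type)/ggml_blck_size(type);
    }
    // row_size_example(GGML_TYPE_F32,  4096) == 4096*4/1   == 16384 bytes
    // row_size_example(GGML_TYPE_Q4_0, 4096) == 4096*18/32 ==  2304 bytes
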
@@ -7779,7 +8246,7 @@ static void ggml_compute_forward_dup_f16(
7779
8246
 
7780
8247
  if (src0->type == dst->type &&
7781
8248
  ne00 == ne0 &&
7782
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
8249
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
7783
8250
  // copy by rows
7784
8251
  const size_t rs = ne00*nb00;
7785
8252
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8304,7 @@ static void ggml_compute_forward_dup_f16(
7837
8304
  float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
7838
8305
 
7839
8306
  size_t id = 0;
7840
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
8307
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
7841
8308
  char * dst_ptr = (char *) dst->data;
7842
8309
 
7843
8310
  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8517,7 @@ static void ggml_compute_forward_dup_f32(
8050
8517
 
8051
8518
  if (src0->type == dst->type &&
8052
8519
  ne00 == ne0 &&
8053
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
8520
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
8054
8521
  // copy by rows
8055
8522
  const size_t rs = ne00*nb00;
8056
8523
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8556,7 @@ static void ggml_compute_forward_dup_f32(
8089
8556
  ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
8090
8557
 
8091
8558
  size_t id = 0;
8092
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
8559
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
8093
8560
  char * dst_ptr = (char *) dst->data;
8094
8561
 
8095
8562
  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8968,7 @@ static void ggml_compute_forward_add_q_f32(
8501
8968
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8502
8969
 
8503
8970
  // we don't support permuted src0 or src1
8504
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
8971
+ GGML_ASSERT(nb00 == ggml_type_size(type));
8505
8972
  GGML_ASSERT(nb10 == sizeof(float));
8506
8973
 
8507
8974
  // dst cannot be transposed or permuted
@@ -8775,7 +9242,7 @@ static void ggml_compute_forward_add1_q_f32(
8775
9242
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8776
9243
 
8777
9244
  // we don't support permuted src0
8778
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
9245
+ GGML_ASSERT(nb00 == ggml_type_size(type));
8779
9246
 
8780
9247
  // dst cannot be transposed or permuted
8781
9248
  GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9604,8 @@ static void ggml_compute_forward_mul(
9137
9604
  const struct ggml_tensor * src0,
9138
9605
  const struct ggml_tensor * src1,
9139
9606
  struct ggml_tensor * dst) {
9607
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
9608
+
9140
9609
  switch (src0->type) {
9141
9610
  case GGML_TYPE_F32:
9142
9611
  {
@@ -9179,6 +9648,8 @@ static void ggml_compute_forward_div_f32(
9179
9648
 
9180
9649
 
9181
9650
  #ifdef GGML_USE_ACCELERATE
9651
+ UNUSED(ggml_vec_div_f32);
9652
+
9182
9653
  vDSP_vdiv(
9183
9654
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
9184
9655
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -9731,6 +10202,72 @@ static void ggml_compute_forward_repeat_back(
9731
10202
  }
9732
10203
  }
9733
10204
 
10205
+ // ggml_compute_forward_concat
10206
+
10207
+ static void ggml_compute_forward_concat_f32(
10208
+ const struct ggml_compute_params * params,
10209
+ const struct ggml_tensor * src0,
10210
+ const struct ggml_tensor * src1,
10211
+ struct ggml_tensor * dst) {
10212
+
10213
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10214
+ return;
10215
+ }
10216
+
10217
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
10218
+
10219
+ const int ith = params->ith;
10220
+
10221
+ GGML_TENSOR_BINARY_OP_LOCALS;
10222
+
10223
+ // TODO: support for transposed / permuted tensors
10224
+ GGML_ASSERT(nb0 == sizeof(float));
10225
+ GGML_ASSERT(nb00 == sizeof(float));
10226
+ GGML_ASSERT(nb10 == sizeof(float));
10227
+
10228
+ for (int i3 = 0; i3 < ne3; i3++) {
10229
+ for (int i2 = ith; i2 < ne2; i2++) {
10230
+ if (i2 < ne02) { // src0
10231
+ for (int i1 = 0; i1 < ne1; i1++) {
10232
+ for (int i0 = 0; i0 < ne0; i0++) {
10233
+ const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
10234
+
10235
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
10236
+ *y = *x;
10237
+ }
10238
+ }
10239
+ } // src1
10240
+ else {
10241
+ for (int i1 = 0; i1 < ne1; i1++) {
10242
+ for (int i0 = 0; i0 < ne0; i0++) {
10243
+ const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
10244
+
10245
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
10246
+ *y = *x;
10247
+ }
10248
+ }
10249
+ }
10250
+ }
10251
+ }
10252
+ }
10253
+
10254
+ static void ggml_compute_forward_concat(
10255
+ const struct ggml_compute_params* params,
10256
+ const struct ggml_tensor* src0,
10257
+ const struct ggml_tensor* src1,
10258
+ struct ggml_tensor* dst) {
10259
+ switch (src0->type) {
10260
+ case GGML_TYPE_F32:
10261
+ {
10262
+ ggml_compute_forward_concat_f32(params, src0, src1, dst);
10263
+ } break;
10264
+ default:
10265
+ {
10266
+ GGML_ASSERT(false);
10267
+ } break;
10268
+ }
10269
+ }
10270
+
9734
10271
  // ggml_compute_forward_abs
9735
10272
 
9736
10273
  static void ggml_compute_forward_abs_f32(
@@ -10285,7 +10822,8 @@ static void ggml_compute_forward_norm_f32(
10285
10822
 
10286
10823
  GGML_TENSOR_UNARY_OP_LOCALS;
10287
10824
 
10288
- const float eps = 1e-5f; // TODO: make this a parameter
10825
+ float eps;
10826
+ memcpy(&eps, dst->op_params, sizeof(float));
10289
10827
 
10290
10828
  // TODO: optimize
10291
10829
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10872,8 @@ static void ggml_compute_forward_norm(
10334
10872
  }
10335
10873
  }
10336
10874
 
10875
+ // ggml_compute_forward_group_rms_norm
10876
+
10337
10877
  static void ggml_compute_forward_rms_norm_f32(
10338
10878
  const struct ggml_compute_params * params,
10339
10879
  const struct ggml_tensor * src0,
@@ -10398,7 +10938,6 @@ static void ggml_compute_forward_rms_norm(
10398
10938
  }
10399
10939
  }
10400
10940
 
10401
-
10402
10941
  static void ggml_compute_forward_rms_norm_back_f32(
10403
10942
  const struct ggml_compute_params * params,
10404
10943
  const struct ggml_tensor * src0,
@@ -10417,7 +10956,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
10417
10956
 
10418
10957
  GGML_TENSOR_BINARY_OP_LOCALS;
10419
10958
 
10420
- const float eps = 1e-6f; // TODO: make this a parameter
10959
+ float eps;
10960
+ memcpy(&eps, dst->op_params, sizeof(float));
10421
10961
 
10422
10962
  // TODO: optimize
10423
10963
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10572,54 +11112,144 @@ static void ggml_compute_forward_rms_norm_back(
10572
11112
  }
10573
11113
  }
10574
11114
 
10575
- // ggml_compute_forward_mul_mat
10576
-
10577
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10578
- // helper function to determine if it is better to use BLAS or not
10579
- // for large matrices, BLAS is faster
10580
- static bool ggml_compute_forward_mul_mat_use_blas(
10581
- const struct ggml_tensor * src0,
10582
- const struct ggml_tensor * src1,
10583
- struct ggml_tensor * dst) {
10584
- //const int64_t ne00 = src0->ne[0];
10585
- //const int64_t ne01 = src0->ne[1];
10586
-
10587
- const int64_t ne10 = src1->ne[0];
10588
-
10589
- const int64_t ne0 = dst->ne[0];
10590
- const int64_t ne1 = dst->ne[1];
11115
+ // ggml_compute_forward_group_norm
10591
11116
 
10592
- // TODO: find the optimal values for these
10593
- if (ggml_is_contiguous(src0) &&
10594
- ggml_is_contiguous(src1) &&
10595
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
11117
+ static void ggml_compute_forward_group_norm_f32(
11118
+ const struct ggml_compute_params * params,
11119
+ const struct ggml_tensor * src0,
11120
+ struct ggml_tensor * dst) {
11121
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
10596
11122
 
10597
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
10598
- return true;
11123
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11124
+ return;
10599
11125
  }
10600
11126
 
10601
- return false;
10602
- }
10603
- #endif
10604
-
10605
- static void ggml_compute_forward_mul_mat(
10606
- const struct ggml_compute_params * params,
10607
- const struct ggml_tensor * src0,
10608
- const struct ggml_tensor * src1,
10609
- struct ggml_tensor * dst) {
10610
- int64_t t0 = ggml_perf_time_us();
10611
- UNUSED(t0);
10612
-
10613
- GGML_TENSOR_BINARY_OP_LOCALS;
11127
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
10614
11128
 
10615
11129
  const int ith = params->ith;
10616
11130
  const int nth = params->nth;
10617
11131
 
10618
- const enum ggml_type type = src0->type;
11132
+ GGML_TENSOR_UNARY_OP_LOCALS;
10619
11133
 
10620
- const bool src1_cont = ggml_is_contiguous(src1);
11134
+ const float eps = 1e-6f; // TODO: make this a parameter
10621
11135
 
10622
- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
11136
+ // TODO: optimize
11137
+
11138
+ int n_channels = src0->ne[2];
11139
+ int n_groups = dst->op_params[0];
11140
+ int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
11141
+ for (int i = ith; i < n_groups; i+=nth) {
11142
+ int start = i * n_channels_per_group;
11143
+ int end = start + n_channels_per_group;
11144
+ if (end > n_channels) {
11145
+ end = n_channels;
11146
+ }
11147
+ int step = end - start;
11148
+
11149
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
11150
+ ggml_float sum = 0.0;
11151
+ for (int64_t i02 = start; i02 < end; i02++) {
11152
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11153
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
11154
+
11155
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11156
+ sum += (ggml_float)x[i00];
11157
+ }
11158
+ }
11159
+ }
11160
+ float mean = sum / (ne00 * ne01 * step);
11161
+ ggml_float sum2 = 0.0;
11162
+
11163
+ for (int64_t i02 = start; i02 < end; i02++) {
11164
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11165
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
11166
+
11167
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
11168
+
11169
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11170
+ float v = x[i00] - mean;
11171
+ y[i00] = v;
11172
+ sum2 += (ggml_float)(v * v);
11173
+ }
11174
+ }
11175
+ }
11176
+ float variance = sum2 / (ne00 * ne01 * step);
11177
+ const float scale = 1.0f / sqrtf(variance + eps);
11178
+
11179
+ for (int64_t i02 = start; i02 < end; i02++) {
11180
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11181
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
11182
+ ggml_vec_scale_f32(ne00, y, scale);
11183
+ }
11184
+ }
11185
+ }
11186
+ }
11187
+ }
11188
+
11189
+ static void ggml_compute_forward_group_norm(
11190
+ const struct ggml_compute_params * params,
11191
+ const struct ggml_tensor * src0,
11192
+ struct ggml_tensor * dst) {
11193
+ switch (src0->type) {
11194
+ case GGML_TYPE_F32:
11195
+ {
11196
+ ggml_compute_forward_group_norm_f32(params, src0, dst);
11197
+ } break;
11198
+ default:
11199
+ {
11200
+ GGML_ASSERT(false);
11201
+ } break;
11202
+ }
11203
+ }
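
ggml_compute_forward_group_norm_f32() normalizes each group of channels with one mean and one variance taken over all ne00*ne01 elements of every channel in the group (eps is still hard-coded to 1e-6 here). The same math on a plain array, as a standalone reference sketch (not ggml code):

    #include <math.h>

    // x, y: [n_channels][n] row-major; channels are split into n_groups contiguous groups
    static void group_norm_ref(const float * x, float * y,
                               int n_channels, int n, int n_groups, float eps) {
        const int per_group = (n_channels + n_groups - 1)/n_groups;
        for (int g = 0; g < n_groups; g++) {
            const int start = g*per_group;
            const int end   = start + per_group < n_channels ? start + per_group : n_channels;
            const int cnt   = (end - start)*n;

            double sum = 0.0;
            for (int c = start; c < end; c++)
                for (int i = 0; i < n; i++) sum += x[c*n + i];
            const double mean = sum/cnt;

            double var = 0.0;
            for (int c = start; c < end; c++)
                for (int i = 0; i < n; i++) { const double v = x[c*n + i] - mean; var += v*v; }
            var /= cnt;

            const double scale = 1.0/sqrt(var + eps);
            for (int c = start; c < end; c++)
                for (int i = 0; i < n; i++) y[c*n + i] = (float)((x[c*n + i] - mean)*scale);
        }
    }
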
11204
+
11205
+ // ggml_compute_forward_mul_mat
11206
+
11207
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
11208
+ // helper function to determine if it is better to use BLAS or not
11209
+ // for large matrices, BLAS is faster
11210
+ static bool ggml_compute_forward_mul_mat_use_blas(
11211
+ const struct ggml_tensor * src0,
11212
+ const struct ggml_tensor * src1,
11213
+ struct ggml_tensor * dst) {
11214
+ //const int64_t ne00 = src0->ne[0];
11215
+ //const int64_t ne01 = src0->ne[1];
11216
+
11217
+ const int64_t ne10 = src1->ne[0];
11218
+
11219
+ const int64_t ne0 = dst->ne[0];
11220
+ const int64_t ne1 = dst->ne[1];
11221
+
11222
+ // TODO: find the optimal values for these
11223
+ if (ggml_is_contiguous(src0) &&
11224
+ ggml_is_contiguous(src1) &&
11225
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
11226
+
11227
+ /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
11228
+ return true;
11229
+ }
11230
+
11231
+ return false;
11232
+ }
11233
+ #endif
11234
+
11235
+ static void ggml_compute_forward_mul_mat(
11236
+ const struct ggml_compute_params * params,
11237
+ const struct ggml_tensor * src0,
11238
+ const struct ggml_tensor * src1,
11239
+ struct ggml_tensor * dst) {
11240
+ int64_t t0 = ggml_perf_time_us();
11241
+ UNUSED(t0);
11242
+
11243
+ GGML_TENSOR_BINARY_OP_LOCALS;
11244
+
11245
+ const int ith = params->ith;
11246
+ const int nth = params->nth;
11247
+
11248
+ const enum ggml_type type = src0->type;
11249
+
11250
+ const bool src1_cont = ggml_is_contiguous(src1);
11251
+
11252
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
10623
11253
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
10624
11254
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
10625
11255
 
@@ -10629,7 +11259,7 @@ static void ggml_compute_forward_mul_mat(
10629
11259
  GGML_ASSERT(ne3 == ne13);
10630
11260
 
10631
11261
  // we don't support permuted src0 or src1
10632
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
11262
+ GGML_ASSERT(nb00 == ggml_type_size(type));
10633
11263
  GGML_ASSERT(nb10 == sizeof(float));
10634
11264
 
10635
11265
  // dst cannot be transposed or permuted
@@ -10638,6 +11268,10 @@ static void ggml_compute_forward_mul_mat(
10638
11268
  GGML_ASSERT(nb1 <= nb2);
10639
11269
  GGML_ASSERT(nb2 <= nb3);
10640
11270
 
11271
+ // broadcast factors
11272
+ const int64_t r2 = ne12/ne02;
11273
+ const int64_t r3 = ne13/ne03;
11274
+
10641
11275
  // nb01 >= nb00 - src0 is not transposed
10642
11276
  // compute by src0 rows
10643
11277
 
@@ -10657,11 +11291,6 @@ static void ggml_compute_forward_mul_mat(
10657
11291
 
10658
11292
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10659
11293
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
10660
- // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
10661
- // ref: https://github.com/ggerganov/ggml/pull/224
10662
- GGML_ASSERT(ne02 == ne12);
10663
- GGML_ASSERT(ne03 == ne13);
10664
-
10665
11294
  if (params->ith != 0) {
10666
11295
  return;
10667
11296
  }
@@ -10674,12 +11303,16 @@ static void ggml_compute_forward_mul_mat(
10674
11303
  return;
10675
11304
  }
10676
11305
 
10677
- for (int64_t i03 = 0; i03 < ne03; i03++) {
10678
- for (int64_t i02 = 0; i02 < ne02; i02++) {
10679
- const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
10680
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
11306
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
11307
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
11308
+ // broadcast src0 into src1 across 2nd,3rd dimension
11309
+ const int64_t i03 = i13/r3;
11310
+ const int64_t i02 = i12/r2;
10681
11311
 
10682
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
11312
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
11313
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
11314
+
11315
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
10683
11316
 
10684
11317
  if (type != GGML_TYPE_F32) {
10685
11318
  float * const wdata = params->wdata;
@@ -10687,7 +11320,7 @@ static void ggml_compute_forward_mul_mat(
10687
11320
 
10688
11321
  size_t id = 0;
10689
11322
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
10690
- to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
11323
+ to_float((const char *) x + i01*nb01, wdata + id, ne00);
10691
11324
  id += ne00;
10692
11325
  }
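
With the broadcast factors r2 = ne12/ne02 and r3 = ne13/ne03 hoisted to the top of the function, the BLAS path now accepts src0 being repeated across the 2nd/3rd dimensions, matching the non-BLAS path below. The index mapping, isolated with illustrative sizes (not from the diff):

    #include <stdint.h>

    // e.g. src0: ne02 = 1, ne03 = 1 (one shared weight matrix)
    //      src1: ne12 = 8, ne13 = 2 (8 x 2 batched activations)
    // r2 = 8, r3 = 2, so every (i12, i13) collapses onto src0 slice (0, 0)
    static void broadcast_index(int64_t i12, int64_t i13,
                                int64_t ne02, int64_t ne12,
                                int64_t ne03, int64_t ne13,
                                int64_t * i02, int64_t * i03) {
        const int64_t r2 = ne12/ne02;
        const int64_t r3 = ne13/ne03;
        *i02 = i12/r2;
        *i03 = i13/r3;
    }
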
10693
11326
 
@@ -10712,7 +11345,7 @@ static void ggml_compute_forward_mul_mat(
10712
11345
  if (params->type == GGML_TASK_INIT) {
10713
11346
  if (src1->type != vec_dot_type) {
10714
11347
  char * wdata = params->wdata;
10715
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11348
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
10716
11349
 
10717
11350
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
10718
11351
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +11365,7 @@ static void ggml_compute_forward_mul_mat(
10732
11365
  }
10733
11366
 
10734
11367
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10735
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11368
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
10736
11369
 
10737
11370
  const int64_t nr0 = ne01; // src0 rows
10738
11371
  const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -10767,10 +11400,6 @@ static void ggml_compute_forward_mul_mat(
10767
11400
  assert(ne12 % ne02 == 0);
10768
11401
  assert(ne13 % ne03 == 0);
10769
11402
 
10770
- // broadcast factors
10771
- const int64_t r2 = ne12/ne02;
10772
- const int64_t r3 = ne13/ne03;
10773
-
10774
11403
  // block-tiling attempt
10775
11404
  const int64_t blck_0 = 16;
10776
11405
  const int64_t blck_1 = 16;
@@ -11205,7 +11834,7 @@ static void ggml_compute_forward_get_rows_q(
11205
11834
 
11206
11835
  assert( dst->ne[0] == nc);
11207
11836
  assert( dst->ne[1] == nr);
11208
- assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
11837
+ assert(src0->nb[0] == ggml_type_size(type));
11209
11838
 
11210
11839
  for (int i = 0; i < nr; ++i) {
11211
11840
  const int r = ((int32_t *) src1->data)[i];
@@ -11506,8 +12135,8 @@ static void ggml_compute_forward_diag_mask_f32(
11506
12135
  const int ith = params->ith;
11507
12136
  const int nth = params->nth;
11508
12137
 
11509
- const int n_past = ((int32_t *) dst->op_params)[0];
11510
- const bool inplace = (bool)((int32_t *) dst->op_params)[1];
12138
+ const int n_past = ((int32_t *) dst->op_params)[0];
12139
+ const bool inplace = src0->data == dst->data;
11511
12140
 
11512
12141
  GGML_ASSERT(n_past >= 0);
11513
12142
 
@@ -11718,6 +12347,7 @@ static void ggml_compute_forward_soft_max_back_f32(
11718
12347
  // dx = J * dy
11719
12348
  // dxk = sum_i(Jki * dyi)
11720
12349
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
12350
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
11721
12351
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
11722
12352
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
11723
12353
  // dxk = -yk * dot(y, dy) + yk*dyk
@@ -11926,7 +12556,6 @@ static void ggml_compute_forward_alibi(
11926
12556
  }
11927
12557
  }
11928
12558
 
11929
-
11930
12559
  // ggml_compute_forward_clamp
11931
12560
 
11932
12561
  static void ggml_compute_forward_clamp_f32(
@@ -12015,12 +12644,18 @@ static void ggml_compute_forward_rope_f32(
12015
12644
  float freq_base;
12016
12645
  float freq_scale;
12017
12646
 
12647
+ // these two only relevant for xPos RoPE:
12648
+ float xpos_base;
12649
+ bool xpos_down;
12650
+
12018
12651
  const int n_past = ((int32_t *) dst->op_params)[0];
12019
12652
  const int n_dims = ((int32_t *) dst->op_params)[1];
12020
12653
  const int mode = ((int32_t *) dst->op_params)[2];
12021
12654
  const int n_ctx = ((int32_t *) dst->op_params)[3];
12022
12655
  memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
12023
12656
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12657
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
12658
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
12024
12659
 
12025
12660
  assert(n_past >= 0);
12026
12661
 
@@ -12092,6 +12727,9 @@ static void ggml_compute_forward_rope_f32(
12092
12727
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12093
12728
  const float cos_theta = cosf(theta);
12094
12729
  const float sin_theta = sinf(theta);
12730
+ // zeta scaling for xPos only:
12731
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
12732
+ if (xpos_down) zeta = 1.0f / zeta;
12095
12733
 
12096
12734
  theta *= theta_scale;
12097
12735
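
The zeta factor added here is the xPos length-extrapolation scale; restated as a standalone helper with the same expression as the inline code (the wrapper name is illustrative):

    #include <math.h>
    #include <stdbool.h>
    #include <stdint.h>

    // per-pair scale for even dimension index i0, head size ne0, absolute position p = n_past + i2
    static float xpos_zeta(int64_t i0, int64_t ne0, int64_t p, float xpos_base, bool xpos_down) {
        if (xpos_base == 0.0f) return 1.0f;                          // xPos disabled
        float zeta = powf((i0 + 0.4f*ne0)/(1.4f*ne0), p/xpos_base);
        return xpos_down ? 1.0f/zeta : zeta;                         // down = true gets the reciprocal scale
    }
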
 
@@ -12101,11 +12739,11 @@ static void ggml_compute_forward_rope_f32(
12101
12739
  const float x0 = src[0];
12102
12740
  const float x1 = src[1];
12103
12741
 
12104
- dst_data[0] = x0*cos_theta - x1*sin_theta;
12105
- dst_data[1] = x0*sin_theta + x1*cos_theta;
12742
+ dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
12743
+ dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
12106
12744
  }
12107
12745
  } else {
12108
- // TODO: this is probably wrong, but I can't figure it out ..
12746
+ // TODO: this might be wrong for ne0 != n_dims - need double check
12109
12747
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
12110
12748
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
12111
12749
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12234,7 +12872,7 @@ static void ggml_compute_forward_rope_f16(
12234
12872
  dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12235
12873
  }
12236
12874
  } else {
12237
- // TODO: this is probably wrong, but I can't figure it out ..
12875
+ // TODO: this might be wrong for ne0 != n_dims - need double check
12238
12876
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
12239
12877
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
12240
12878
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12296,9 +12934,21 @@ static void ggml_compute_forward_rope_back_f32(
12296
12934
  // dx = rope_back(dy, src1)
12297
12935
  // src0 is dy, src1 contains options
12298
12936
 
12937
+ float freq_base;
12938
+ float freq_scale;
12939
+
12940
+ // these two only relevant for xPos RoPE:
12941
+ float xpos_base;
12942
+ bool xpos_down;
12943
+
12299
12944
  const int n_past = ((int32_t *) dst->op_params)[0];
12300
12945
  const int n_dims = ((int32_t *) dst->op_params)[1];
12301
12946
  const int mode = ((int32_t *) dst->op_params)[2];
12947
+ const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
12948
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
12949
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12950
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
12951
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
12302
12952
 
12303
12953
  assert(n_past >= 0);
12304
12954
 
@@ -12324,7 +12974,7 @@ static void ggml_compute_forward_rope_back_f32(
12324
12974
  // row index used to determine which thread to use
12325
12975
  int ir = 0;
12326
12976
 
12327
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
12977
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
12328
12978
 
12329
12979
  const bool is_neox = mode & 2;
12330
12980
 
@@ -12335,12 +12985,15 @@ static void ggml_compute_forward_rope_back_f32(
12335
12985
  if (ir++ < ir0) continue;
12336
12986
  if (ir > ir1) break;
12337
12987
 
12338
- float theta = (float)p;
12988
+ float theta = freq_scale * (float)p;
12339
12989
 
12340
12990
  if (!is_neox) {
12341
12991
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12342
12992
  const float cos_theta = cosf(theta);
12343
12993
  const float sin_theta = sinf(theta);
12994
+ // zeta scaling for xPos only:
12995
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
12996
+ if (xpos_down) zeta = 1.0f / zeta;
12344
12997
 
12345
12998
  theta *= theta_scale;
12346
12999
 
@@ -12350,8 +13003,8 @@ static void ggml_compute_forward_rope_back_f32(
12350
13003
  const float dy0 = dy[0];
12351
13004
  const float dy1 = dy[1];
12352
13005
 
12353
- dx[0] = dy0*cos_theta + dy1*sin_theta;
12354
- dx[1] = - dy0*sin_theta + dy1*cos_theta;
13006
+ dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
13007
+ dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
12355
13008
  }
12356
13009
  } else {
12357
13010
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
@@ -13044,6 +13697,106 @@ static void ggml_compute_forward_conv_2d(
13044
13697
  }
13045
13698
  }
13046
13699
 
13700
+ // ggml_compute_forward_conv_transpose_2d
13701
+
13702
+ static void ggml_compute_forward_conv_transpose_2d(
13703
+ const struct ggml_compute_params * params,
13704
+ const struct ggml_tensor * src0,
13705
+ const struct ggml_tensor * src1,
13706
+ struct ggml_tensor * dst) {
13707
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13708
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13709
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13710
+
13711
+ int64_t t0 = ggml_perf_time_us();
13712
+ UNUSED(t0);
13713
+
13714
+ GGML_TENSOR_BINARY_OP_LOCALS;
13715
+
13716
+ const int ith = params->ith;
13717
+ const int nth = params->nth;
13718
+
13719
+ const int nk = ne00*ne01*ne02*ne03;
13720
+
13721
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13722
+ GGML_ASSERT(nb10 == sizeof(float));
13723
+
13724
+ if (params->type == GGML_TASK_INIT) {
13725
+ memset(params->wdata, 0, params->wsize);
13726
+
13727
+ // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
13728
+ {
13729
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13730
+
13731
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
13732
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
13733
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
13734
+ ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
13735
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
13736
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
13737
+ dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
13738
+ }
13739
+ }
13740
+ }
13741
+ }
13742
+ }
13743
+
13744
+ // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
13745
+ {
13746
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
13747
+ for (int i12 = 0; i12 < ne12; i12++) {
13748
+ for (int i11 = 0; i11 < ne11; i11++) {
13749
+ const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
13750
+ ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
13751
+ for (int i10 = 0; i10 < ne10; i10++) {
13752
+ dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
13753
+ }
13754
+ }
13755
+ }
13756
+ }
13757
+
13758
+ return;
13759
+ }
13760
+
13761
+ if (params->type == GGML_TASK_FINALIZE) {
13762
+ return;
13763
+ }
13764
+
13765
+ const int32_t stride = ggml_get_op_params_i32(dst, 0);
13766
+
13767
+ // total patches in dst
13768
+ const int np = ne2;
13769
+
13770
+ // patches per thread
13771
+ const int dp = (np + nth - 1)/nth;
13772
+
13773
+ // patch range for this thread
13774
+ const int ip0 = dp*ith;
13775
+ const int ip1 = MIN(ip0 + dp, np);
13776
+
13777
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13778
+ ggml_fp16_t * const wdata_src = wdata + nk;
13779
+
13780
+ for (int i2 = ip0; i2 < ip1; i2++) { // Cout
13781
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13782
+ ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
13783
+ for (int i11 = 0; i11 < ne11; i11++) {
13784
+ for (int i10 = 0; i10 < ne10; i10++) {
13785
+ const int i1n = i11*ne10*ne12 + i10*ne12;
13786
+ for (int i01 = 0; i01 < ne01; i01++) {
13787
+ for (int i00 = 0; i00 < ne00; i00++) {
13788
+ float v = 0;
13789
+ ggml_vec_dot_f16(ne03, &v,
13790
+ wdata_src + i1n,
13791
+ wdata_kernel + i01*ne00*ne03 + i00*ne03);
13792
+ dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
13793
+ }
13794
+ }
13795
+ }
13796
+ }
13797
+ }
13798
+ }
13799
+
13047
13800
  // ggml_compute_forward_pool_1d_sk_p0
13048
13801
 
13049
13802
  static void ggml_compute_forward_pool_1d_sk_p0(
@@ -13202,6 +13955,60 @@ static void ggml_compute_forward_pool_2d(
13202
13955
  ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
13203
13956
  }
13204
13957
 
13958
+ // ggml_compute_forward_upscale
13959
+
13960
+ static void ggml_compute_forward_upscale_f32(
13961
+ const struct ggml_compute_params * params,
13962
+ const struct ggml_tensor * src0,
13963
+ struct ggml_tensor * dst) {
13964
+
13965
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13966
+ return;
13967
+ }
13968
+
13969
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13970
+
13971
+ const int ith = params->ith;
13972
+
13973
+ GGML_TENSOR_UNARY_OP_LOCALS;
13974
+
13975
+ const int scale_factor = dst->op_params[0];
13976
+
13977
+ // TODO: optimize
13978
+
13979
+ for (int i03 = 0; i03 < ne03; i03++) {
13980
+ for (int i02 = ith; i02 < ne02; i02++) {
13981
+ for (int m = 0; m < dst->ne[1]; m++) {
13982
+ int i01 = m / scale_factor;
13983
+ for (int n = 0; n < dst->ne[0]; n++) {
13984
+ int i00 = n / scale_factor;
13985
+
13986
+ const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
13987
+
13988
+ float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
13989
+
13990
+ *y = *x;
13991
+ }
13992
+ }
13993
+ }
13994
+ }
13995
+ }
13996
+
13997
+ static void ggml_compute_forward_upscale(
13998
+ const struct ggml_compute_params * params,
13999
+ const struct ggml_tensor * src0,
14000
+ struct ggml_tensor * dst) {
14001
+ switch (src0->type) {
14002
+ case GGML_TYPE_F32:
14003
+ {
14004
+ ggml_compute_forward_upscale_f32(params, src0, dst);
14005
+ } break;
14006
+ default:
14007
+ {
14008
+ GGML_ASSERT(false);
14009
+ } break;
14010
+ }
14011
+ }
13205
14012
 
13206
14013
  // ggml_compute_forward_flash_attn
13207
14014
 
@@ -13331,7 +14138,7 @@ static void ggml_compute_forward_flash_attn_f32(
13331
14138
  vvexpf(S, S, &Mup);
13332
14139
  ggml_vec_sum_f32(Mup, &sum, S);
13333
14140
  #else
13334
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14141
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
13335
14142
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13336
14143
 
13337
14144
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13341,9 +14148,13 @@ static void ggml_compute_forward_flash_attn_f32(
13341
14148
  if (SS[j] == -INFINITY) {
13342
14149
  SS[j] = 0.0f;
13343
14150
  } else {
14151
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14152
+ const float val = expf(SS[j] - max);
14153
+ #else
13344
14154
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
13345
14155
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13346
14156
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14157
+ #endif
13347
14158
  sump[j] += (ggml_float)val;
13348
14159
  SS[j] = val;
13349
14160
  }
@@ -13921,7 +14732,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
13921
14732
  vvexpf(SM, SM, &Mup);
13922
14733
  ggml_vec_sum_f32(Mup, &sum, SM);
13923
14734
  #else
13924
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14735
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
13925
14736
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13926
14737
 
13927
14738
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13932,9 +14743,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
13932
14743
  if (SR[j] == -INFINITY) {
13933
14744
  SW[j] = 0.0f;
13934
14745
  } else {
14746
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14747
+ const float val = expf(SR[j] - max);
14748
+ #else
13935
14749
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
13936
14750
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13937
14751
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14752
+ #endif
13938
14753
  sump[j] += (ggml_float)val;
13939
14754
  SW[j] = val;
13940
14755
  }
@@ -14327,38 +15142,169 @@ static void ggml_compute_forward_unary(
14327
15142
  }
14328
15143
  }
14329
15144
 
14330
- // ggml_compute_forward_map_unary
15145
+ // ggml_compute_forward_get_rel_pos
14331
15146
 
14332
- static void ggml_compute_forward_map_unary_f32(
15147
+ static void ggml_compute_forward_get_rel_pos_f16(
14333
15148
  const struct ggml_compute_params * params,
14334
15149
  const struct ggml_tensor * src0,
14335
- struct ggml_tensor * dst,
14336
- const ggml_unary_op_f32_t fun) {
14337
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
14338
-
15150
+ struct ggml_tensor * dst) {
14339
15151
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14340
15152
  return;
14341
15153
  }
14342
15154
 
14343
- const int n = ggml_nrows(src0);
14344
- const int nc = src0->ne[0];
15155
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
14345
15156
 
14346
- assert( dst->nb[0] == sizeof(float));
14347
- assert(src0->nb[0] == sizeof(float));
15157
+ GGML_TENSOR_UNARY_OP_LOCALS;
14348
15158
 
14349
- for (int i = 0; i < n; i++) {
14350
- fun(nc,
14351
- (float *) ((char *) dst->data + i*( dst->nb[1])),
14352
- (float *) ((char *) src0->data + i*(src0->nb[1])));
15159
+ const int64_t w = ne1;
15160
+
15161
+ ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
15162
+ ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
15163
+
15164
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
15165
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
15166
+ const int64_t pos = (w - i1 - 1) + i2;
15167
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
15168
+ dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
15169
+ }
15170
+ }
14353
15171
  }
14354
15172
  }
14355
15173
 
14356
-
14357
- static void ggml_compute_forward_map_unary(
15174
+ static void ggml_compute_forward_get_rel_pos(
14358
15175
  const struct ggml_compute_params * params,
14359
15176
  const struct ggml_tensor * src0,
14360
- struct ggml_tensor * dst,
14361
- const ggml_unary_op_f32_t fun) {
15177
+ struct ggml_tensor * dst) {
15178
+ switch (src0->type) {
15179
+ case GGML_TYPE_F16:
15180
+ {
15181
+ ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
15182
+ } break;
15183
+ default:
15184
+ {
15185
+ GGML_ASSERT(false);
15186
+ } break;
15187
+ }
15188
+ }
15189
+
15190
+ // ggml_compute_forward_add_rel_pos
15191
+
15192
+ static void ggml_compute_forward_add_rel_pos_f32(
15193
+ const struct ggml_compute_params * params,
15194
+ const struct ggml_tensor * src0,
15195
+ const struct ggml_tensor * src1,
15196
+ const struct ggml_tensor * src2,
15197
+ struct ggml_tensor * dst) {
15198
+
15199
+ const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
15200
+ if (!inplace && params->type == GGML_TASK_INIT) {
15201
+ memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
15202
+ return;
15203
+ }
15204
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15205
+ return;
15206
+ }
15207
+
15208
+ int64_t t0 = ggml_perf_time_us();
15209
+ UNUSED(t0);
15210
+
15211
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
15212
+
15213
+ float * src1_data = (float *) src1->data;
15214
+ float * src2_data = (float *) src2->data;
15215
+ float * dst_data = (float *) dst->data;
15216
+
15217
+ const int64_t ne10 = src1->ne[0];
15218
+ const int64_t ne11 = src1->ne[1];
15219
+ const int64_t ne12 = src1->ne[2];
15220
+ const int64_t ne13 = src1->ne[3];
15221
+
15222
+ const int ith = params->ith;
15223
+ const int nth = params->nth;
15224
+
15225
+ // total patches in dst
15226
+ const int np = ne13;
15227
+
15228
+ // patches per thread
15229
+ const int dp = (np + nth - 1)/nth;
15230
+
15231
+ // patch range for this thread
15232
+ const int ip0 = dp*ith;
15233
+ const int ip1 = MIN(ip0 + dp, np);
15234
+
15235
+
15236
+ for (int64_t i13 = ip0; i13 < ip1; ++i13) {
15237
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
15238
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
15239
+ const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
15240
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
15241
+ const int64_t jp0 = jp1 + i10;
15242
+ const float src1_e = src1_data[jp0];
15243
+ const float src2_e = src2_data[jp0];
15244
+
15245
+ const int64_t jdh = jp0 * ne10;
15246
+ const int64_t jdw = jdh - (ne10 - 1) * i10;
15247
+
15248
+ for (int64_t j = 0; j < ne10; ++j) {
15249
+ dst_data[jdh + j ] += src2_e;
15250
+ dst_data[jdw + j*ne10] += src1_e;
15251
+ }
15252
+ }
15253
+ }
15254
+ }
15255
+ }
15256
+ }
15257
+
15258
+ static void ggml_compute_forward_add_rel_pos(
15259
+ const struct ggml_compute_params * params,
15260
+ const struct ggml_tensor * src0,
15261
+ const struct ggml_tensor * src1,
15262
+ const struct ggml_tensor * src2,
15263
+ struct ggml_tensor * dst) {
15264
+ switch (src0->type) {
15265
+ case GGML_TYPE_F32:
15266
+ {
15267
+ ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
15268
+ } break;
15269
+ default:
15270
+ {
15271
+ GGML_ASSERT(false);
15272
+ } break;
15273
+ }
15274
+ }
15275
+
15276
+ // ggml_compute_forward_map_unary
15277
+
15278
+ static void ggml_compute_forward_map_unary_f32(
15279
+ const struct ggml_compute_params * params,
15280
+ const struct ggml_tensor * src0,
15281
+ struct ggml_tensor * dst,
15282
+ const ggml_unary_op_f32_t fun) {
15283
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
15284
+
15285
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15286
+ return;
15287
+ }
15288
+
15289
+ const int n = ggml_nrows(src0);
15290
+ const int nc = src0->ne[0];
15291
+
15292
+ assert( dst->nb[0] == sizeof(float));
15293
+ assert(src0->nb[0] == sizeof(float));
15294
+
15295
+ for (int i = 0; i < n; i++) {
15296
+ fun(nc,
15297
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
15298
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
15299
+ }
15300
+ }
15301
+
15302
+
15303
+ static void ggml_compute_forward_map_unary(
15304
+ const struct ggml_compute_params * params,
15305
+ const struct ggml_tensor * src0,
15306
+ struct ggml_tensor * dst,
15307
+ const ggml_unary_op_f32_t fun) {
14362
15308
  switch (src0->type) {
14363
15309
  case GGML_TYPE_F32:
14364
15310
  {
@@ -14541,6 +15487,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14541
15487
  const int nc = src0->ne[0];
14542
15488
  const int nr = ggml_nrows(src0);
14543
15489
 
15490
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15491
+
14544
15492
  if (params->type == GGML_TASK_INIT) {
14545
15493
  if (ith == 0) {
14546
15494
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -14552,7 +15500,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14552
15500
  if (ith == 0) {
14553
15501
  float * dp = (float *) dst->data;
14554
15502
  ggml_vec_sum_f32(nth, dp, sums);
14555
- dp[0] *= -1.0f;
15503
+ dp[0] *= -1.0f / (float) nr;
14556
15504
  }
14557
15505
  return;
14558
15506
  }
@@ -14569,7 +15517,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14569
15517
  for (int i1 = ir0; i1 < ir1; i1++) {
14570
15518
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
14571
15519
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
14572
- float * st = (float *) params->wdata + nth + ith*nc;
15520
+ float * st = ((float *) params->wdata) + nth + ith*nc;
14573
15521
 
14574
15522
  #ifndef NDEBUG
14575
15523
  for (int i = 0; i < nc; ++i) {
@@ -14584,15 +15532,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14584
15532
  float max = -INFINITY;
14585
15533
  ggml_vec_max_f32(nc, &max, s0);
14586
15534
 
14587
- uint16_t scvt;
15535
+ uint16_t scvt; UNUSED(scvt);
14588
15536
  for (int i = 0; i < nc; i++) {
14589
15537
  if (s0[i] == -INFINITY) {
14590
15538
  st[i] = 0.0f;
14591
15539
  } else {
14592
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15540
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15541
+ const float s = s0[i] - max;
15542
+ const float val = expf(s);
15543
+ #else
14593
15544
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
14594
15545
  memcpy(&scvt, &s, sizeof(scvt));
14595
15546
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15547
+ #endif
14596
15548
  sum += (ggml_float)val;
14597
15549
  st[i] = val;
14598
15550
  }
@@ -14608,7 +15560,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14608
15560
  ggml_vec_log_f32(nc, st, st);
14609
15561
  ggml_vec_mul_f32(nc, st, st, s1);
14610
15562
 
14611
- ggml_vec_sum_f32(nc, sums + ith, st);
15563
+ float st_sum = 0;
15564
+ ggml_vec_sum_f32(nc, &st_sum, st);
15565
+ sums[ith] += st_sum;
14612
15566
 
14613
15567
  #ifndef NDEBUG
14614
15568
  for (int i = 0; i < nc; ++i) {
@@ -14658,7 +15612,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14658
15612
  return;
14659
15613
  }
14660
15614
 
14661
- const float eps = 1e-9f;
15615
+ const double eps = 1e-9;
14662
15616
 
14663
15617
  // TODO: handle transposed/permuted matrices
14664
15618
  const int64_t nc = src0->ne[0];
@@ -14677,7 +15631,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14677
15631
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
14678
15632
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
14679
15633
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
14680
- float * sm = (float *) params->wdata + ith*nc;
14681
15634
 
14682
15635
  #ifndef NDEBUG
14683
15636
  for (int i = 0; i < nc; ++i) {
@@ -14686,54 +15639,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14686
15639
  assert(!isnan(s1[i]));
14687
15640
  }
14688
15641
  #endif
14689
- // step by step explanation:
14690
- {
14691
- //float * sums = (float *) params->wdata;
14692
-
14693
- // forward pass with annotated gradients from backward pass
14694
- // (built by going in reverse operation order, adding to gradients of current operation args)
14695
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
14696
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
14697
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
14698
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
14699
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
14700
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
14701
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
14702
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
14703
-
14704
- // substitute into grad[st1], because we can reuse softmax_back from this point on
14705
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
14706
- // postorder:
14707
- // grad[st1] := softmax(s0)
14708
- // grad[st1] := grad[st1]*(1.0 - eps)
14709
- // grad[st1] := grad[st1] + eps
14710
- // grad[st1] := s1 / grad[st1]
14711
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
14712
-
14713
- // src0 gradients by going through softmax_back
14714
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
14715
- // from softmax_back:
14716
- // dxk = yk * (dyk - dot(y, dy))
14717
- // dot_y_dy := dot(y, dy)
14718
- // dx := dy
14719
- // dx := dx - dot_y_dy
14720
- // dx := dx * y
14721
- // postorder:
14722
- // dot_st1_dst1 := dot(st1, grad[st1])
14723
- // grad[s0] := grad[st1]
14724
- // grad[s0] := grad[s0] - dot_st1_dst1
14725
- // grad[s0] := grad[s0] * st1
14726
-
14727
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
14728
- // sm := softmax(s0)
14729
- // grad[s0] := sm*(1.0 - eps)
14730
- // grad[s0] := grad[s0] + eps
14731
- // grad[s0] := s1 / grad[s0]
14732
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
14733
- // dot_st1_dst1 := dot(sm, grad[s0])
14734
- // grad[s0] := grad[s0] - dot_st1_dst1
14735
- // grad[s0] := grad[s0] * sm
14736
- }
14737
15642
 
14738
15643
  // soft_max
14739
15644
  ggml_float sum = 0.0;
@@ -14741,39 +15646,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14741
15646
  float max = -INFINITY;
14742
15647
  ggml_vec_max_f32(nc, &max, s0);
14743
15648
 
14744
- uint16_t scvt;
15649
+ uint16_t scvt; UNUSED(scvt);
14745
15650
  for (int i = 0; i < nc; i++) {
14746
15651
  if (s0[i] == -INFINITY) {
14747
- sm[i] = 0.0f;
15652
+ ds0[i] = 0.0f;
14748
15653
  } else {
14749
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15654
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15655
+ const float s = s0[i] - max;
15656
+ const float val = expf(s);
15657
+ #else
14750
15658
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
14751
15659
  memcpy(&scvt, &s, sizeof(scvt));
14752
15660
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15661
+ #endif
14753
15662
  sum += (ggml_float)val;
14754
- sm[i] = val;
15663
+ ds0[i] = val;
14755
15664
  }
14756
15665
  }
14757
15666
 
14758
15667
  assert(sum > 0.0);
14759
- sum = 1.0/sum;
15668
+ sum = (1.0 - eps)/sum;
14760
15669
  }
14761
15670
 
14762
- float dot_st1_dst1 = 0;
14763
- ggml_vec_scale_f32(nc, sm, sum);
14764
- ggml_vec_cpy_f32 (nc, ds0, sm);
14765
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
14766
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
14767
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
14768
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
14769
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
14770
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
14771
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
15671
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
15672
+ ggml_vec_scale_f32(nc, ds0, sum);
15673
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
15674
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
15675
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
15676
+
14772
15677
 
14773
15678
  #ifndef NDEBUG
14774
15679
  for (int i = 0; i < nc; ++i) {
14775
- assert(!isnan(sm[i]));
14776
- assert(!isinf(sm[i]));
14777
15680
  assert(!isnan(ds0[i]));
14778
15681
  assert(!isinf(ds0[i]));
14779
15682
  }
@@ -14879,6 +15782,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14879
15782
  {
14880
15783
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14881
15784
  } break;
15785
+ case GGML_OP_CONCAT:
15786
+ {
15787
+ ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
15788
+ } break;
14882
15789
  case GGML_OP_SILU_BACK:
14883
15790
  {
14884
15791
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15802,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14895
15802
  {
14896
15803
  ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
14897
15804
  } break;
15805
+ case GGML_OP_GROUP_NORM:
15806
+ {
15807
+ ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
15808
+ } break;
14898
15809
  case GGML_OP_MUL_MAT:
14899
15810
  {
14900
15811
  ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15898,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14987
15898
  {
14988
15899
  ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14989
15900
  } break;
15901
+ case GGML_OP_CONV_TRANSPOSE_2D:
15902
+ {
15903
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
15904
+ } break;
14990
15905
  case GGML_OP_POOL_1D:
14991
15906
  {
14992
15907
  ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15910,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14995
15910
  {
14996
15911
  ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
14997
15912
  } break;
15913
+ case GGML_OP_UPSCALE:
15914
+ {
15915
+ ggml_compute_forward_upscale(params, tensor->src[0], tensor);
15916
+ } break;
14998
15917
  case GGML_OP_FLASH_ATTN:
14999
15918
  {
15000
15919
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15944,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15025
15944
  {
15026
15945
  ggml_compute_forward_unary(params, tensor->src[0], tensor);
15027
15946
  } break;
15947
+ case GGML_OP_GET_REL_POS:
15948
+ {
15949
+ ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
15950
+ } break;
15951
+ case GGML_OP_ADD_REL_POS:
15952
+ {
15953
+ ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15954
+ } break;
15028
15955
  case GGML_OP_MAP_UNARY:
15029
15956
  {
15030
15957
  ggml_unary_op_f32_t fun;
@@ -15288,6 +16215,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15288
16215
  inplace);
15289
16216
  }
15290
16217
  } break;
16218
+ case GGML_OP_CONCAT:
16219
+ {
16220
+ GGML_ASSERT(false); // TODO: implement
16221
+ } break;
15291
16222
  case GGML_OP_SILU_BACK:
15292
16223
  {
15293
16224
  GGML_ASSERT(false); // TODO: not implemented
@@ -15300,9 +16231,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15300
16231
  {
15301
16232
  // necessary for llama
15302
16233
  if (src0->grad) {
16234
+ float eps;
16235
+ memcpy(&eps, tensor->op_params, sizeof(float));
16236
+
15303
16237
  src0->grad = ggml_add_impl(ctx,
15304
16238
  src0->grad,
15305
- ggml_rms_norm_back(ctx, src0, tensor->grad),
16239
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
15306
16240
  inplace);
15307
16241
  }
15308
16242
  } break;
@@ -15310,6 +16244,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15310
16244
  {
15311
16245
  GGML_ASSERT(false); // TODO: not implemented
15312
16246
  } break;
16247
+ case GGML_OP_GROUP_NORM:
16248
+ {
16249
+ GGML_ASSERT(false); // TODO: not implemented
16250
+ } break;
15313
16251
  case GGML_OP_MUL_MAT:
15314
16252
  {
15315
16253
  // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16522,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15584
16522
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15585
16523
  const int mode = ((int32_t *) tensor->op_params)[2];
15586
16524
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16525
+ float freq_base;
16526
+ float freq_scale;
16527
+ float xpos_base;
16528
+ bool xpos_down;
16529
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16530
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16531
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16532
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16533
+
15587
16534
  src0->grad = ggml_add_impl(ctx,
15588
16535
  src0->grad,
15589
16536
  ggml_rope_back(ctx,
@@ -15591,7 +16538,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15591
16538
  n_past,
15592
16539
  n_dims,
15593
16540
  mode,
15594
- n_ctx),
16541
+ n_ctx,
16542
+ freq_base,
16543
+ freq_scale,
16544
+ xpos_base,
16545
+ xpos_down),
15595
16546
  inplace);
15596
16547
  }
15597
16548
  } break;
@@ -15602,14 +16553,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15602
16553
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15603
16554
  const int mode = ((int32_t *) tensor->op_params)[2];
15604
16555
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16556
+ float freq_base;
16557
+ float freq_scale;
16558
+ float xpos_base;
16559
+ bool xpos_down;
16560
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16561
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16562
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16563
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16564
+
15605
16565
  src0->grad = ggml_add_impl(ctx,
15606
16566
  src0->grad,
15607
- ggml_rope(ctx,
16567
+ ggml_rope_impl(ctx,
15608
16568
  tensor->grad,
15609
16569
  n_past,
15610
16570
  n_dims,
15611
16571
  mode,
15612
- n_ctx),
16572
+ n_ctx,
16573
+ freq_base,
16574
+ freq_scale,
16575
+ xpos_base,
16576
+ xpos_down,
16577
+ false),
15613
16578
  inplace);
15614
16579
  }
15615
16580
  } break;
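
Both rope branches above recover freq_base, freq_scale, xpos_base and xpos_down by memcpy-ing them out of slots 4..7 of the int32-based op_params array. A hedged sketch of the corresponding packing side (the fixed-size array and the helper name are illustrative assumptions inferred from the memcpy offsets, not library API):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

// slots 0..3 hold the integer arguments, slots 4..7 hold bit-copied float/bool values
static void pack_rope_op_params(int32_t params[8],
                                int n_past, int n_dims, int mode, int n_ctx,
                                float freq_base, float freq_scale,
                                float xpos_base, bool xpos_down) {
    params[0] = n_past;
    params[1] = n_dims;
    params[2] = mode;
    params[3] = n_ctx;
    memcpy(&params[4], &freq_base,  sizeof(float));
    memcpy(&params[5], &freq_scale, sizeof(float));
    memcpy(&params[6], &xpos_base,  sizeof(float));
    memcpy(&params[7], &xpos_down,  sizeof(bool));
}
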
@@ -15629,6 +16594,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15629
16594
  {
15630
16595
  GGML_ASSERT(false); // TODO: not implemented
15631
16596
  } break;
16597
+ case GGML_OP_CONV_TRANSPOSE_2D:
16598
+ {
16599
+ GGML_ASSERT(false); // TODO: not implemented
16600
+ } break;
15632
16601
  case GGML_OP_POOL_1D:
15633
16602
  {
15634
16603
  GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16606,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15637
16606
  {
15638
16607
  GGML_ASSERT(false); // TODO: not implemented
15639
16608
  } break;
16609
+ case GGML_OP_UPSCALE:
16610
+ {
16611
+ GGML_ASSERT(false); // TODO: not implemented
16612
+ } break;
15640
16613
  case GGML_OP_FLASH_ATTN:
15641
16614
  {
15642
16615
  struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16851,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15878
16851
  GGML_ASSERT(false);
15879
16852
  }
15880
16853
  } break;
16854
+ case GGML_OP_GET_REL_POS:
16855
+ case GGML_OP_ADD_REL_POS:
15881
16856
  case GGML_OP_MAP_UNARY:
15882
16857
  case GGML_OP_MAP_BINARY:
15883
16858
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16029,9 +17004,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16029
17004
  return result;
16030
17005
  }
16031
17006
 
16032
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
16033
- struct ggml_cgraph result = *gf;
16034
-
17007
+ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
16035
17008
  GGML_ASSERT(gf->n_nodes > 0);
16036
17009
 
16037
17010
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16055,15 +17028,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16055
17028
  }
16056
17029
  }
16057
17030
 
16058
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
17031
+ for (int i = 0; i < gf->n_nodes; i++) {
16059
17032
  struct ggml_tensor * node = gf->nodes[i];
16060
17033
 
16061
17034
  if (node->is_param) {
16062
17035
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16063
- ggml_build_forward_expand(&result, node->grad);
17036
+ ggml_build_forward_expand(gb, node->grad);
16064
17037
  }
16065
17038
  }
17039
+ }
16066
17040
 
17041
+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
17042
+ struct ggml_cgraph result = *gf;
17043
+ ggml_build_backward_expand(ctx, gf, &result, keep);
16067
17044
  return result;
16068
17045
  }
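
With this refactor, ggml_build_backward is a thin wrapper: callers that manage graph storage themselves can use the new expand variant directly. A minimal usage sketch (the loss tensor and the ggml context are assumed to exist already):

struct ggml_cgraph gf = ggml_build_forward(loss);
struct ggml_cgraph gb = gf;                              // start from a copy of the forward graph
ggml_build_backward_expand(ctx, &gf, &gb, /*keep =*/ true);
// gb now also contains the gradient nodes of every parameter tensor
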
16069
17046
 
@@ -16382,7 +17359,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16382
17359
 
16383
17360
  size_t cur = 0;
16384
17361
  if (ggml_is_quantized(node->type)) {
16385
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
17362
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16386
17363
  }
16387
17364
 
16388
17365
  work_size = MAX(work_size, cur);
@@ -16395,7 +17372,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16395
17372
  size_t cur = 0;
16396
17373
 
16397
17374
  if (ggml_is_quantized(node->src[0]->type)) {
16398
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
17375
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16399
17376
  }
16400
17377
 
16401
17378
  work_size = MAX(work_size, cur);
@@ -16407,7 +17384,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16407
17384
  size_t cur = 0;
16408
17385
 
16409
17386
  if (ggml_is_quantized(node->src[0]->type)) {
16410
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
17387
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16411
17388
  }
16412
17389
 
16413
17390
  work_size = MAX(work_size, cur);
@@ -16454,9 +17431,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16454
17431
  case GGML_OP_NORM:
16455
17432
  case GGML_OP_RMS_NORM:
16456
17433
  case GGML_OP_RMS_NORM_BACK:
17434
+ case GGML_OP_GROUP_NORM:
16457
17435
  {
16458
17436
  n_tasks = n_threads;
16459
17437
  } break;
17438
+ case GGML_OP_CONCAT:
16460
17439
  case GGML_OP_MUL_MAT:
16461
17440
  case GGML_OP_OUT_PROD:
16462
17441
  {
@@ -16490,12 +17469,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16490
17469
  // the threads are still spinning
16491
17470
  if (node->src[0]->type != GGML_TYPE_F32) {
16492
17471
  // here we need memory just for a single 2D matrix from src0
16493
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
17472
+ cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
16494
17473
  }
16495
17474
  } else
16496
17475
  #endif
16497
17476
  if (node->src[1]->type != vec_dot_type) {
16498
- cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
17477
+ cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16499
17478
  } else {
16500
17479
  cur = 0;
16501
17480
  }
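
These hunks only switch from the old GGML_TYPE_SIZE/GGML_BLCK_SIZE tables to the ggml_type_size()/ggml_blck_size() accessors; the sizing rule itself is unchanged: a buffer holding n elements of a (possibly quantized) type takes type_size bytes per block of blck_size elements. A sketch of that rule (the helper is illustrative and assumes n is a multiple of the block size):

static size_t converted_buffer_size(int64_t n, enum ggml_type type) {
    // e.g. a Q8_0 block stores 32 elements in 34 bytes (2-byte scale + 32 int8 values)
    return ggml_type_size(type) * (size_t) (n / ggml_blck_size(type));
}
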
@@ -16524,6 +17503,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16524
17503
  case GGML_OP_SOFT_MAX_BACK:
16525
17504
  case GGML_OP_ROPE:
16526
17505
  case GGML_OP_ROPE_BACK:
17506
+ case GGML_OP_ADD_REL_POS:
16527
17507
  {
16528
17508
  n_tasks = n_threads;
16529
17509
  } break;
@@ -16598,6 +17578,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16598
17578
  GGML_ASSERT(false);
16599
17579
  }
16600
17580
 
17581
+ work_size = MAX(work_size, cur);
17582
+ } break;
17583
+ case GGML_OP_CONV_TRANSPOSE_2D:
17584
+ {
17585
+ n_tasks = n_threads;
17586
+
17587
+ const int64_t ne00 = node->src[0]->ne[0]; // W
17588
+ const int64_t ne01 = node->src[0]->ne[1]; // H
17589
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
17590
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
17591
+
17592
+ const int64_t ne10 = node->src[1]->ne[0]; // W
17593
+ const int64_t ne11 = node->src[1]->ne[1]; // H
17594
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
17595
+
17596
+ size_t cur = 0;
17597
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
17598
+ cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
17599
+
16601
17600
  work_size = MAX(work_size, cur);
16602
17601
  } break;
16603
17602
  case GGML_OP_POOL_1D:
@@ -16605,6 +17604,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16605
17604
  {
16606
17605
  n_tasks = 1;
16607
17606
  } break;
17607
+ case GGML_OP_UPSCALE:
17608
+ {
17609
+ n_tasks = n_threads;
17610
+ } break;
16608
17611
  case GGML_OP_FLASH_ATTN:
16609
17612
  {
16610
17613
  n_tasks = n_threads;
@@ -16666,6 +17669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16666
17669
  } break;
16667
17670
  case GGML_OP_WIN_PART:
16668
17671
  case GGML_OP_WIN_UNPART:
17672
+ case GGML_OP_GET_REL_POS:
16669
17673
  case GGML_OP_MAP_UNARY:
16670
17674
  case GGML_OP_MAP_BINARY:
16671
17675
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16712,10 +17716,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16712
17716
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16713
17717
  {
16714
17718
  n_tasks = n_threads;
16715
-
16716
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
16717
-
16718
- work_size = MAX(work_size, cur);
16719
17719
  } break;
16720
17720
  case GGML_OP_NONE:
16721
17721
  {
@@ -16783,8 +17783,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16783
17783
 
16784
17784
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
16785
17785
  GGML_ASSERT(rc == 0);
17786
+ UNUSED(rc);
16786
17787
  }
16787
17788
  }
17789
+
16788
17790
  workers[0].ith = 0;
16789
17791
  workers[0].shared = &state_shared;
16790
17792
 
@@ -16900,7 +17902,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16900
17902
  // compute size of intermediate results
16901
17903
  // TODO: does not take into account scratch buffers !!!!
16902
17904
  for (int i = 0; i < cgraph->n_nodes; ++i) {
16903
- size_eval += ggml_nbytes(cgraph->nodes[i]);
17905
+ size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
16904
17906
  }
16905
17907
 
16906
17908
  // print
@@ -17591,14 +18593,16 @@ static enum ggml_opt_result ggml_opt_adam(
17591
18593
  struct ggml_opt_params params,
17592
18594
  struct ggml_tensor * f,
17593
18595
  struct ggml_cgraph * gf,
17594
- struct ggml_cgraph * gb) {
18596
+ struct ggml_cgraph * gb,
18597
+ ggml_opt_callback callback,
18598
+ void * callback_data) {
17595
18599
  GGML_ASSERT(ggml_is_scalar(f));
17596
18600
 
17597
18601
  // these will store the parameters we want to optimize
17598
18602
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
17599
18603
 
17600
18604
  int np = 0;
17601
- int nx = 0;
18605
+ int64_t nx = 0;
17602
18606
  for (int i = 0; i < gf->n_nodes; ++i) {
17603
18607
  if (gf->nodes[i]->is_param) {
17604
18608
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -17617,31 +18621,32 @@ static enum ggml_opt_result ggml_opt_adam(
17617
18621
  }
17618
18622
 
17619
18623
  // constants
17620
- const float sched = params.adam.sched;
17621
- const float decay = params.adam.decay * sched;
17622
- const float alpha = params.adam.alpha * sched;
18624
+ float sched = params.adam.sched;
18625
+ const float alpha = params.adam.alpha;
18626
+ const float decay = params.adam.decay * alpha;
17623
18627
  const float beta1 = params.adam.beta1;
17624
18628
  const float beta2 = params.adam.beta2;
17625
18629
  const float eps = params.adam.eps;
18630
+ const float gclip = params.adam.gclip;
18631
+ const int decay_min_ndim = params.adam.decay_min_ndim;
17626
18632
 
17627
- float * x = opt->adam.x->data; // view of the parameters
17628
- float * g1 = opt->adam.g1->data; // gradient
17629
- float * g2 = opt->adam.g2->data; // gradient squared
17630
18633
  float * m = opt->adam.m->data; // first moment
17631
18634
  float * v = opt->adam.v->data; // second moment
17632
- float * mh = opt->adam.mh->data; // first moment hat
17633
- float * vh = opt->adam.vh->data; // second moment hat
17634
18635
 
17635
18636
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
17636
18637
 
17637
- // update view
17638
- ggml_opt_get_params(np, ps, x);
18638
+ if (callback) {
18639
+ callback(callback_data, &sched);
18640
+ }
17639
18641
 
17640
18642
  // compute the function value
17641
18643
  ggml_graph_reset (gf);
17642
18644
  ggml_set_f32 (f->grad, 1.0f);
17643
18645
 
17644
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18646
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18647
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18648
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18649
+ ggml_graph_compute(gb, &cplan);
17645
18650
 
17646
18651
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
17647
18652
  opt->adam.fx_best = opt->adam.fx_prev;
@@ -17649,6 +18654,9 @@ static enum ggml_opt_result ggml_opt_adam(
17649
18654
  pf[opt->iter % params.past] = opt->adam.fx_prev;
17650
18655
  }
17651
18656
 
18657
+ opt->loss_before = opt->adam.fx_prev;
18658
+ opt->loss_after = opt->adam.fx_prev;
18659
+
17652
18660
  // initialize
17653
18661
  if (opt->just_initialized) {
17654
18662
  opt->adam.n_no_improvement = 0;
@@ -17681,50 +18689,55 @@ static enum ggml_opt_result ggml_opt_adam(
17681
18689
  UNUSED(t_start_cpu);
17682
18690
 
17683
18691
  {
17684
- // update the gradient
17685
- ggml_opt_get_grad(np, ps, g1);
17686
-
17687
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
17688
- ggml_vec_scale_f32(nx, m, beta1);
17689
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
17690
-
17691
- // g2 = g1^2
17692
- ggml_vec_sqr_f32 (nx, g2, g1);
17693
-
17694
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
17695
- ggml_vec_scale_f32(nx, v, beta2);
17696
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
17697
-
17698
- // m^hat = m_t / (1 - beta1^t)
17699
- // v^hat = v_t / (1 - beta2^t)
17700
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
17701
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
17702
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
17703
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
17704
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
17705
- ggml_vec_cpy_f32 (nx, mh, m);
17706
- ggml_vec_cpy_f32 (nx, vh, v);
17707
-
17708
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
17709
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
17710
-
17711
- ggml_vec_sqrt_f32 (nx, vh, vh);
17712
- ggml_vec_acc1_f32 (nx, vh, eps);
17713
-
17714
- ggml_vec_div_f32 (nx, mh, mh, vh);
17715
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
17716
- ggml_vec_sub_f32 (nx, x, x, mh);
18692
+ float gnorm = 1.0f;
18693
+ if (gclip > 0.0f) {
18694
+ // gradient clipping
18695
+ ggml_float sum = 0.0;
18696
+ for (int p = 0; p < np; ++p) {
18697
+ const int64_t ne = ggml_nelements(ps[p]);
18698
+ for (int64_t j = 0; j < ne; ++j) {
18699
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
18700
+ sum += (ggml_float)(g*g);
18701
+ }
18702
+ }
18703
+ ggml_float norm = sqrt(sum);
18704
+ if (norm > (ggml_float) gclip) {
18705
+ gnorm = (float) ((ggml_float) gclip / norm);
18706
+ }
18707
+ }
18708
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
18709
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
18710
+ int64_t i = 0;
18711
+ for (int p = 0; p < np; ++p) {
18712
+ const int64_t ne = ggml_nelements(ps[p]);
18713
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
18714
+ for (int64_t j = 0; j < ne; ++j) {
18715
+ float x = ggml_get_f32_1d(ps[p], j);
18716
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
18717
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
18718
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
18719
+ float mh = m[i]*beta1h;
18720
+ float vh = v[i]*beta2h;
18721
+ vh = sqrtf(vh) + eps;
18722
+ x = x*(1.0f - p_decay) - mh/vh;
18723
+ ggml_set_f32_1d(ps[p], j, x);
18724
+ ++i;
18725
+ }
18726
+ }
18727
+ }
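
Pulled out of the diff context, the new inner loop is an AdamW-style update with optional global-norm gradient clipping and the bias corrections folded into beta1h/beta2h. A standalone sketch over plain arrays (the per-tensor decay_min_ndim gating is collapsed into a single decay factor here; this is illustrative, not the library code):

#include <math.h>
#include <stddef.h>

static void adam_step(size_t nx, float * x, const float * g0, float * m, float * v,
                      int iter, float alpha, float sched, float beta1, float beta2,
                      float eps, float decay, float gclip) {
    // gradient clipping by global L2 norm
    float gnorm = 1.0f;
    if (gclip > 0.0f) {
        double sum = 0.0;
        for (size_t i = 0; i < nx; ++i) { sum += (double) g0[i]*g0[i]; }
        const double norm = sqrt(sum);
        if (norm > (double) gclip) { gnorm = (float) ((double) gclip / norm); }
    }

    const float beta1h = alpha*sched/(1.0f - powf(beta1, iter)); // step size with bias correction
    const float beta2h =        1.0f/(1.0f - powf(beta2, iter));

    for (size_t i = 0; i < nx; ++i) {
        const float g = g0[i]*gnorm;
        m[i] = m[i]*beta1 + g*(1.0f - beta1);
        v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
        const float mh = m[i]*beta1h;
        const float vh = sqrtf(v[i]*beta2h) + eps;
        x[i] = x[i]*(1.0f - decay*sched) - mh/vh; // decoupled weight decay, then the Adam step
    }
}
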
17717
18728
 
17718
- // update the parameters
17719
- ggml_opt_set_params(np, ps, x);
18729
+ if (callback) {
18730
+ callback(callback_data, &sched);
17720
18731
  }
17721
18732
 
17722
18733
  ggml_graph_reset (gf);
17723
18734
  ggml_set_f32 (f->grad, 1.0f);
17724
18735
 
17725
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18736
+ ggml_graph_compute(gb, &cplan);
17726
18737
 
17727
18738
  const float fx = ggml_get_f32_1d(f, 0);
18739
+ opt->loss_after = fx;
18740
+
17728
18741
 
17729
18742
  // check convergence
17730
18743
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
@@ -17793,7 +18806,6 @@ struct ggml_lbfgs_iteration_data {
17793
18806
  };
17794
18807
 
17795
18808
  static enum ggml_opt_result linesearch_backtracking(
17796
- struct ggml_context * ctx,
17797
18809
  const struct ggml_opt_params * params,
17798
18810
  int nx,
17799
18811
  float * x,
@@ -17805,8 +18817,11 @@ static enum ggml_opt_result linesearch_backtracking(
17805
18817
  struct ggml_tensor * f,
17806
18818
  struct ggml_cgraph * gf,
17807
18819
  struct ggml_cgraph * gb,
18820
+ struct ggml_cplan * cplan,
17808
18821
  const int np,
17809
- struct ggml_tensor * ps[]) {
18822
+ struct ggml_tensor * ps[],
18823
+ ggml_opt_callback callback,
18824
+ void * callback_data) {
17810
18825
  int count = 0;
17811
18826
 
17812
18827
  float width = 0.0f;
@@ -17835,6 +18850,12 @@ static enum ggml_opt_result linesearch_backtracking(
17835
18850
  dgtest = params->lbfgs.ftol*dginit;
17836
18851
 
17837
18852
  while (true) {
18853
+ if (callback) {
18854
+ // L-BFGS does not support a learning rate -> ignore the learning schedule

18855
+ float sched = 0;
18856
+ callback(callback_data, &sched);
18857
+ }
18858
+
17838
18859
  ggml_vec_cpy_f32(nx, x, xp);
17839
18860
  ggml_vec_mad_f32(nx, x, d, *step);
17840
18861
 
@@ -17845,7 +18866,7 @@ static enum ggml_opt_result linesearch_backtracking(
17845
18866
  ggml_graph_reset (gf);
17846
18867
  ggml_set_f32 (f->grad, 1.0f);
17847
18868
 
17848
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
18869
+ ggml_graph_compute(gb, cplan);
17849
18870
 
17850
18871
  ggml_opt_get_grad(np, ps, g);
17851
18872
 
@@ -17905,7 +18926,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17905
18926
  struct ggml_opt_params params,
17906
18927
  struct ggml_tensor * f,
17907
18928
  struct ggml_cgraph * gf,
17908
- struct ggml_cgraph * gb) {
18929
+ struct ggml_cgraph * gb,
18930
+ ggml_opt_callback callback,
18931
+ void * callback_data) {
17909
18932
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
17910
18933
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
17911
18934
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -17937,6 +18960,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17937
18960
  opt->iter = iter;
17938
18961
  }
17939
18962
 
18963
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18964
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18965
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18966
+
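
As in the Adam path, the work buffer for the backward graph is now planned once up front and reused for every evaluation instead of calling ggml_graph_compute_with_ctx inside the loop. The pattern, written out on its own (a sketch of the lines above; ggml_new_object and the direct mem_buffer access are internal to ggml.c, so this only applies inside the library):

struct ggml_cplan cplan = ggml_graph_plan(gb, n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *) ctx->mem_buffer + obj->offs;
// ... later, once per evaluation:
ggml_graph_compute(gb, &cplan);
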
17940
18967
  float * x = opt->lbfgs.x->data; // current parameters
17941
18968
  float * xp = opt->lbfgs.xp->data; // previous parameters
17942
18969
  float * g = opt->lbfgs.g->data; // current gradient
@@ -17958,6 +18985,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17958
18985
  float * lm_s = opt->lbfgs.lms->data;
17959
18986
  float * lm_y = opt->lbfgs.lmy->data;
17960
18987
 
18988
+ if (callback) {
18989
+ // L-BFGS does not support a learning rate -> ignore the learning schedule
18990
+ float sched = 0;
18991
+ callback(callback_data, &sched);
18992
+ }
18993
+
17961
18994
  // evaluate the function value and its gradient
17962
18995
  {
17963
18996
  ggml_opt_set_params(np, ps, x);
@@ -17965,11 +18998,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17965
18998
  ggml_graph_reset (gf);
17966
18999
  ggml_set_f32 (f->grad, 1.0f);
17967
19000
 
17968
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
19001
+ ggml_graph_compute(gb, &cplan);
17969
19002
 
17970
19003
  ggml_opt_get_grad(np, ps, g);
17971
19004
 
17972
19005
  fx = ggml_get_f32_1d(f, 0);
19006
+
19007
+ opt->loss_before = fx;
19008
+ opt->loss_after = fx;
17973
19009
  }
17974
19010
 
17975
19011
  // search direction = -gradient
@@ -18024,7 +19060,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18024
19060
  ggml_vec_cpy_f32(nx, xp, x);
18025
19061
  ggml_vec_cpy_f32(nx, gp, g);
18026
19062
 
18027
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
19063
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
18028
19064
 
18029
19065
  if (ls < 0) {
18030
19066
  // linesearch failed - go back to the previous point and return
@@ -18034,6 +19070,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18034
19070
  return ls;
18035
19071
  }
18036
19072
 
19073
+ opt->loss_after = fx;
19074
+
18037
19075
  ggml_vec_norm_f32(nx, &xnorm, x);
18038
19076
  ggml_vec_norm_f32(nx, &gnorm, g);
18039
19077
 
@@ -18091,7 +19129,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18091
19129
  // ys = y^t \cdot s -> 1 / \rho.
18092
19130
  // yy = y^t \cdot y.
18093
19131
  //
18094
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
19132
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
18095
19133
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
18096
19134
 
18097
19135
  lm_ys[end[0]] = ys;
@@ -18154,13 +19192,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18154
19192
  .adam = {
18155
19193
  .n_iter = 10000,
18156
19194
  .sched = 1.000f,
18157
- .decay = 0.001f,
19195
+ .decay = 0.0f,
19196
+ .decay_min_ndim = 2,
18158
19197
  .alpha = 0.001f,
18159
19198
  .beta1 = 0.9f,
18160
19199
  .beta2 = 0.999f,
18161
19200
  .eps = 1e-8f,
18162
19201
  .eps_f = 1e-5f,
18163
19202
  .eps_g = 1e-3f,
19203
+ .gclip = 0.0f,
18164
19204
  },
18165
19205
  };
18166
19206
  } break;
@@ -18210,23 +19250,13 @@ GGML_API void ggml_opt_init(
18210
19250
  switch (opt->params.type) {
18211
19251
  case GGML_OPT_ADAM:
18212
19252
  {
18213
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18214
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18215
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18216
19253
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18217
19254
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18218
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18219
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
18220
19255
  opt->adam.pf = params.past > 0
18221
19256
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
18222
19257
  : NULL;
18223
- ggml_set_zero(opt->adam.x);
18224
- ggml_set_zero(opt->adam.g1);
18225
- ggml_set_zero(opt->adam.g2);
18226
19258
  ggml_set_zero(opt->adam.m);
18227
19259
  ggml_set_zero(opt->adam.v);
18228
- ggml_set_zero(opt->adam.mh);
18229
- ggml_set_zero(opt->adam.vh);
18230
19260
  if (opt->adam.pf) {
18231
19261
  ggml_set_zero(opt->adam.pf);
18232
19262
  }
@@ -18301,8 +19331,8 @@ enum ggml_opt_result ggml_opt_resume(
18301
19331
  struct ggml_tensor * f) {
18302
19332
 
18303
19333
  // build forward + backward compute graphs
18304
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
18305
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
19334
+ struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
19335
+ struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18306
19336
 
18307
19337
  struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18308
19338
  struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
@@ -18310,7 +19340,7 @@ enum ggml_opt_result ggml_opt_resume(
18310
19340
  *gf = ggml_build_forward (f);
18311
19341
  *gb = ggml_build_backward(ctx, gf, true);
18312
19342
 
18313
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
19343
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
18314
19344
  }
18315
19345
 
18316
19346
  enum ggml_opt_result ggml_opt_resume_g(
@@ -18318,7 +19348,9 @@ enum ggml_opt_result ggml_opt_resume_g(
18318
19348
  struct ggml_opt_context * opt,
18319
19349
  struct ggml_tensor * f,
18320
19350
  struct ggml_cgraph * gf,
18321
- struct ggml_cgraph * gb) {
19351
+ struct ggml_cgraph * gb,
19352
+ ggml_opt_callback callback,
19353
+ void * callback_data) {
18322
19354
 
18323
19355
  // build forward + backward compute graphs
18324
19356
  enum ggml_opt_result result = GGML_OPT_OK;
@@ -18326,11 +19358,11 @@ enum ggml_opt_result ggml_opt_resume_g(
18326
19358
  switch (opt->params.type) {
18327
19359
  case GGML_OPT_ADAM:
18328
19360
  {
18329
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
19361
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
18330
19362
  } break;
18331
19363
  case GGML_OPT_LBFGS:
18332
19364
  {
18333
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
19365
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
18334
19366
  } break;
18335
19367
  }
18336
19368
 
@@ -18561,64 +19593,1164 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18561
19593
 
18562
19594
  ////////////////////////////////////////////////////////////////////////////////
18563
19595
 
18564
- int ggml_cpu_has_avx(void) {
18565
- #if defined(__AVX__)
18566
- return 1;
18567
- #else
18568
- return 0;
18569
- #endif
18570
- }
19596
+ struct gguf_str {
19597
+ uint64_t n; // GGUFv2
19598
+ char * data;
19599
+ };
18571
19600
 
18572
- int ggml_cpu_has_avx2(void) {
18573
- #if defined(__AVX2__)
18574
- return 1;
18575
- #else
18576
- return 0;
18577
- #endif
18578
- }
19601
+ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
19602
+ [GGUF_TYPE_UINT8] = sizeof(uint8_t),
19603
+ [GGUF_TYPE_INT8] = sizeof(int8_t),
19604
+ [GGUF_TYPE_UINT16] = sizeof(uint16_t),
19605
+ [GGUF_TYPE_INT16] = sizeof(int16_t),
19606
+ [GGUF_TYPE_UINT32] = sizeof(uint32_t),
19607
+ [GGUF_TYPE_INT32] = sizeof(int32_t),
19608
+ [GGUF_TYPE_FLOAT32] = sizeof(float),
19609
+ [GGUF_TYPE_BOOL] = sizeof(bool),
19610
+ [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
19611
+ [GGUF_TYPE_UINT64] = sizeof(uint64_t),
19612
+ [GGUF_TYPE_INT64] = sizeof(int64_t),
19613
+ [GGUF_TYPE_FLOAT64] = sizeof(double),
19614
+ [GGUF_TYPE_ARRAY] = 0, // undefined
19615
+ };
19616
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19617
+
19618
+ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19619
+ [GGUF_TYPE_UINT8] = "u8",
19620
+ [GGUF_TYPE_INT8] = "i8",
19621
+ [GGUF_TYPE_UINT16] = "u16",
19622
+ [GGUF_TYPE_INT16] = "i16",
19623
+ [GGUF_TYPE_UINT32] = "u32",
19624
+ [GGUF_TYPE_INT32] = "i32",
19625
+ [GGUF_TYPE_FLOAT32] = "f32",
19626
+ [GGUF_TYPE_BOOL] = "bool",
19627
+ [GGUF_TYPE_STRING] = "str",
19628
+ [GGUF_TYPE_ARRAY] = "arr",
19629
+ [GGUF_TYPE_UINT64] = "u64",
19630
+ [GGUF_TYPE_INT64] = "i64",
19631
+ [GGUF_TYPE_FLOAT64] = "f64",
19632
+ };
19633
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19634
+
19635
+ union gguf_value {
19636
+ uint8_t uint8;
19637
+ int8_t int8;
19638
+ uint16_t uint16;
19639
+ int16_t int16;
19640
+ uint32_t uint32;
19641
+ int32_t int32;
19642
+ float float32;
19643
+ uint64_t uint64;
19644
+ int64_t int64;
19645
+ double float64;
19646
+ bool bool_;
19647
+
19648
+ struct gguf_str str;
19649
+
19650
+ struct {
19651
+ enum gguf_type type;
19652
+
19653
+ uint64_t n; // GGUFv2
19654
+ void * data;
19655
+ } arr;
19656
+ };
18579
19657
 
18580
- int ggml_cpu_has_avx512(void) {
18581
- #if defined(__AVX512F__)
18582
- return 1;
18583
- #else
18584
- return 0;
18585
- #endif
18586
- }
19658
+ struct gguf_kv {
19659
+ struct gguf_str key;
18587
19660
 
18588
- int ggml_cpu_has_avx512_vbmi(void) {
18589
- #if defined(__AVX512VBMI__)
18590
- return 1;
18591
- #else
18592
- return 0;
18593
- #endif
18594
- }
19661
+ enum gguf_type type;
19662
+ union gguf_value value;
19663
+ };
18595
19664
 
18596
- int ggml_cpu_has_avx512_vnni(void) {
18597
- #if defined(__AVX512VNNI__)
18598
- return 1;
18599
- #else
18600
- return 0;
18601
- #endif
18602
- }
19665
+ struct gguf_header {
19666
+ uint32_t magic;
19667
+ uint32_t version;
19668
+ uint64_t n_tensors; // GGUFv2
19669
+ uint64_t n_kv; // GGUFv2
19670
+ };
18603
19671
 
18604
- int ggml_cpu_has_fma(void) {
18605
- #if defined(__FMA__)
18606
- return 1;
18607
- #else
18608
- return 0;
18609
- #endif
18610
- }
19672
+ struct gguf_tensor_info {
19673
+ struct gguf_str name;
18611
19674
 
18612
- int ggml_cpu_has_neon(void) {
18613
- #if defined(__ARM_NEON)
18614
- return 1;
18615
- #else
18616
- return 0;
18617
- #endif
18618
- }
19675
+ uint32_t n_dims;
19676
+ uint64_t ne[GGML_MAX_DIMS];
18619
19677
 
18620
- int ggml_cpu_has_arm_fma(void) {
18621
- #if defined(__ARM_FEATURE_FMA)
19678
+ enum ggml_type type;
19679
+
19680
+ uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
19681
+
19682
+ // for writing API
19683
+ const void * data;
19684
+ size_t size;
19685
+ };
19686
+
19687
+ struct gguf_context {
19688
+ struct gguf_header header;
19689
+
19690
+ struct gguf_kv * kv;
19691
+ struct gguf_tensor_info * infos;
19692
+
19693
+ size_t alignment;
19694
+ size_t offset; // offset of `data` from beginning of file
19695
+ size_t size; // size of `data` in bytes
19696
+
19697
+ //uint8_t * padding;
19698
+ void * data;
19699
+ };
19700
+
19701
+ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
19702
+ const size_t n = fread(dst, 1, size, file);
19703
+ *offset += n;
19704
+ return n == size;
19705
+ }
19706
+
19707
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19708
+ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
19709
+ p->n = 0;
19710
+ p->data = NULL;
19711
+
19712
+ bool ok = true;
19713
+
19714
+ ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19715
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19716
+
19717
+ return ok;
19718
+ }
19719
+
19720
+ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
19721
+ p->n = 0;
19722
+ p->data = NULL;
19723
+
19724
+ bool ok = true;
19725
+
19726
+ uint32_t n = 0;
19727
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
19728
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19729
+
19730
+ return ok;
19731
+ }
19732
+
19733
+ struct gguf_context * gguf_init_empty(void) {
19734
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19735
+
19736
+ ctx->header.magic = GGUF_MAGIC;
19737
+ ctx->header.version = GGUF_VERSION;
19738
+ ctx->header.n_tensors = 0;
19739
+ ctx->header.n_kv = 0;
19740
+
19741
+ ctx->kv = NULL;
19742
+ ctx->infos = NULL;
19743
+
19744
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19745
+ ctx->offset = 0;
19746
+ ctx->size = 0;
19747
+
19748
+ ctx->data = NULL;
19749
+
19750
+ return ctx;
19751
+ }
19752
+
19753
+ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
19754
+ FILE * file = fopen(fname, "rb");
19755
+ if (!file) {
19756
+ return NULL;
19757
+ }
19758
+
19759
+ // offset from start of file
19760
+ size_t offset = 0;
19761
+
19762
+ uint32_t magic = 0;
19763
+
19764
+ // check the magic before making allocations
19765
+ {
19766
+ gguf_fread_el(file, &magic, sizeof(magic), &offset);
19767
+
19768
+ if (magic != GGUF_MAGIC) {
19769
+ fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
19770
+ fclose(file);
19771
+ return NULL;
19772
+ }
19773
+ }
19774
+
19775
+ bool ok = true;
19776
+
19777
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19778
+
19779
+ // read the header
19780
+ {
19781
+ ctx->header.magic = magic;
19782
+
19783
+ ctx->kv = NULL;
19784
+ ctx->infos = NULL;
19785
+ ctx->data = NULL;
19786
+
19787
+ ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
19788
+
19789
+ if (ctx->header.version == 1) {
19790
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19791
+ uint32_t n_tensors = 0;
19792
+ uint32_t n_kv = 0;
19793
+
19794
+ ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
19795
+ ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
19796
+
19797
+ ctx->header.n_tensors = n_tensors;
19798
+ ctx->header.n_kv = n_kv;
19799
+ } else {
19800
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19801
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19802
+ }
19803
+
19804
+ if (!ok) {
19805
+ fprintf(stderr, "%s: failed to read header\n", __func__);
19806
+ fclose(file);
19807
+ gguf_free(ctx);
19808
+ return NULL;
19809
+ }
19810
+ }
19811
+
19812
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19813
+ bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
19814
+ if (ctx->header.version == 1) {
19815
+ gguf_fread_str = gguf_fread_str_v1;
19816
+ }
19817
+
19818
+ // read the kv pairs
19819
+ {
19820
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19821
+
19822
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19823
+ struct gguf_kv * kv = &ctx->kv[i];
19824
+
19825
+ //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
19826
+
19827
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
19828
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19829
+
19830
+ //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
19831
+
19832
+ switch (kv->type) {
19833
+ case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
19834
+ case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
19835
+ case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
19836
+ case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
19837
+ case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
19838
+ case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
19839
+ case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
19840
+ case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
19841
+ case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
19842
+ case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
19843
+ case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
19844
+ case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
19845
+ case GGUF_TYPE_ARRAY:
19846
+ {
19847
+ ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19848
+
19849
+ if (ctx->header.version == 1) {
19850
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19851
+ uint32_t n = 0;
19852
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
19853
+ kv->value.arr.n = n;
19854
+ } else {
19855
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19856
+ }
19857
+
19858
+ switch (kv->value.arr.type) {
19859
+ case GGUF_TYPE_UINT8:
19860
+ case GGUF_TYPE_INT8:
19861
+ case GGUF_TYPE_UINT16:
19862
+ case GGUF_TYPE_INT16:
19863
+ case GGUF_TYPE_UINT32:
19864
+ case GGUF_TYPE_INT32:
19865
+ case GGUF_TYPE_FLOAT32:
19866
+ case GGUF_TYPE_UINT64:
19867
+ case GGUF_TYPE_INT64:
19868
+ case GGUF_TYPE_FLOAT64:
19869
+ case GGUF_TYPE_BOOL:
19870
+ {
19871
+ kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
19872
+ ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
19873
+ } break;
19874
+ case GGUF_TYPE_STRING:
19875
+ {
19876
+ kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
19877
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
19878
+ ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
19879
+ }
19880
+ } break;
19881
+ case GGUF_TYPE_ARRAY:
19882
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
19883
+ };
19884
+ } break;
19885
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
19886
+ };
19887
+
19888
+ if (!ok) {
19889
+ break;
19890
+ }
19891
+ }
19892
+
19893
+ if (!ok) {
19894
+ fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
19895
+ fclose(file);
19896
+ gguf_free(ctx);
19897
+ return NULL;
19898
+ }
19899
+ }
19900
+
19901
+ // read the tensor infos
19902
+ {
19903
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19904
+
19905
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19906
+ struct gguf_tensor_info * info = &ctx->infos[i];
19907
+
19908
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
19909
+ info->ne[j] = 1;
19910
+ }
19911
+
19912
+ ok = ok && gguf_fread_str(file, &info->name, &offset);
19913
+ ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19914
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
19915
+ if (ctx->header.version == 1) {
19916
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19917
+ uint32_t t = 0;
19918
+ ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
19919
+ info->ne[j] = t;
19920
+ } else {
19921
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19922
+ }
19923
+ }
19924
+ ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19925
+ ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
19926
+
19927
+ if (!ok) {
19928
+ fprintf(stderr, "%s: failed to read tensor info\n", __func__);
19929
+ fclose(file);
19930
+ gguf_free(ctx);
19931
+ return NULL;
19932
+ }
19933
+ }
19934
+ }
19935
+
19936
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19937
+
19938
+ int alignment_idx = gguf_find_key(ctx, "general.alignment");
19939
+ if (alignment_idx != -1) {
19940
+ ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
19941
+ }
19942
+
19943
+ // we require the data section to be aligned, so take into account any padding
19944
+ {
19945
+ const size_t offset_pad = offset % ctx->alignment;
19946
+
19947
+ if (offset_pad != 0) {
19948
+ offset += ctx->alignment - offset_pad;
19949
+ fseek(file, offset, SEEK_SET);
19950
+ }
19951
+ }
19952
+
19953
+ // store the current file offset - this is where the data section starts
19954
+ ctx->offset = offset;
19955
+
19956
+ // compute the total size of the data section, taking into account the alignment
19957
+ {
19958
+ ctx->size = 0;
19959
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19960
+ struct gguf_tensor_info * info = &ctx->infos[i];
19961
+
19962
+ const int64_t ne =
19963
+ (int64_t) info->ne[0] *
19964
+ (int64_t) info->ne[1] *
19965
+ (int64_t) info->ne[2] *
19966
+ (int64_t) info->ne[3];
19967
+
19968
+ if (ne % ggml_blck_size(info->type) != 0) {
19969
+ fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
19970
+ __func__, info->name.data, ne, ggml_blck_size(info->type));
19971
+ fclose(file);
19972
+ gguf_free(ctx);
19973
+ return NULL;
19974
+ }
19975
+
19976
+ const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
19977
+
19978
+ ctx->size += GGML_PAD(size_cur, ctx->alignment);
19979
+ }
19980
+ }
19981
+
19982
+ // load the tensor data only if requested
19983
+ if (params.ctx != NULL) {
19984
+ // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
19985
+ // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
19986
+ // the ggml_tensor structs to the appropriate locations in the binary blob
19987
+
19988
+ // compute the exact size needed for the new ggml_context
19989
+ const size_t mem_size =
19990
+ params.no_alloc ?
19991
+ (ctx->header.n_tensors )*ggml_tensor_overhead() :
19992
+ (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
19993
+
19994
+ struct ggml_init_params pdata = {
19995
+ .mem_size = mem_size,
19996
+ .mem_buffer = NULL,
19997
+ .no_alloc = params.no_alloc,
19998
+ };
19999
+
20000
+ *params.ctx = ggml_init(pdata);
20001
+
20002
+ struct ggml_context * ctx_data = *params.ctx;
20003
+
20004
+ struct ggml_tensor * data = NULL;
20005
+
20006
+ if (params.no_alloc == false) {
20007
+ data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
20008
+
20009
+ ok = ok && data != NULL;
20010
+
20011
+ // read the binary blob with the tensor data
20012
+ ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
20013
+
20014
+ if (!ok) {
20015
+ fprintf(stderr, "%s: failed to read tensor data\n", __func__);
20016
+ fclose(file);
20017
+ ggml_free(ctx_data);
20018
+ gguf_free(ctx);
20019
+ return NULL;
20020
+ }
20021
+
20022
+ ctx->data = data->data;
20023
+ }
20024
+
20025
+ ggml_set_no_alloc(ctx_data, true);
20026
+
20027
+ // create the tensors
20028
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20029
+ const int64_t ne[GGML_MAX_DIMS] = {
20030
+ ctx->infos[i].ne[0],
20031
+ ctx->infos[i].ne[1],
20032
+ ctx->infos[i].ne[2],
20033
+ ctx->infos[i].ne[3],
20034
+ };
20035
+
20036
+ struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
20037
+
20038
+ ok = ok && cur != NULL;
20039
+
20040
+ ggml_set_name(cur, ctx->infos[i].name.data);
20041
+
20042
+ if (!ok) {
20043
+ break;
20044
+ }
20045
+
20046
+ // point the data member to the appropriate location in the binary blob using the tensor infos
20047
+ if (params.no_alloc == false) {
20048
+ //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
20049
+ cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
20050
+ }
20051
+ }
20052
+
20053
+ if (!ok) {
20054
+ fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
20055
+ fclose(file);
20056
+ ggml_free(ctx_data);
20057
+ gguf_free(ctx);
20058
+ return NULL;
20059
+ }
20060
+
20061
+ ggml_set_no_alloc(ctx_data, params.no_alloc);
20062
+ }
20063
+
20064
+ fclose(file);
20065
+
20066
+ return ctx;
20067
+ }
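
A hedged usage sketch of the reader introduced above, combining gguf_init_from_file with the accessors defined further down; the file name is a placeholder and error handling is minimal:

#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_context * ctx_data = NULL;
    struct gguf_init_params params = { .no_alloc = false, .ctx = &ctx_data };

    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load gguf file\n");
        return 1;
    }

    printf("version: %d, tensors: %d, kv pairs: %d\n",
           gguf_get_version(ctx), gguf_get_n_tensors(ctx), gguf_get_n_kv(ctx));

    const int idx = gguf_find_key(ctx, "general.alignment");
    if (idx >= 0) {
        printf("alignment: %u\n", gguf_get_val_u32(ctx, idx));
    }

    ggml_free(ctx_data);
    gguf_free(ctx);
    return 0;
}
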
20068
+
20069
+ void gguf_free(struct gguf_context * ctx) {
20070
+ if (ctx == NULL) {
20071
+ return;
20072
+ }
20073
+
20074
+ if (ctx->kv) {
20075
+ // free string memory - not great..
20076
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
20077
+ struct gguf_kv * kv = &ctx->kv[i];
20078
+
20079
+ if (kv->key.data) {
20080
+ free(kv->key.data);
20081
+ }
20082
+
20083
+ if (kv->type == GGUF_TYPE_STRING) {
20084
+ if (kv->value.str.data) {
20085
+ free(kv->value.str.data);
20086
+ }
20087
+ }
20088
+
20089
+ if (kv->type == GGUF_TYPE_ARRAY) {
20090
+ if (kv->value.arr.data) {
20091
+ if (kv->value.arr.type == GGUF_TYPE_STRING) {
20092
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
20093
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
20094
+ if (str->data) {
20095
+ free(str->data);
20096
+ }
20097
+ }
20098
+ }
20099
+ free(kv->value.arr.data);
20100
+ }
20101
+ }
20102
+ }
20103
+
20104
+ free(ctx->kv);
20105
+ }
20106
+
20107
+ if (ctx->infos) {
20108
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20109
+ struct gguf_tensor_info * info = &ctx->infos[i];
20110
+
20111
+ if (info->name.data) {
20112
+ free(info->name.data);
20113
+ }
20114
+ }
20115
+
20116
+ free(ctx->infos);
20117
+ }
20118
+
20119
+ GGML_ALIGNED_FREE(ctx);
20120
+ }
20121
+
20122
+ const char * gguf_type_name(enum gguf_type type) {
20123
+ return GGUF_TYPE_NAME[type];
20124
+ }
20125
+
20126
+ int gguf_get_version(struct gguf_context * ctx) {
20127
+ return ctx->header.version;
20128
+ }
20129
+
20130
+ size_t gguf_get_alignment(struct gguf_context * ctx) {
20131
+ return ctx->alignment;
20132
+ }
20133
+
20134
+ size_t gguf_get_data_offset(struct gguf_context * ctx) {
20135
+ return ctx->offset;
20136
+ }
20137
+
20138
+ void * gguf_get_data(struct gguf_context * ctx) {
20139
+ return ctx->data;
20140
+ }
20141
+
20142
+ int gguf_get_n_kv(struct gguf_context * ctx) {
20143
+ return ctx->header.n_kv;
20144
+ }
20145
+
20146
+ int gguf_find_key(struct gguf_context * ctx, const char * key) {
20147
+ // return -1 if key not found
20148
+ int keyfound = -1;
20149
+
20150
+ const int n_kv = gguf_get_n_kv(ctx);
20151
+
20152
+ for (int i = 0; i < n_kv; ++i) {
20153
+ if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
20154
+ keyfound = i;
20155
+ break;
20156
+ }
20157
+ }
20158
+
20159
+ return keyfound;
20160
+ }
20161
+
20162
+ const char * gguf_get_key(struct gguf_context * ctx, int i) {
20163
+ return ctx->kv[i].key.data;
20164
+ }
20165
+
20166
+ enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
20167
+ return ctx->kv[i].type;
20168
+ }
20169
+
20170
+ enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
20171
+ return ctx->kv[i].value.arr.type;
20172
+ }
20173
+
20174
+ const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
20175
+ return ctx->kv[i].value.arr.data;
20176
+ }
20177
+
20178
+ const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
20179
+ struct gguf_kv * kv = &ctx->kv[key_id];
20180
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
20181
+ return str->data;
20182
+ }
20183
+
20184
+ int gguf_get_arr_n(struct gguf_context * ctx, int i) {
20185
+ return ctx->kv[i].value.arr.n;
20186
+ }
20187
+
20188
+ uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
20189
+ return ctx->kv[i].value.uint8;
20190
+ }
20191
+
20192
+ int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
20193
+ return ctx->kv[i].value.int8;
20194
+ }
20195
+
20196
+ uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
20197
+ return ctx->kv[i].value.uint16;
20198
+ }
20199
+
20200
+ int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
20201
+ return ctx->kv[i].value.int16;
20202
+ }
20203
+
20204
+ uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
20205
+ return ctx->kv[i].value.uint32;
20206
+ }
20207
+
20208
+ int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
20209
+ return ctx->kv[i].value.int32;
20210
+ }
20211
+
20212
+ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
20213
+ return ctx->kv[i].value.float32;
20214
+ }
20215
+
20216
+ uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
20217
+ return ctx->kv[i].value.uint64;
20218
+ }
20219
+
20220
+ int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
20221
+ return ctx->kv[i].value.int64;
20222
+ }
20223
+
20224
+ double gguf_get_val_f64(struct gguf_context * ctx, int i) {
20225
+ return ctx->kv[i].value.float64;
20226
+ }
20227
+
20228
+ bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
20229
+ return ctx->kv[i].value.bool_;
20230
+ }
20231
+
20232
+ const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
20233
+ return ctx->kv[i].value.str.data;
20234
+ }
20235
+
20236
+ int gguf_get_n_tensors(struct gguf_context * ctx) {
20237
+ return ctx->header.n_tensors;
20238
+ }
20239
+
20240
+ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
20241
+ // return -1 if tensor not found
20242
+ int tensorfound = -1;
20243
+
20244
+ const int n_tensors = gguf_get_n_tensors(ctx);
20245
+
20246
+ for (int i = 0; i < n_tensors; ++i) {
20247
+ if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
20248
+ tensorfound = i;
20249
+ break;
20250
+ }
20251
+ }
20252
+
20253
+ return tensorfound;
20254
+ }
20255
+
20256
+ size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
20257
+ return ctx->infos[i].offset;
20258
+ }
20259
+
20260
+ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
20261
+ return ctx->infos[i].name.data;
20262
+ }
20263
+
20264
+ // returns the index
20265
+ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
20266
+ const int idx = gguf_find_key(ctx, key);
20267
+ if (idx >= 0) {
20268
+ return idx;
20269
+ }
20270
+
20271
+ const int n_kv = gguf_get_n_kv(ctx);
20272
+
20273
+ ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
20274
+ ctx->kv[n_kv].key.n = strlen(key);
20275
+ ctx->kv[n_kv].key.data = strdup(key);
20276
+ ctx->header.n_kv++;
20277
+
20278
+ return n_kv;
20279
+ }
20280
+
20281
+ void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
20282
+ const int idx = gguf_get_or_add_key(ctx, key);
20283
+
20284
+ ctx->kv[idx].type = GGUF_TYPE_UINT8;
20285
+ ctx->kv[idx].value.uint8 = val;
20286
+ }
20287
+
20288
+ void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
20289
+ const int idx = gguf_get_or_add_key(ctx, key);
20290
+
20291
+ ctx->kv[idx].type = GGUF_TYPE_INT8;
20292
+ ctx->kv[idx].value.int8 = val;
20293
+ }
20294
+
20295
+ void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
20296
+ const int idx = gguf_get_or_add_key(ctx, key);
20297
+
20298
+ ctx->kv[idx].type = GGUF_TYPE_UINT16;
20299
+ ctx->kv[idx].value.uint16 = val;
20300
+ }
20301
+
20302
+ void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
20303
+ const int idx = gguf_get_or_add_key(ctx, key);
20304
+
20305
+ ctx->kv[idx].type = GGUF_TYPE_INT16;
20306
+ ctx->kv[idx].value.int16 = val;
20307
+ }
20308
+
20309
+ void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
20310
+ const int idx = gguf_get_or_add_key(ctx, key);
20311
+
20312
+ ctx->kv[idx].type = GGUF_TYPE_UINT32;
20313
+ ctx->kv[idx].value.uint32 = val;
20314
+ }
20315
+
20316
+ void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
20317
+ const int idx = gguf_get_or_add_key(ctx, key);
20318
+
20319
+ ctx->kv[idx].type = GGUF_TYPE_INT32;
20320
+ ctx->kv[idx].value.int32 = val;
20321
+ }
20322
+
20323
+ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
20324
+ const int idx = gguf_get_or_add_key(ctx, key);
20325
+
20326
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
20327
+ ctx->kv[idx].value.float32 = val;
20328
+ }
20329
+
20330
+ void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
20331
+ const int idx = gguf_get_or_add_key(ctx, key);
20332
+
20333
+ ctx->kv[idx].type = GGUF_TYPE_UINT64;
20334
+ ctx->kv[idx].value.uint64 = val;
20335
+ }
20336
+
20337
+ void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
20338
+ const int idx = gguf_get_or_add_key(ctx, key);
20339
+
20340
+ ctx->kv[idx].type = GGUF_TYPE_INT64;
20341
+ ctx->kv[idx].value.int64 = val;
20342
+ }
20343
+
20344
+ void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
20345
+ const int idx = gguf_get_or_add_key(ctx, key);
20346
+
20347
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
20348
+ ctx->kv[idx].value.float64 = val;
20349
+ }
20350
+
20351
+ void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
20352
+ const int idx = gguf_get_or_add_key(ctx, key);
20353
+
20354
+ ctx->kv[idx].type = GGUF_TYPE_BOOL;
20355
+ ctx->kv[idx].value.bool_ = val;
20356
+ }
20357
+
20358
+ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
20359
+ const int idx = gguf_get_or_add_key(ctx, key);
20360
+
20361
+ ctx->kv[idx].type = GGUF_TYPE_STRING;
20362
+ ctx->kv[idx].value.str.n = strlen(val);
20363
+ ctx->kv[idx].value.str.data = strdup(val);
20364
+ }
20365
+
20366
+ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
20367
+ const int idx = gguf_get_or_add_key(ctx, key);
20368
+
20369
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20370
+ ctx->kv[idx].value.arr.type = type;
20371
+ ctx->kv[idx].value.arr.n = n;
20372
+ ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
20373
+ memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
20374
+ }
20375
+
20376
+ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
20377
+ const int idx = gguf_get_or_add_key(ctx, key);
20378
+
20379
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20380
+ ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
20381
+ ctx->kv[idx].value.arr.n = n;
20382
+ ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20383
+ for (int i = 0; i < n; i++) {
20384
+ struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
20385
+ str->n = strlen(data[i]);
20386
+ str->data = strdup(data[i]);
20387
+ }
20388
+ }
20389
+
20390
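The setters above only stage values in memory on the gguf_context; nothing touches disk until gguf_write_to_file (further below) serializes the context. Because gguf_get_or_add_key reuses an existing slot, calling a setter twice with the same key overwrites the value in place. A minimal sketch of how this key/value API might be used, assuming a context created with gguf_init_empty() from the same public gguf API; the key names and values are illustrative only:

    // assumes #include "ggml.h"
    // stage a few metadata keys on an empty context (key names are illustrative)
    struct gguf_context * gctx = gguf_init_empty();

    gguf_set_val_str (gctx, "general.architecture", "llama");
    gguf_set_val_u32 (gctx, "llama.context_length", 4096);
    gguf_set_val_f32 (gctx, "llama.rope.freq_base", 10000.0f);
    gguf_set_val_bool(gctx, "example.flag", true);

    // arrays: raw element data for numeric types, a char** for strings
    const int32_t layers[] = { 7, 11, 13 };
    gguf_set_arr_data(gctx, "example.layers", GGUF_TYPE_INT32, layers, 3);

    const char * langs[] = { "en", "de" };
    gguf_set_arr_str (gctx, "example.languages", langs, 2);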
+ // set or add KV pairs from another context
20391
+ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
20392
+ for (uint32_t i = 0; i < src->header.n_kv; i++) {
20393
+ switch (src->kv[i].type) {
20394
+ case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
20395
+ case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
20396
+ case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
20397
+ case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
20398
+ case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
20399
+ case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
20400
+ case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
20401
+ case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
20402
+ case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
20403
+ case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
20404
+ case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
20405
+ case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
20406
+ case GGUF_TYPE_ARRAY:
20407
+ {
20408
+ if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
20409
+ const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
20410
+ for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
20411
+ data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
20412
+ }
20413
+ gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
20414
+ free(data);
20415
+ } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
20416
+ GGML_ASSERT(false && "nested arrays not supported");
20417
+ } else {
20418
+ gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
20419
+ }
20420
+ } break;
20421
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20422
+ }
20423
+ }
20424
+ }
20425
+
20426
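gguf_set_kv deep-copies every pair from the source context (strdup for strings, malloc + memcpy for array payloads), so the destination never aliases the source. A hedged sketch of cloning the metadata of an existing file into a fresh context, assuming gguf_init_from_file, gguf_init_empty and gguf_free from the same public gguf API; the file name is hypothetical:

    // assumes #include "ggml.h"
    struct gguf_init_params iparams = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * src = gguf_init_from_file("model.gguf", iparams);
    struct gguf_context * dst = gguf_init_empty();

    gguf_set_kv(dst, src);                                    // deep-copies every KV pair, arrays included
    gguf_set_val_str(dst, "general.note", "metadata copied"); // then add or override as needed

    gguf_free(src);
    gguf_free(dst);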
+ void gguf_add_tensor(
20427
+ struct gguf_context * ctx,
20428
+ const struct ggml_tensor * tensor) {
20429
+ const int idx = ctx->header.n_tensors;
20430
+ ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
20431
+
20432
+ ctx->infos[idx].name.n = strlen(tensor->name);
20433
+ ctx->infos[idx].name.data = strdup(tensor->name);
20434
+
20435
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
20436
+ ctx->infos[idx].ne[i] = 1;
20437
+ }
20438
+
20439
+ ctx->infos[idx].n_dims = tensor->n_dims;
20440
+ for (int i = 0; i < tensor->n_dims; i++) {
20441
+ ctx->infos[idx].ne[i] = tensor->ne[i];
20442
+ }
20443
+
20444
+ ctx->infos[idx].type = tensor->type;
20445
+ ctx->infos[idx].offset = 0;
20446
+ ctx->infos[idx].data = tensor->data;
20447
+ ctx->infos[idx].size = ggml_nbytes(tensor);
20448
+
20449
+ if (ctx->header.n_tensors > 0) {
20450
+ ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
20451
+ }
20452
+
20453
+ ctx->header.n_tensors++;
20454
+ }
20455
+
20456
+ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
20457
+ const int idx = gguf_find_tensor(ctx, name);
20458
+ if (idx < 0) {
20459
+ GGML_ASSERT(false && "tensor not found");
20460
+ }
20461
+
20462
+ ctx->infos[idx].type = type;
20463
+ }
20464
+
20465
+ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
20466
+ const int idx = gguf_find_tensor(ctx, name);
20467
+ if (idx < 0) {
20468
+ GGML_ASSERT(false && "tensor not found");
20469
+ }
20470
+
20471
+ ctx->infos[idx].data = data;
20472
+ ctx->infos[idx].size = size;
20473
+
20474
+ // update offsets
20475
+ for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
20476
+ ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
20477
+ }
20478
+ }
20479
+
20480
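gguf_add_tensor records the tensor's name, shape, type, data pointer and size, and derives its file offset from the previous entry's padded size, so tensors are laid out in the order they are added. A short sketch, continuing the hypothetical gctx from the earlier snippet and assuming tensors come from a ggml_context via the standard ggml API (ggml_init, ggml_new_tensor_1d, ggml_set_name); the tensor name is illustrative:

    // assumes #include "ggml.h"
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ggml = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_1d(ggml, GGML_TYPE_F32, 256);
    ggml_set_name(w, "output.weight");

    gguf_add_tensor(gctx, w);   // records the name/shape/type plus w->data and ggml_nbytes(w)

    // optionally replace the staged data later, e.g. with a quantized copy
    // (gguf_set_tensor_data recomputes the offsets of the tensors that follow):
    // gguf_set_tensor_type(gctx, "output.weight", GGML_TYPE_Q8_0);
    // gguf_set_tensor_data(gctx, "output.weight", q8_data, q8_size);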
+ //static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
20481
+ // fwrite(&val->n, sizeof(val->n), 1, file);
20482
+ // fwrite(val->data, sizeof(char), val->n, file);
20483
+ //}
20484
+ //
20485
+ //static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
20486
+ // fwrite(val, sizeof(char), size, file);
20487
+ //}
20488
+
20489
+ struct gguf_buf {
20490
+ void * data;
20491
+ size_t size;
20492
+ size_t offset;
20493
+ };
20494
+
20495
+ static struct gguf_buf gguf_buf_init(size_t size) {
20496
+ struct gguf_buf buf = {
20497
+ /*buf.data =*/ size == 0 ? NULL : malloc(size),
20498
+ /*buf.size =*/ size,
20499
+ /*buf.offset =*/ 0,
20500
+ };
20501
+
20502
+ return buf;
20503
+ }
20504
+
20505
+ static void gguf_buf_free(struct gguf_buf buf) {
20506
+ if (buf.data) {
20507
+ free(buf.data);
20508
+ }
20509
+ }
20510
+
20511
+ static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
20512
+ if (buf->offset + size > buf->size) {
20513
+ buf->size = 1.5*(buf->offset + size);
20514
+ if (buf->data) {
20515
+ buf->data = realloc(buf->data, buf->size);
20516
+ }
20517
+ }
20518
+ }
20519
+
20520
+ static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
20521
+ gguf_buf_grow(buf, sizeof(val->n) + val->n);
20522
+
20523
+ if (buf->data) {
20524
+ memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
20525
+ }
20526
+ buf->offset += sizeof(val->n);
20527
+
20528
+ if (buf->data) {
20529
+ memcpy((char *) buf->data + buf->offset, val->data, val->n);
20530
+ }
20531
+ buf->offset += val->n;
20532
+ }
20533
+
20534
+ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
20535
+ gguf_buf_grow(buf, el_size);
20536
+
20537
+ if (buf->data) {
20538
+ memcpy((char *) buf->data + buf->offset, val, el_size);
20539
+ }
20540
+ buf->offset += el_size;
20541
+ }
20542
+
20543
+ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
20544
+ // write header
20545
+ gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
20546
+ gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
20547
+ gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
20548
+ gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
20549
+
20550
+ // write key-value pairs
20551
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
20552
+ struct gguf_kv * kv = &ctx->kv[i];
20553
+
20554
+ gguf_bwrite_str(buf, &kv->key);
20555
+ gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
20556
+
20557
+ switch (kv->type) {
20558
+ case GGUF_TYPE_UINT8: gguf_bwrite_el (buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
20559
+ case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
20560
+ case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
20561
+ case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
20562
+ case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
20563
+ case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
20564
+ case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
20565
+ case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
20566
+ case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
20567
+ case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
20568
+ case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
20569
+ case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
20570
+ case GGUF_TYPE_ARRAY:
20571
+ {
20572
+ gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
20573
+ gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
20574
+
20575
+ switch (kv->value.arr.type) {
20576
+ case GGUF_TYPE_UINT8:
20577
+ case GGUF_TYPE_INT8:
20578
+ case GGUF_TYPE_UINT16:
20579
+ case GGUF_TYPE_INT16:
20580
+ case GGUF_TYPE_UINT32:
20581
+ case GGUF_TYPE_INT32:
20582
+ case GGUF_TYPE_FLOAT32:
20583
+ case GGUF_TYPE_UINT64:
20584
+ case GGUF_TYPE_INT64:
20585
+ case GGUF_TYPE_FLOAT64:
20586
+ case GGUF_TYPE_BOOL:
20587
+ {
20588
+ gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
20589
+ } break;
20590
+ case GGUF_TYPE_STRING:
20591
+ {
20592
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
20593
+ gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
20594
+ }
20595
+ } break;
20596
+ case GGUF_TYPE_ARRAY:
20597
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20598
+ };
20599
+ } break;
20600
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
20601
+ };
20602
+ }
20603
+
20604
+ // write tensor infos
20605
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20606
+ struct gguf_tensor_info * info = &ctx->infos[i];
20607
+
20608
+ gguf_bwrite_str(buf, &info->name);
20609
+ gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
20610
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
20611
+ gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
20612
+ }
20613
+ gguf_bwrite_el(buf, &info->type, sizeof(info->type));
20614
+ gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
20615
+ }
20616
+
20617
+ // we require the data section to be aligned, so take into account any padding
20618
+ {
20619
+ const size_t offset = buf->offset;
20620
+ const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
20621
+
20622
+ if (offset_pad != offset) {
20623
+ uint8_t pad = 0;
20624
+ for (size_t i = 0; i < offset_pad - offset; ++i) {
20625
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
20626
+ }
20627
+ }
20628
+ }
20629
+
20630
+ if (only_meta) {
20631
+ return;
20632
+ }
20633
+
20634
+ size_t offset = 0;
20635
+
20636
+ // write tensor data
20637
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20638
+ struct gguf_tensor_info * info = &ctx->infos[i];
20639
+
20640
+ const size_t size = info->size;
20641
+ const size_t size_pad = GGML_PAD(size, ctx->alignment);
20642
+
20643
+ gguf_bwrite_el(buf, info->data, size);
20644
+
20645
+ if (size_pad != size) {
20646
+ uint8_t pad = 0;
20647
+ for (size_t j = 0; j < size_pad - size; ++j) {
20648
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
20649
+ }
20650
+ }
20651
+
20652
+ GGML_ASSERT(offset == info->offset);
20653
+
20654
+ offset += size_pad;
20655
+ }
20656
+ }
20657
+
20658
+ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
20659
+ FILE * file = fopen(fname, "wb");
20660
+ if (!file) {
20661
+ GGML_ASSERT(false && "failed to open file for writing");
20662
+ }
20663
+
20664
+ struct gguf_buf buf = gguf_buf_init(16*1024);
20665
+
20666
+ gguf_write_to_buf(ctx, &buf, only_meta);
20667
+
20668
+ fwrite(buf.data, 1, buf.offset, file);
20669
+
20670
+ gguf_buf_free(buf);
20671
+
20672
+ fclose(file);
20673
+ }
20674
+
20675
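gguf_write_to_file assembles the entire file image in memory first (gguf_write_to_buf into a growable gguf_buf) and only then emits it with a single fwrite, so the writer temporarily needs roughly as much RAM as the output file when only_meta is false. Finishing the hypothetical sketch from above; output file names are illustrative:

    // serialize everything staged on `gctx` to disk
    gguf_write_to_file(gctx, "out.gguf", /*only_meta =*/ false);

    // only_meta = true writes just the header, KV pairs and tensor infos (no tensor data)
    gguf_write_to_file(gctx, "out.meta.gguf", /*only_meta =*/ true);

    gguf_free(gctx);
    ggml_free(ggml);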
+ size_t gguf_get_meta_size(struct gguf_context * ctx) {
20676
+ // no allocs - only compute size
20677
+ struct gguf_buf buf = gguf_buf_init(0);
20678
+
20679
+ gguf_write_to_buf(ctx, &buf, true);
20680
+
20681
+ return buf.offset;
20682
+ }
20683
+
20684
+ void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
20685
+ struct gguf_buf buf = gguf_buf_init(16*1024);
20686
+
20687
+ gguf_write_to_buf(ctx, &buf, true);
20688
+
20689
+ memcpy(data, buf.data, buf.offset);
20690
+
20691
+ gguf_buf_free(buf);
20692
+ }
20693
+
20694
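These two functions are meant to be paired: gguf_get_meta_size does a dry run (the zero-sized gguf_buf never allocates, so the bwrite helpers only advance the offset), while gguf_get_meta_data renders the same bytes into a caller-provided buffer that must be at least gguf_get_meta_size(ctx) bytes. A brief sketch, again using the hypothetical gctx from above:

    // assumes #include <stdlib.h>
    // copy just the serialized metadata section into caller-owned memory
    const size_t meta_size = gguf_get_meta_size(gctx);
    void * meta = malloc(meta_size);   // caller must provide at least meta_size bytes
    gguf_get_meta_data(gctx, meta);
    // ... use `meta`, e.g. hand the header to code that maps the tensor data separately ...
    free(meta);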
+ ////////////////////////////////////////////////////////////////////////////////
20695
+
20696
+ int ggml_cpu_has_avx(void) {
20697
+ #if defined(__AVX__)
20698
+ return 1;
20699
+ #else
20700
+ return 0;
20701
+ #endif
20702
+ }
20703
+
20704
+ int ggml_cpu_has_avx2(void) {
20705
+ #if defined(__AVX2__)
20706
+ return 1;
20707
+ #else
20708
+ return 0;
20709
+ #endif
20710
+ }
20711
+
20712
+ int ggml_cpu_has_avx512(void) {
20713
+ #if defined(__AVX512F__)
20714
+ return 1;
20715
+ #else
20716
+ return 0;
20717
+ #endif
20718
+ }
20719
+
20720
+ int ggml_cpu_has_avx512_vbmi(void) {
20721
+ #if defined(__AVX512VBMI__)
20722
+ return 1;
20723
+ #else
20724
+ return 0;
20725
+ #endif
20726
+ }
20727
+
20728
+ int ggml_cpu_has_avx512_vnni(void) {
20729
+ #if defined(__AVX512VNNI__)
20730
+ return 1;
20731
+ #else
20732
+ return 0;
20733
+ #endif
20734
+ }
20735
+
20736
+ int ggml_cpu_has_fma(void) {
20737
+ #if defined(__FMA__)
20738
+ return 1;
20739
+ #else
20740
+ return 0;
20741
+ #endif
20742
+ }
20743
+
20744
+ int ggml_cpu_has_neon(void) {
20745
+ #if defined(__ARM_NEON)
20746
+ return 1;
20747
+ #else
20748
+ return 0;
20749
+ #endif
20750
+ }
20751
+
20752
+ int ggml_cpu_has_arm_fma(void) {
20753
+ #if defined(__ARM_FEATURE_FMA)
18622
20754
  return 1;
18623
20755
  #else
18624
20756
  return 0;
@@ -18685,6 +20817,14 @@ int ggml_cpu_has_sse3(void) {
18685
20817
  #endif
18686
20818
  }
18687
20819
 
20820
+ int ggml_cpu_has_ssse3(void) {
20821
+ #if defined(__SSSE3__)
20822
+ return 1;
20823
+ #else
20824
+ return 0;
20825
+ #endif
20826
+ }
20827
+
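Note that these ggml_cpu_has_* helpers report compile-time configuration: each one tests the corresponding compiler macro, so they describe the instruction sets this binary was built with, not what the host CPU supports at run time (llama.cpp uses them to assemble its system-info line). A small hypothetical sketch of querying them:

    // assumes #include <stdio.h> and #include "ggml.h"
    // report which SIMD paths this ggml build was compiled with
    static void print_cpu_features(void) {
        printf("AVX=%d AVX2=%d AVX512=%d FMA=%d NEON=%d SSSE3=%d\n",
               ggml_cpu_has_avx(), ggml_cpu_has_avx2(), ggml_cpu_has_avx512(),
               ggml_cpu_has_fma(), ggml_cpu_has_neon(), ggml_cpu_has_ssse3());
    }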
18688
20828
  int ggml_cpu_has_vsx(void) {
18689
20829
  #if defined(__POWER9_VECTOR__)
18690
20830
  return 1;