llama_cpp 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
  #include <sys/stat.h>
  #include <unistd.h>

+ #endif
+ #ifdef GGML_USE_CPU_HBM
+ #include <hbwmalloc.h>
  #endif

  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -123,6 +126,8 @@ typedef void * thread_ret_t;
  #define GGML_GELU_FP16
  #define GGML_GELU_QUICK_FP16
  #define GGML_SILU_FP16
+ // #define GGML_CROSS_ENTROPY_EXP_FP16
+ // #define GGML_FLASH_ATTN_EXP_FP16

  #define GGML_SOFT_MAX_UNROLL 4
  #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +162,6 @@ typedef void * thread_ret_t;
  //#define GGML_SOFT_MAX_ACCELERATE
  #endif

- #if UINTPTR_MAX == 0xFFFFFFFF
- #define GGML_MEM_ALIGN 4
- #else
- #define GGML_MEM_ALIGN 16
- #endif
-
  //
  // logging
  //
@@ -192,13 +191,19 @@ typedef void * thread_ret_t;
  //

  #if defined(_MSC_VER) || defined(__MINGW32__)
- #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
- #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
+ #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
+ #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
  #else
  inline static void * ggml_aligned_malloc(size_t size) {
+ if (size == 0) {
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+ return NULL;
+ }
  void * aligned_memory = NULL;
- #ifdef GGML_USE_METAL
- int result = posix_memalign(&aligned_memory, getpagesize(), size);
+ #ifdef GGML_USE_CPU_HBM
+ int result = hbw_posix_memalign(&aligned_memory, 16, size);
+ #elif GGML_USE_METAL
+ int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
  #else
  int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
  #endif
@@ -218,8 +223,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
  }
  return aligned_memory;
  }
- #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
- #define GGML_ALIGNED_FREE(ptr) free(ptr)
+ #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+ #ifdef GGML_USE_CPU_HBM
+ #define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+ #else
+ #define GGML_ALIGNED_FREE(ptr) free(ptr)
+ #endif
  #endif

  #define UNUSED GGML_UNUSED
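Note on the HBM branch above: when GGML_USE_CPU_HBM is defined, aligned buffers come from memkind's hbwmalloc API rather than the libc allocator, so anything obtained through GGML_ALIGNED_MALLOC must be released with hbw_free, never plain free(). A minimal standalone sketch of that pairing (assuming libmemkind is installed and linked, and that the machine actually exposes high-bandwidth memory):

    #include <hbwmalloc.h>
    #include <stdio.h>

    int main(void) {
        void * buf = NULL;
        /* 16-byte alignment, matching what ggml_aligned_malloc requests under GGML_USE_CPU_HBM */
        if (hbw_posix_memalign(&buf, 16, 1024) != 0) {
            fprintf(stderr, "HBM allocation failed\n");
            return 1;
        }
        hbw_free(buf); /* must not be mixed with free() */
        return 0;
    }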
@@ -305,6 +314,10 @@ typedef double ggml_float;
  #endif
  #endif

+ #ifdef __riscv_v_intrinsic
+ #include <riscv_vector.h>
+ #endif
+
  #ifdef __F16C__

  #ifdef _MSC_VER
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

  #if !defined(__aarch64__)

- inline static uint16_t vaddvq_u8(uint8x16_t v) {
- return
- (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
- (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
- (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
- (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
- (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
- (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
- (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
- (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
- }
-
- inline static int16_t vaddvq_s8(int8x16_t v) {
- return
- (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
- (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
- (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
- (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
- (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
- (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
- (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
- (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
- }
-
- inline static int32_t vaddvq_s16(int16x8_t v) {
- return
- (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
- (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
- (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
- (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
- }
-
- inline static uint32_t vaddvq_u16(uint16x8_t v) {
- return
- (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
- (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
- (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
- (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
- }
-
  inline static int32_t vaddvq_s32(int32x4_t v) {
  return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
  }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
  }

- inline static float vminvq_f32(float32x4_t v) {
- return
- MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
- MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
- }
-
  inline static float vmaxvq_f32(float32x4_t v) {
  return
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -2436,7 +2403,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);

  const block_q4_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;
@@ -2445,6 +2411,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q4_0 * restrict x0 = &x[i + 0];
  const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2623,6 +2590,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  }

  // Main loop
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 2; i < nb; i+=2) {
  _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2680,6 +2648,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  }

  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
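For readers unfamiliar with the RVV intrinsics above: per q4_0 block, the vector path does the same work as the scalar fallback that follows it — split each byte of qs into a low and a high nibble, subtract the q4_0 zero-point of 8, widen-multiply against the q8_0 values, reduce to one integer, and scale by the product of the two fp16 block scales. A scalar reference of that per-block computation, written as a hypothetical standalone helper (not part of the diff):

    #include <stdint.h>

    /* illustration only: one q4_0 x q8_0 block (qk = 32); dx and dy are the block scales */
    static float q4_0_q8_0_block_dot_ref(const uint8_t qs[16], const int8_t ys[32],
                                         float dx, float dy) {
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            const int v0 = (qs[j] & 0x0F) - 8; /* low nibble  -> pairs with ys[j]      */
            const int v1 = (qs[j] >>   4) - 8; /* high nibble -> pairs with ys[j + 16] */
            sumi += v0*ys[j] + v1*ys[j + 16];
        }
        return sumi*dx*dy;
    }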
@@ -2706,7 +2709,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);

  const block_q4_1 * restrict x = vx;
  const block_q8_1 * restrict y = vy;
@@ -2718,6 +2720,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *

  float summs = 0;

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q4_1 * restrict x0 = &x[i + 0];
  const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2806,6 +2809,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc) + summs;
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -2832,7 +2867,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);
  assert(qk == QK5_0);

  const block_q5_0 * restrict x = vx;
@@ -2848,6 +2882,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  uint64_t tmp0[4];
  uint64_t tmp1[4];

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q5_0 * restrict x0 = &x[i];
  const block_q5_0 * restrict x1 = &x[i + 1];
@@ -3040,6 +3075,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc);
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ uint32_t qh;
+
+ // These temp values are for masking and shift operations
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+ // temporary registers
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
+ vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
+ vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
+ vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
+
+ // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+
+ // ((qh & (1u << (j + 16))) >> (j + 12));
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
+ vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+
+ // narrowing
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+ // load
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -3072,7 +3177,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);
  assert(qk == QK5_1);

  const block_q5_1 * restrict x = vx;
@@ -3091,6 +3195,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  uint64_t tmp0[4];
  uint64_t tmp1[4];

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q5_1 * restrict x0 = &x[i];
  const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3296,6 +3401,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc) + summs;
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+
+ uint32_t qh;
+
+ // These temp values are for shift operations
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+ for (int i = 0; i < nb; i++) {
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+ // temporary registers
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
+ vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
+
+ // load qh
+ vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+
+ // ((qh >> (j + 0)) << 4) & 0x10;
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+
+ // ((qh >> (j + 12)) ) & 0x10;
+ vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+
+ // narrowing
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+ // load
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -3328,7 +3499,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
  const int nb = n / qk;

  assert(n % qk == 0);
- assert(nb % 2 == 0);

  const block_q8_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;
@@ -3337,6 +3507,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);

+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
  for (int i = 0; i < nb; i += 2) {
  const block_q8_0 * restrict x0 = &x[i + 0];
  const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3407,6 +3578,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
  }

  *s = hsum_float_8(acc);
+ #elif defined(__riscv_v_intrinsic)
+ float sumf = 0.0;
+ size_t vl = __riscv_vsetvl_e8m1(qk);
+
+ for (int i = 0; i < nb; i++) {
+ // load elements
+ vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+ vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
+ }
+
+ *s = sumf;
  #else
  // scalar
  float sumf = 0.0;
@@ -4107,16 +4298,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  }

  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
- // this should handle cases where the tensor is not contiguous in memory
- // probaby just:
- //
- // return tensor->ne[3]*tensor->nb[3]
- //
- // is enough, but just in case, adding the second part
-
- return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+ size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ }
+ return nbytes;
  }

  size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
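The rewritten ggml_nbytes() above no longer assumes GGML_MAX_DIMS == 4 or contiguous storage: it returns the byte offset just past the last element of a strided view, i.e. one row of blocks along dimension 0 plus (ne[i] - 1)*nb[i] for every higher dimension. A small sketch of the same formula with a worked check (hypothetical helper, 4 dims assumed, not part of the diff):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static size_t strided_nbytes(const int64_t ne[4], const size_t nb[4], int64_t blck_size) {
        size_t nbytes = (size_t)(ne[0]*nb[0]/blck_size);
        for (int i = 1; i < 4; ++i) {
            nbytes += (size_t)(ne[i] - 1)*nb[i];
        }
        return nbytes;
    }

    int main(void) {
        /* contiguous 4x3 f32 tensor: nb = {4, 16, 48, 48}, block size 1 */
        const int64_t ne[4] = { 4, 3, 1, 1 };
        const size_t  nb[4] = { 4, 16, 48, 48 };
        assert(strided_nbytes(ne, nb, 1) == 48); /* 16 + 2*16 = 48 bytes, i.e. 12 floats */
        return 0;
    }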
@@ -4393,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  return NULL;
  }

+ // allow to call ggml_init with 0 size
+ if (params.mem_size == 0) {
+ params.mem_size = GGML_MEM_ALIGN;
+ }
+
  const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

  *ctx = (struct ggml_context) {
@@ -4570,36 +4761,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  enum ggml_type type,
  int n_dims,
  const int64_t * ne,
- void * data) {
+ struct ggml_tensor * view_src,
+ size_t view_offs) {

  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

- size_t data_size = 0;
+ // find the base tensor and absolute offset
+ if (view_src != NULL && view_src->view_src != NULL) {
+ view_offs += view_src->view_offs;
+ view_src = view_src->view_src;
+ }

- if (data == NULL && !ctx->no_alloc) {
- data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
- for (int i = 1; i < n_dims; i++) {
- data_size *= ne[i];
- }
+ size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+ for (int i = 1; i < n_dims; i++) {
+ data_size *= ne[i];
  }

- if (ctx->scratch.data != NULL && data == NULL) {
- // allocate tensor data in the scratch buffer
- if (ctx->scratch.offs + data_size > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
- __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
- assert(false);
- return NULL;
- }
+ GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+
+ void * data = view_src != NULL ? view_src->data : NULL;
+ if (data != NULL) {
+ data = (char *) data + view_offs;
+ }

- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+ size_t obj_alloc_size = 0;
+
+ if (view_src == NULL && !ctx->no_alloc) {
+ if (ctx->scratch.data != NULL) {
+ // allocate tensor data in the scratch buffer
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+ assert(false);
+ return NULL;
+ }

- ctx->scratch.offs += data_size;
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;

- data_size = 0;
+ ctx->scratch.offs += data_size;
+ } else {
+ // allocate tensor data in the context's memory pool
+ obj_alloc_size = data_size;
+ }
  }

- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);

  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here

@@ -4619,7 +4825,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  /*.perf_runs =*/ 0,
  /*.perf_cycles =*/ 0,
  /*.perf_time_us =*/ 0,
- /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
+ /*.view_src =*/ view_src,
+ /*.view_offs =*/ view_offs,
+ /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
  /*.name =*/ { 0 },
  /*.extra =*/ NULL,
  /*.padding =*/ { 0 },
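With the view_src/view_offs fields introduced above, a view no longer stores a raw pointer computed at creation time; it records which tensor it views and at what byte offset, and ggml_new_tensor_impl collapses chains of views so that view_src always points at the tensor that owns the allocation. A toy sketch of that resolution idea (hypothetical struct, not the ggml API):

    #include <stddef.h>

    struct toy_tensor {
        struct toy_tensor * view_src; /* NULL for a tensor that owns its data */
        size_t              view_offs;
        void              * data;
    };

    /* walk up the chain: returns the owning tensor and accumulates the absolute byte offset */
    static struct toy_tensor * resolve_view(struct toy_tensor * t, size_t * offs) {
        *offs = 0;
        while (t->view_src != NULL) {
            *offs += t->view_offs;
            t      = t->view_src;
        }
        return t;
    }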
@@ -4643,28 +4851,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  return result;
  }

- static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
- GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
- assert(params_size <= GGML_MAX_OP_PARAMS);
- memcpy(tensor->op_params, params, params_size);
- }
-
- static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
- return ((const int32_t *)(tensor->op_params))[i];
- }
-
- static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
- ((int32_t *)(tensor->op_params))[i] = value;
- }
-
  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
  enum ggml_type type,
  int n_dims,
  const int64_t * ne) {
- return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
+ return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
  }

  struct ggml_tensor * ggml_new_tensor_1d(
@@ -4729,7 +4921,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
  }

  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
+ return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+ }
+
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+ assert(params_size <= GGML_MAX_OP_PARAMS);
+ memcpy(tensor->op_params, params, params_size);
+ }
+
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ return ((const int32_t *)(tensor->op_params))[i];
+ }
+
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ ((int32_t *)(tensor->op_params))[i] = value;
  }

  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5015,14 +5223,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *

  struct ggml_tensor * ggml_view_tensor(
  struct ggml_context * ctx,
- const struct ggml_tensor * src) {
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+ struct ggml_tensor * src) {
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
  ggml_format_name(result, "%s (view)", src->name);

- result->nb[0] = src->nb[0];
- result->nb[1] = src->nb[1];
- result->nb[2] = src->nb[2];
- result->nb[3] = src->nb[3];
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ result->nb[i] = src->nb[i];
+ }

  return result;
  }
@@ -5280,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
  }

  if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
  }

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5323,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
  }

  if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
  }

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5595,7 +5802,7 @@ struct ggml_tensor * ggml_repeat_back(

  // ggml_concat

- struct ggml_tensor* ggml_concat(
+ struct ggml_tensor * ggml_concat(
  struct ggml_context* ctx,
  struct ggml_tensor* a,
  struct ggml_tensor* b) {
@@ -5862,7 +6069,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
  struct ggml_tensor * ggml_rms_norm_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b) {
+ struct ggml_tensor * b,
+ float eps) {
  bool is_node = false;

  if (a->grad) {
@@ -5872,6 +6080,8 @@ struct ggml_tensor * ggml_rms_norm_back(

  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

+ ggml_set_op_params(result, &eps, sizeof(eps));
+
  result->op = GGML_OP_RMS_NORM_BACK;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
@@ -6201,7 +6411,7 @@ struct ggml_tensor * ggml_reshape(
  //GGML_ASSERT(false);
  }

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6225,7 +6435,7 @@ struct ggml_tensor * ggml_reshape_1d(
  }

  const int64_t ne[1] = { ne0 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6250,7 +6460,7 @@ struct ggml_tensor * ggml_reshape_2d(
  }

  const int64_t ne[2] = { ne0, ne1 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6276,7 +6486,7 @@ struct ggml_tensor * ggml_reshape_3d(
  }

  const int64_t ne[3] = { ne0, ne1, ne2 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6286,7 +6496,6 @@ struct ggml_tensor * ggml_reshape_3d(
  return result;
  }

-
  struct ggml_tensor * ggml_reshape_4d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -6304,7 +6513,7 @@ struct ggml_tensor * ggml_reshape_4d(
  }

  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
  ggml_format_name(result, "%s (reshaped)", a->name);

  result->op = GGML_OP_RESHAPE;
@@ -6314,46 +6523,40 @@ struct ggml_tensor * ggml_reshape_4d(
  return result;
  }

- // ggml_view_1d
-
- static struct ggml_tensor * ggml_view_tensor_offset(
+ static struct ggml_tensor * ggml_view_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_dims,
  const int64_t * ne,
  size_t offset) {
- // don't calculate an offset from an unallocated tensor
- void * data = NULL;
- if (a->data != NULL) {
- data = (char *) a->data + offset;
- }

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }

+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
  ggml_format_name(result, "%s (view)", a->name);

  ggml_set_op_params(result, &offset, sizeof(offset));

+ result->op = GGML_OP_VIEW;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
  return result;
  }

+ // ggml_view_1d
+
  struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int64_t ne0,
  size_t offset) {

- bool is_node = false;
-
- if (a->grad) {
- is_node = true;
- }
-
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
-
- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);

  return result;
  }
@@ -6368,24 +6571,14 @@ struct ggml_tensor * ggml_view_2d(
  size_t nb1,
  size_t offset) {

- bool is_node = false;
-
- if (a->grad) {
- is_node = true;
- }
-
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
+ const int64_t ne[2] = { ne0, ne1 };

- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = result->nb[1]*ne1;
  result->nb[3] = result->nb[2];

- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
-
  return result;
  }

@@ -6401,24 +6594,14 @@ struct ggml_tensor * ggml_view_3d(
  size_t nb2,
  size_t offset) {

- bool is_node = false;
-
- if (a->grad) {
- is_node = true;
- }
-
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+ const int64_t ne[3] = { ne0, ne1, ne2 };

- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = result->nb[2]*ne2;

- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
-
  return result;
  }

@@ -6436,24 +6619,14 @@ struct ggml_tensor * ggml_view_4d(
  size_t nb3,
  size_t offset) {

- bool is_node = false;
-
- if (a->grad) {
- is_node = true;
- }
-
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };

- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = nb3;

- result->op = GGML_OP_VIEW;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
-
  return result;
  }

@@ -6640,7 +6813,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

- int32_t params[] = { n_past, inplace ? 1 : 0 };
+ int32_t params[] = { n_past };
  ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_DIAG_MASK_INF;
@@ -6657,7 +6830,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
  }

-
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -6680,7 +6852,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

- int32_t params[] = { n_past, inplace ? 1 : 0 };
+ int32_t params[] = { n_past };
  ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -7097,11 +7269,13 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
  };

  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ ggml_set_op_params_i32(result, 0, stride);
+
  result->op = GGML_OP_CONV_TRANSPOSE_2D;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
  result->src[1] = b;
- result->src[2] = ggml_new_i32(ctx, stride);

  return result;
  }
@@ -9446,6 +9620,8 @@ static void ggml_compute_forward_div_f32(


  #ifdef GGML_USE_ACCELERATE
+ UNUSED(ggml_vec_div_f32);
+
  vDSP_vdiv(
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -10752,7 +10928,8 @@ static void ggml_compute_forward_rms_norm_back_f32(

  GGML_TENSOR_BINARY_OP_LOCALS;

- const float eps = 1e-6f; // TODO: make this a parameter
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));

  // TODO: optimize
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -11930,8 +12107,8 @@ static void ggml_compute_forward_diag_mask_f32(
  const int ith = params->ith;
  const int nth = params->nth;

- const int n_past = ((int32_t *) dst->op_params)[0];
- const bool inplace = (bool)((int32_t *) dst->op_params)[1];
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const bool inplace = src0->data == dst->data;

  GGML_ASSERT(n_past >= 0);

@@ -12142,6 +12319,7 @@ static void ggml_compute_forward_soft_max_back_f32(
  // dx = J * dy
  // dxk = sum_i(Jki * dyi)
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
  // dxk = -yk * dot(y, dy) + yk*dyk
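The comment line added in this hunk just spells out the intermediate expansion before the yk*yk*dyk terms cancel. In the usual notation, with y = softmax(x), the derivation the comment block walks through is (a compact restatement, not new material):

    J_{ki} = \frac{\partial y_k}{\partial x_i} = y_k(\delta_{ki} - y_i), \qquad
    dx_k = \sum_i J_{ki}\, dy_i = y_k\, dy_k - y_k \sum_i y_i\, dy_i = y_k\,(dy_k - y \cdot dy)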
@@ -13497,7 +13675,6 @@ static void ggml_compute_forward_conv_transpose_2d(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
- const struct ggml_tensor * opt0,
  struct ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13557,7 +13734,7 @@ static void ggml_compute_forward_conv_transpose_2d(
  return;
  }

- const int32_t stride = ((const int32_t*)(opt0->data))[0];
+ const int32_t stride = ggml_get_op_params_i32(dst, 0);

  // total patches in dst
  const int np = ne2;
@@ -13570,7 +13747,7 @@ static void ggml_compute_forward_conv_transpose_2d(
  const int ip1 = MIN(ip0 + dp, np);

  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
- ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
+ ggml_fp16_t * const wdata_src = wdata + nk;

  for (int i2 = ip0; i2 < ip1; i2++) { // Cout
  float * dst_data = (float *)((char *) dst->data + i2*nb2);
@@ -13582,9 +13759,8 @@ static void ggml_compute_forward_conv_transpose_2d(
  for (int i00 = 0; i00 < ne00; i00++) {
  float v = 0;
  ggml_vec_dot_f16(ne03, &v,
- (ggml_fp16_t *) wdata_src + i1n,
- (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
-
+ wdata_src + i1n,
+ wdata_kernel + i01*ne00*ne03 + i00*ne03);
  dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
  }
  }
@@ -13934,7 +14110,7 @@ static void ggml_compute_forward_flash_attn_f32(
  vvexpf(S, S, &Mup);
  ggml_vec_sum_f32(Mup, &sum, S);
  #else
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };

  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13944,9 +14120,13 @@ static void ggml_compute_forward_flash_attn_f32(
  if (SS[j] == -INFINITY) {
  SS[j] = 0.0f;
  } else {
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
+ const float val = expf(SS[j] - max);
+ #else
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
  memcpy(&scvt[j], &s, sizeof(uint16_t));
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+ #endif
  sump[j] += (ggml_float)val;
  SS[j] = val;
  }
@@ -14524,7 +14704,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
  vvexpf(SM, SM, &Mup);
  ggml_vec_sum_f32(Mup, &sum, SM);
  #else
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };

  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -14535,9 +14715,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
  if (SR[j] == -INFINITY) {
  SW[j] = 0.0f;
  } else {
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
+ const float val = expf(SR[j] - max);
+ #else
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
  memcpy(&scvt[j], &s, sizeof(uint16_t));
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+ #endif
  sump[j] += (ggml_float)val;
  SW[j] = val;
  }
@@ -15275,6 +15459,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
  const int nc = src0->ne[0];
  const int nr = ggml_nrows(src0);

+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
+
  if (params->type == GGML_TASK_INIT) {
  if (ith == 0) {
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -15286,7 +15472,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
  if (ith == 0) {
  float * dp = (float *) dst->data;
  ggml_vec_sum_f32(nth, dp, sums);
- dp[0] *= -1.0f;
+ dp[0] *= -1.0f / (float) nr;
  }
  return;
  }
@@ -15303,7 +15489,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
  for (int i1 = ir0; i1 < ir1; i1++) {
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
- float * st = (float *) params->wdata + nth + ith*nc;
+ float * st = ((float *) params->wdata) + nth + ith*nc;

  #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
@@ -15318,15 +15504,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
  float max = -INFINITY;
  ggml_vec_max_f32(nc, &max, s0);

- uint16_t scvt;
+ uint16_t scvt; UNUSED(scvt);
  for (int i = 0; i < nc; i++) {
  if (s0[i] == -INFINITY) {
  st[i] = 0.0f;
  } else {
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
+ const float s = s0[i] - max;
+ const float val = expf(s);
+ #else
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
  memcpy(&scvt, &s, sizeof(scvt));
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+ #endif
  sum += (ggml_float)val;
  st[i] = val;
  }
@@ -15342,7 +15532,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
  ggml_vec_log_f32(nc, st, st);
  ggml_vec_mul_f32(nc, st, st, s1);

- ggml_vec_sum_f32(nc, sums + ith, st);
+ float st_sum = 0;
+ ggml_vec_sum_f32(nc, &st_sum, st);
+ sums[ith] += st_sum;

  #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
@@ -15392,7 +15584,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
  return;
  }

- const float eps = 1e-9f;
+ const double eps = 1e-9;

  // TODO: handle transposed/permuted matrices
  const int64_t nc = src0->ne[0];
@@ -15411,7 +15603,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
- float * sm = (float *) params->wdata + ith*nc;

  #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
@@ -15420,54 +15611,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
  assert(!isnan(s1[i]));
  }
  #endif
- // step by step explanation:
- {
- //float * sums = (float *) params->wdata;
-
- // forward pass with annotated gradients from backward pass
- // (built by going in reverse operation order, adding to gradients of current operation args)
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
-
- // substitute into grad[st1], because we can reuse softmax_back from this point on
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
- // postorder:
- // grad[st1] := softmax(s0)
- // grad[st1] := grad[st1]*(1.0 - eps)
- // grad[st1] := grad[st1] + eps
- // grad[st1] := s1 / grad[st1]
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
-
- // src0 gradients by going through softmax_back
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
- // from softmax_back:
- // dxk = yk * (dyk - dot(y, dy))
- // dot_y_dy := dot(y, dy)
- // dx := dy
- // dx := dx - dot_y_dy
- // dx := dx * y
- // postorder:
- // dot_st1_dst1 := dot(st1, grad[st1])
- // grad[s0] := grad[st1]
- // grad[s0] := grad[s0] - dot_st1_dst1
- // grad[s0] := grad[s0] * st1
-
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
- // sm := softmax(s0)
- // grad[s0] := sm*(1.0 - eps)
- // grad[s0] := grad[s0] + eps
- // grad[s0] := s1 / grad[s0]
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
- // dot_st1_dst1 := dot(sm, grad[s0])
- // grad[s0] := grad[s0] - dot_st1_dst1
- // grad[s0] := grad[s0] * sm
- }

  // soft_max
  ggml_float sum = 0.0;
@@ -15475,39 +15618,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
  float max = -INFINITY;
  ggml_vec_max_f32(nc, &max, s0);

- uint16_t scvt;
+ uint16_t scvt; UNUSED(scvt);
  for (int i = 0; i < nc; i++) {
  if (s0[i] == -INFINITY) {
- sm[i] = 0.0f;
+ ds0[i] = 0.0f;
  } else {
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
+ const float s = s0[i] - max;
+ const float val = expf(s);
+ #else
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
  memcpy(&scvt, &s, sizeof(scvt));
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+ #endif
  sum += (ggml_float)val;
- sm[i] = val;
+ ds0[i] = val;
  }
  }

  assert(sum > 0.0);
- sum = 1.0/sum;
+ sum = (1.0 - eps)/sum;
  }

- float dot_st1_dst1 = 0;
- ggml_vec_scale_f32(nc, sm, sum);
- ggml_vec_cpy_f32 (nc, ds0, sm);
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
+ ggml_vec_scale_f32(nc, ds0, sum);
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
+

  #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
- assert(!isnan(sm[i]));
- assert(!isinf(sm[i]));
  assert(!isnan(ds0[i]));
  assert(!isinf(ds0[i]));
  }
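The rewritten backward pass above is the textbook softmax-plus-cross-entropy gradient. With p = softmax(s0), target rows s1 that sum to one, and the forward loss now averaged over the nr rows, the result being applied row by row (and then scaled by the incoming gradient d[0]) is, ignoring the small eps smoothing:

    L = -\frac{1}{nr}\sum_{r}\sum_{i} s1_i \log p_i, \qquad
    \frac{\partial L}{\partial s0_i} = \frac{p_i - s1_i}{nr}

This is stated here as a clarifying note consistent with the grad(src0) comment in the code, not as additional documentation from the package itself.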
@@ -15731,7 +15872,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  } break;
  case GGML_OP_CONV_TRANSPOSE_2D:
  {
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
  } break;
  case GGML_OP_POOL_1D:
  {
@@ -16062,9 +16203,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  // necessary for llama
  if (src0->grad) {
+ float eps;
+ memcpy(&eps, tensor->op_params, sizeof(float));
+
  src0->grad = ggml_add_impl(ctx,
  src0->grad,
- ggml_rms_norm_back(ctx, src0, tensor->grad),
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
  inplace);
  }
  } break;
@@ -16832,9 +16976,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
  return result;
  }

- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
- struct ggml_cgraph result = *gf;
-
+ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
  GGML_ASSERT(gf->n_nodes > 0);

  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16858,15 +17000,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
  }
  }

- for (int i = gf->n_nodes - 1; i >= 0; i--) {
+ for (int i = 0; i < gf->n_nodes; i++) {
  struct ggml_tensor * node = gf->nodes[i];

  if (node->is_param) {
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
- ggml_build_forward_expand(&result, node->grad);
+ ggml_build_forward_expand(gb, node->grad);
  }
  }
+ }

+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
+ struct ggml_cgraph result = *gf;
+ ggml_build_backward_expand(ctx, gf, &result, keep);
  return result;
  }

@@ -17542,10 +17688,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
  {
  n_tasks = n_threads;
-
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
-
- work_size = MAX(work_size, cur);
  } break;
  case GGML_OP_NONE:
  {
@@ -18423,14 +18565,16 @@ static enum ggml_opt_result ggml_opt_adam(
  struct ggml_opt_params params,
  struct ggml_tensor * f,
  struct ggml_cgraph * gf,
- struct ggml_cgraph * gb) {
+ struct ggml_cgraph * gb,
+ ggml_opt_callback callback,
+ void * callback_data) {
  GGML_ASSERT(ggml_is_scalar(f));

  // these will store the parameters we want to optimize
  struct ggml_tensor * ps[GGML_MAX_PARAMS];

  int np = 0;
- int nx = 0;
+ int64_t nx = 0;
  for (int i = 0; i < gf->n_nodes; ++i) {
  if (gf->nodes[i]->is_param) {
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -18449,31 +18593,32 @@ static enum ggml_opt_result ggml_opt_adam(
18449
18593
  }
18450
18594
 
18451
18595
  // constants
18452
- const float sched = params.adam.sched;
18453
- const float decay = params.adam.decay * sched;
18454
- const float alpha = params.adam.alpha * sched;
18596
+ float sched = params.adam.sched;
18597
+ const float alpha = params.adam.alpha;
18598
+ const float decay = params.adam.decay * alpha;
18455
18599
  const float beta1 = params.adam.beta1;
18456
18600
  const float beta2 = params.adam.beta2;
18457
18601
  const float eps = params.adam.eps;
18602
+ const float gclip = params.adam.gclip;
18603
+ const int decay_min_ndim = params.adam.decay_min_ndim;
18458
18604
 
18459
- float * x = opt->adam.x->data; // view of the parameters
18460
- float * g1 = opt->adam.g1->data; // gradient
18461
- float * g2 = opt->adam.g2->data; // gradient squared
18462
18605
  float * m = opt->adam.m->data; // first moment
18463
18606
  float * v = opt->adam.v->data; // second moment
18464
- float * mh = opt->adam.mh->data; // first moment hat
18465
- float * vh = opt->adam.vh->data; // second moment hat
18466
18607
 
18467
18608
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18468
18609
 
18469
- // update view
18470
- ggml_opt_get_params(np, ps, x);
18610
+ if (callback) {
18611
+ callback(callback_data, &sched);
18612
+ }
18471
18613
 
18472
18614
  // compute the function value
18473
18615
  ggml_graph_reset (gf);
18474
18616
  ggml_set_f32 (f->grad, 1.0f);
18475
18617
 
18476
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18618
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18619
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18620
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18621
+ ggml_graph_compute(gb, &cplan);
18477
18622
 
18478
18623
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
18479
18624
  opt->adam.fx_best = opt->adam.fx_prev;
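Both optimizers now follow the plan-then-compute pattern instead of ggml_graph_compute_with_ctx: ggml_graph_plan sizes the scratch buffer once, the caller supplies cplan.work_data, and ggml_graph_compute reuses it on every iteration. Inside ggml.c the buffer is carved out of the context via ggml_new_object(..., GGML_OBJECT_WORK_BUFFER, ...), which is internal; a hedged sketch of the same pattern for an external caller (gf, gb and f as in the previous sketch; <stdlib.h>/<stdint.h> assumed) uses a plain heap allocation instead:

    // Sketch: size the work buffer once, then reuse it across evaluations.
    struct ggml_cplan cplan = ggml_graph_plan(&gb, /*n_threads=*/4);
    uint8_t * work_data = cplan.work_size > 0 ? malloc(cplan.work_size) : NULL;
    cplan.work_data = work_data;

    for (int iter = 0; iter < 100; ++iter) {
        ggml_graph_reset(&gf);               // zero the gradients
        ggml_set_f32(f->grad, 1.0f);         // seed dL/dL = 1
        ggml_graph_compute(&gb, &cplan);     // forward + backward with the same plan
        // ... parameter update ...
    }

    free(work_data);
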
@@ -18481,6 +18626,9 @@ static enum ggml_opt_result ggml_opt_adam(
18481
18626
  pf[opt->iter % params.past] = opt->adam.fx_prev;
18482
18627
  }
18483
18628
 
18629
+ opt->loss_before = opt->adam.fx_prev;
18630
+ opt->loss_after = opt->adam.fx_prev;
18631
+
18484
18632
  // initialize
18485
18633
  if (opt->just_initialized) {
18486
18634
  opt->adam.n_no_improvement = 0;
@@ -18513,50 +18661,55 @@ static enum ggml_opt_result ggml_opt_adam(
18513
18661
  UNUSED(t_start_cpu);
18514
18662
 
18515
18663
  {
18516
- // update the gradient
18517
- ggml_opt_get_grad(np, ps, g1);
18518
-
18519
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
18520
- ggml_vec_scale_f32(nx, m, beta1);
18521
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
18522
-
18523
- // g2 = g1^2
18524
- ggml_vec_sqr_f32 (nx, g2, g1);
18525
-
18526
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
18527
- ggml_vec_scale_f32(nx, v, beta2);
18528
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
18529
-
18530
- // m^hat = m_t / (1 - beta1^t)
18531
- // v^hat = v_t / (1 - beta2^t)
18532
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
18533
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
18534
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
18535
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
18536
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
18537
- ggml_vec_cpy_f32 (nx, mh, m);
18538
- ggml_vec_cpy_f32 (nx, vh, v);
18539
-
18540
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
18541
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
18542
-
18543
- ggml_vec_sqrt_f32 (nx, vh, vh);
18544
- ggml_vec_acc1_f32 (nx, vh, eps);
18545
-
18546
- ggml_vec_div_f32 (nx, mh, mh, vh);
18547
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
18548
- ggml_vec_sub_f32 (nx, x, x, mh);
18664
+ float gnorm = 1.0f;
18665
+ if (gclip > 0.0f) {
18666
+ // gradient clipping
18667
+ ggml_float sum = 0.0;
18668
+ for (int p = 0; p < np; ++p) {
18669
+ const int64_t ne = ggml_nelements(ps[p]);
18670
+ for (int64_t j = 0; j < ne; ++j) {
18671
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
18672
+ sum += (ggml_float)(g*g);
18673
+ }
18674
+ }
18675
+ ggml_float norm = sqrt(sum);
18676
+ if (norm > (ggml_float) gclip) {
18677
+ gnorm = (float) ((ggml_float) gclip / norm);
18678
+ }
18679
+ }
18680
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
18681
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
18682
+ int64_t i = 0;
18683
+ for (int p = 0; p < np; ++p) {
18684
+ const int64_t ne = ggml_nelements(ps[p]);
18685
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
18686
+ for (int64_t j = 0; j < ne; ++j) {
18687
+ float x = ggml_get_f32_1d(ps[p], j);
18688
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
18689
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
18690
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
18691
+ float mh = m[i]*beta1h;
18692
+ float vh = v[i]*beta2h;
18693
+ vh = sqrtf(vh) + eps;
18694
+ x = x*(1.0f - p_decay) - mh/vh;
18695
+ ggml_set_f32_1d(ps[p], j, x);
18696
+ ++i;
18697
+ }
18698
+ }
18699
+ }
18549
18700
 
18550
- // update the parameters
18551
- ggml_opt_set_params(np, ps, x);
18701
+ if (callback) {
18702
+ callback(callback_data, &sched);
18552
18703
  }
18553
18704
 
18554
18705
  ggml_graph_reset (gf);
18555
18706
  ggml_set_f32 (f->grad, 1.0f);
18556
18707
 
18557
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18708
+ ggml_graph_compute(gb, &cplan);
18558
18709
 
18559
18710
  const float fx = ggml_get_f32_1d(f, 0);
18711
+ opt->loss_after = fx;
18712
+
18560
18713
 
18561
18714
  // check convergence
18562
18715
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
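The rewritten Adam step drops the x/g1/g2/mh/vh scratch tensors and walks the parameter tensors directly with ggml_get_f32_1d/ggml_set_f32_1d, adding two knobs: global-norm gradient clipping (gclip) and weight decay that only applies to tensors with at least decay_min_ndim dimensions. The clipping factor is the standard one, scaling every gradient element by gclip/||g||2 whenever the global L2 norm exceeds gclip. A self-contained sketch of that rule on a flat float array (illustration only, not the tensor walk above):

    #include <math.h>
    #include <stddef.h>

    // Scale g in place so that its global L2 norm does not exceed gclip,
    // mirroring the gnorm computation in the Adam step above.
    static void clip_grad_global_norm(float * g, size_t n, float gclip) {
        double sum = 0.0;
        for (size_t i = 0; i < n; ++i) {
            sum += (double) g[i] * (double) g[i];
        }
        const double norm = sqrt(sum);
        if (gclip > 0.0f && norm > (double) gclip) {
            const float scale = (float) ((double) gclip / norm);
            for (size_t i = 0; i < n; ++i) {
                g[i] *= scale;
            }
        }
    }

The per-parameter gate ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) follows the common AdamW convention of exempting 1-D tensors such as biases and norm weights from weight decay.
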
@@ -18625,7 +18778,6 @@ struct ggml_lbfgs_iteration_data {
18625
18778
  };
18626
18779
 
18627
18780
  static enum ggml_opt_result linesearch_backtracking(
18628
- struct ggml_context * ctx,
18629
18781
  const struct ggml_opt_params * params,
18630
18782
  int nx,
18631
18783
  float * x,
@@ -18637,8 +18789,11 @@ static enum ggml_opt_result linesearch_backtracking(
18637
18789
  struct ggml_tensor * f,
18638
18790
  struct ggml_cgraph * gf,
18639
18791
  struct ggml_cgraph * gb,
18792
+ struct ggml_cplan * cplan,
18640
18793
  const int np,
18641
- struct ggml_tensor * ps[]) {
18794
+ struct ggml_tensor * ps[],
18795
+ ggml_opt_callback callback,
18796
+ void * callback_data) {
18642
18797
  int count = 0;
18643
18798
 
18644
18799
  float width = 0.0f;
@@ -18667,6 +18822,12 @@ static enum ggml_opt_result linesearch_backtracking(
18667
18822
  dgtest = params->lbfgs.ftol*dginit;
18668
18823
 
18669
18824
  while (true) {
18825
+ if (callback) {
18826
+ // LBFG-S does not support learning rate -> ignore learning schedule
18827
+ float sched = 0;
18828
+ callback(callback_data, &sched);
18829
+ }
18830
+
18670
18831
  ggml_vec_cpy_f32(nx, x, xp);
18671
18832
  ggml_vec_mad_f32(nx, x, d, *step);
18672
18833
 
@@ -18677,7 +18838,7 @@ static enum ggml_opt_result linesearch_backtracking(
18677
18838
  ggml_graph_reset (gf);
18678
18839
  ggml_set_f32 (f->grad, 1.0f);
18679
18840
 
18680
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
18841
+ ggml_graph_compute(gb, cplan);
18681
18842
 
18682
18843
  ggml_opt_get_grad(np, ps, g);
18683
18844
 
@@ -18737,7 +18898,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18737
18898
  struct ggml_opt_params params,
18738
18899
  struct ggml_tensor * f,
18739
18900
  struct ggml_cgraph * gf,
18740
- struct ggml_cgraph * gb) {
18901
+ struct ggml_cgraph * gb,
18902
+ ggml_opt_callback callback,
18903
+ void * callback_data) {
18741
18904
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18742
18905
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18743
18906
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -18769,6 +18932,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18769
18932
  opt->iter = iter;
18770
18933
  }
18771
18934
 
18935
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18936
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18937
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18938
+
18772
18939
  float * x = opt->lbfgs.x->data; // current parameters
18773
18940
  float * xp = opt->lbfgs.xp->data; // previous parameters
18774
18941
  float * g = opt->lbfgs.g->data; // current gradient
@@ -18790,6 +18957,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18790
18957
  float * lm_s = opt->lbfgs.lms->data;
18791
18958
  float * lm_y = opt->lbfgs.lmy->data;
18792
18959
 
18960
+ if (callback) {
18961
+ // LBFG-S does not support learning rate -> ignore learning schedule
18962
+ float sched = 0;
18963
+ callback(callback_data, &sched);
18964
+ }
18965
+
18793
18966
  // evaluate the function value and its gradient
18794
18967
  {
18795
18968
  ggml_opt_set_params(np, ps, x);
@@ -18797,11 +18970,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18797
18970
  ggml_graph_reset (gf);
18798
18971
  ggml_set_f32 (f->grad, 1.0f);
18799
18972
 
18800
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18973
+ ggml_graph_compute(gb, &cplan);
18801
18974
 
18802
18975
  ggml_opt_get_grad(np, ps, g);
18803
18976
 
18804
18977
  fx = ggml_get_f32_1d(f, 0);
18978
+
18979
+ opt->loss_before = fx;
18980
+ opt->loss_after = fx;
18805
18981
  }
18806
18982
 
18807
18983
  // search direction = -gradient
@@ -18856,7 +19032,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18856
19032
  ggml_vec_cpy_f32(nx, xp, x);
18857
19033
  ggml_vec_cpy_f32(nx, gp, g);
18858
19034
 
18859
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
19035
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
18860
19036
 
18861
19037
  if (ls < 0) {
18862
19038
  // linesearch failed - go back to the previous point and return
@@ -18866,6 +19042,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18866
19042
  return ls;
18867
19043
  }
18868
19044
 
19045
+ opt->loss_after = fx;
19046
+
18869
19047
  ggml_vec_norm_f32(nx, &xnorm, x);
18870
19048
  ggml_vec_norm_f32(nx, &gnorm, g);
18871
19049
 
@@ -18923,7 +19101,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18923
19101
  // ys = y^t \cdot s -> 1 / \rho.
18924
19102
  // yy = y^t \cdot y.
18925
19103
  //
18926
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
19104
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
18927
19105
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
18928
19106
 
18929
19107
  lm_ys[end[0]] = ys;
@@ -18986,13 +19164,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18986
19164
  .adam = {
18987
19165
  .n_iter = 10000,
18988
19166
  .sched = 1.000f,
18989
- .decay = 0.001f,
19167
+ .decay = 0.0f,
19168
+ .decay_min_ndim = 2,
18990
19169
  .alpha = 0.001f,
18991
19170
  .beta1 = 0.9f,
18992
19171
  .beta2 = 0.999f,
18993
19172
  .eps = 1e-8f,
18994
19173
  .eps_f = 1e-5f,
18995
19174
  .eps_g = 1e-3f,
19175
+ .gclip = 0.0f,
18996
19176
  },
18997
19177
  };
18998
19178
  } break;
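The Adam defaults change accordingly: decay drops from 0.001f to 0.0f and gclip starts at 0.0f, so both weight decay and gradient clipping are strictly opt-in; when enabled, decay is multiplied by alpha (and by the per-iteration sched) rather than by sched alone. A hedged sketch of opting in from caller code, assuming f is the scalar loss tensor to minimize:

    // Sketch: enable AdamW-style decay and gradient clipping on top of the defaults.
    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
    params.adam.alpha          = 1e-3f; // base learning rate
    params.adam.decay          = 1e-2f; // weight decay, scaled by alpha internally
    params.adam.decay_min_ndim = 2;     // leave 1-D tensors (biases, norms) undecayed
    params.adam.gclip          = 1.0f;  // clip the global gradient norm at 1.0
    params.adam.sched          = 1.0f;  // schedule multiplier, can be driven by a callback

    enum ggml_opt_result res = ggml_opt(ctx, params, f);
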
@@ -19042,23 +19222,13 @@ GGML_API void ggml_opt_init(
19042
19222
  switch (opt->params.type) {
19043
19223
  case GGML_OPT_ADAM:
19044
19224
  {
19045
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19046
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19047
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19048
19225
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19049
19226
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19050
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19051
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19052
19227
  opt->adam.pf = params.past > 0
19053
19228
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
19054
19229
  : NULL;
19055
- ggml_set_zero(opt->adam.x);
19056
- ggml_set_zero(opt->adam.g1);
19057
- ggml_set_zero(opt->adam.g2);
19058
19230
  ggml_set_zero(opt->adam.m);
19059
19231
  ggml_set_zero(opt->adam.v);
19060
- ggml_set_zero(opt->adam.mh);
19061
- ggml_set_zero(opt->adam.vh);
19062
19232
  if (opt->adam.pf) {
19063
19233
  ggml_set_zero(opt->adam.pf);
19064
19234
  }
@@ -19142,7 +19312,7 @@ enum ggml_opt_result ggml_opt_resume(
19142
19312
  *gf = ggml_build_forward (f);
19143
19313
  *gb = ggml_build_backward(ctx, gf, true);
19144
19314
 
19145
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
19315
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
19146
19316
  }
19147
19317
 
19148
19318
  enum ggml_opt_result ggml_opt_resume_g(
@@ -19150,7 +19320,9 @@ enum ggml_opt_result ggml_opt_resume_g(
19150
19320
  struct ggml_opt_context * opt,
19151
19321
  struct ggml_tensor * f,
19152
19322
  struct ggml_cgraph * gf,
19153
- struct ggml_cgraph * gb) {
19323
+ struct ggml_cgraph * gb,
19324
+ ggml_opt_callback callback,
19325
+ void * callback_data) {
19154
19326
 
19155
19327
  // build forward + backward compute graphs
19156
19328
  enum ggml_opt_result result = GGML_OPT_OK;
@@ -19158,11 +19330,11 @@ enum ggml_opt_result ggml_opt_resume_g(
19158
19330
  switch (opt->params.type) {
19159
19331
  case GGML_OPT_ADAM:
19160
19332
  {
19161
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
19333
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19162
19334
  } break;
19163
19335
  case GGML_OPT_LBFGS:
19164
19336
  {
19165
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
19337
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19166
19338
  } break;
19167
19339
  }
19168
19340
 
@@ -19394,7 +19566,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19394
19566
  ////////////////////////////////////////////////////////////////////////////////
19395
19567
 
19396
19568
  struct gguf_str {
19397
- uint32_t n;
19569
+ uint64_t n; // GGUFv2
19398
19570
  char * data;
19399
19571
  };
19400
19572
 
@@ -19408,9 +19580,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
19408
19580
  [GGUF_TYPE_FLOAT32] = sizeof(float),
19409
19581
  [GGUF_TYPE_BOOL] = sizeof(bool),
19410
19582
  [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
19583
+ [GGUF_TYPE_UINT64] = sizeof(uint64_t),
19584
+ [GGUF_TYPE_INT64] = sizeof(int64_t),
19585
+ [GGUF_TYPE_FLOAT64] = sizeof(double),
19411
19586
  [GGUF_TYPE_ARRAY] = 0, // undefined
19412
19587
  };
19413
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19588
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19414
19589
 
19415
19590
  static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19416
19591
  [GGUF_TYPE_UINT8] = "u8",
@@ -19423,8 +19598,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19423
19598
  [GGUF_TYPE_BOOL] = "bool",
19424
19599
  [GGUF_TYPE_STRING] = "str",
19425
19600
  [GGUF_TYPE_ARRAY] = "arr",
19601
+ [GGUF_TYPE_UINT64] = "u64",
19602
+ [GGUF_TYPE_INT64] = "i64",
19603
+ [GGUF_TYPE_FLOAT64] = "f64",
19426
19604
  };
19427
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19605
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19428
19606
 
19429
19607
  union gguf_value {
19430
19608
  uint8_t uint8;
@@ -19434,6 +19612,9 @@ union gguf_value {
19434
19612
  uint32_t uint32;
19435
19613
  int32_t int32;
19436
19614
  float float32;
19615
+ uint64_t uint64;
19616
+ int64_t int64;
19617
+ double float64;
19437
19618
  bool bool_;
19438
19619
 
19439
19620
  struct gguf_str str;
@@ -19441,7 +19622,7 @@ union gguf_value {
19441
19622
  struct {
19442
19623
  enum gguf_type type;
19443
19624
 
19444
- uint32_t n;
19625
+ uint64_t n; // GGUFv2
19445
19626
  void * data;
19446
19627
  } arr;
19447
19628
  };
@@ -19449,8 +19630,6 @@ union gguf_value {
19449
19630
  struct gguf_kv {
19450
19631
  struct gguf_str key;
19451
19632
 
19452
- uint32_t n_bytes; // TODO: is this actually needed?
19453
-
19454
19633
  enum gguf_type type;
19455
19634
  union gguf_value value;
19456
19635
  };
@@ -19458,15 +19637,15 @@ struct gguf_kv {
19458
19637
  struct gguf_header {
19459
19638
  uint32_t magic;
19460
19639
  uint32_t version;
19461
- uint32_t n_tensors;
19462
- uint32_t n_kv;
19640
+ uint64_t n_tensors; // GGUFv2
19641
+ uint64_t n_kv; // GGUFv2
19463
19642
  };
19464
19643
 
19465
19644
  struct gguf_tensor_info {
19466
19645
  struct gguf_str name;
19467
19646
 
19468
19647
  uint32_t n_dims;
19469
- uint32_t ne[GGML_MAX_DIMS];
19648
+ uint64_t ne[GGML_MAX_DIMS];
19470
19649
 
19471
19650
  enum ggml_type type;
19472
19651
 
@@ -19497,19 +19676,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
19497
19676
  return n == size;
19498
19677
  }
19499
19678
 
19500
- static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
19679
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19680
+ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
19501
19681
  p->n = 0;
19502
19682
  p->data = NULL;
19503
19683
 
19504
19684
  bool ok = true;
19505
19685
 
19506
- // TODO: how to avoid mallocs for strings?
19507
19686
  ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19508
19687
  ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19509
19688
 
19510
19689
  return ok;
19511
19690
  }
19512
19691
 
19692
+ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
19693
+ p->n = 0;
19694
+ p->data = NULL;
19695
+
19696
+ bool ok = true;
19697
+
19698
+ uint32_t n = 0;
19699
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
19700
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19701
+
19702
+ return ok;
19703
+ }
19704
+
19513
19705
  struct gguf_context * gguf_init_empty(void) {
19514
19706
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19515
19707
 
@@ -19565,8 +19757,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19565
19757
  ctx->data = NULL;
19566
19758
 
19567
19759
  ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
19568
- ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19569
- ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19760
+
19761
+ if (ctx->header.version == 1) {
19762
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19763
+ uint32_t n_tensors = 0;
19764
+ uint32_t n_kv = 0;
19765
+
19766
+ ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
19767
+ ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
19768
+
19769
+ ctx->header.n_tensors = n_tensors;
19770
+ ctx->header.n_kv = n_kv;
19771
+ } else {
19772
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19773
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19774
+ }
19570
19775
 
19571
19776
  if (!ok) {
19572
19777
  fprintf(stderr, "%s: failed to read header\n", __func__);
@@ -19576,18 +19781,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19576
19781
  }
19577
19782
  }
19578
19783
 
19784
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19785
+ bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
19786
+ if (ctx->header.version == 1) {
19787
+ gguf_fread_str = gguf_fread_str_v1;
19788
+ }
19789
+
19579
19790
  // read the kv pairs
19580
19791
  {
19581
- ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19792
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19582
19793
 
19583
19794
  for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19584
19795
  struct gguf_kv * kv = &ctx->kv[i];
19585
19796
 
19586
19797
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
19587
19798
 
19588
- ok = ok && gguf_fread_str(file, &kv->key, &offset);
19589
- //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
19590
- ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19799
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
19800
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19591
19801
 
19592
19802
  //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
19593
19803
 
@@ -19599,12 +19809,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19599
19809
  case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
19600
19810
  case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
19601
19811
  case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
19812
+ case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
19813
+ case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
19814
+ case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
19602
19815
  case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
19603
19816
  case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
19604
19817
  case GGUF_TYPE_ARRAY:
19605
19818
  {
19606
19819
  ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19607
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19820
+
19821
+ if (ctx->header.version == 1) {
19822
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19823
+ uint32_t n = 0;
19824
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
19825
+ kv->value.arr.n = n;
19826
+ } else {
19827
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19828
+ }
19608
19829
 
19609
19830
  switch (kv->value.arr.type) {
19610
19831
  case GGUF_TYPE_UINT8:
@@ -19614,6 +19835,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19614
19835
  case GGUF_TYPE_UINT32:
19615
19836
  case GGUF_TYPE_INT32:
19616
19837
  case GGUF_TYPE_FLOAT32:
19838
+ case GGUF_TYPE_UINT64:
19839
+ case GGUF_TYPE_INT64:
19840
+ case GGUF_TYPE_FLOAT64:
19617
19841
  case GGUF_TYPE_BOOL:
19618
19842
  {
19619
19843
  kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -19648,7 +19872,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19648
19872
 
19649
19873
  // read the tensor infos
19650
19874
  {
19651
- ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19875
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19652
19876
 
19653
19877
  for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19654
19878
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19660,7 +19884,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19660
19884
  ok = ok && gguf_fread_str(file, &info->name, &offset);
19661
19885
  ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19662
19886
  for (uint32_t j = 0; j < info->n_dims; ++j) {
19663
- ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19887
+ if (ctx->header.version == 1) {
19888
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19889
+ uint32_t t = 0;
19890
+ ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
19891
+ info->ne[j] = t;
19892
+ } else {
19893
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19894
+ }
19664
19895
  }
19665
19896
  ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19666
19897
  ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
@@ -19744,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19744
19975
 
19745
19976
  struct ggml_tensor * data = NULL;
19746
19977
 
19747
- if (params.no_alloc == false) {
19978
+ if (!params.no_alloc) {
19748
19979
  data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
19749
19980
 
19750
19981
  ok = ok && data != NULL;
@@ -19785,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19785
20016
  }
19786
20017
 
19787
20018
  // point the data member to the appropriate location in the binary blob using the tensor infos
19788
- if (params.no_alloc == false) {
20019
+ if (!params.no_alloc) {
19789
20020
  //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
19790
20021
  cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
19791
20022
  }
@@ -19842,7 +20073,7 @@ void gguf_free(struct gguf_context * ctx) {
19842
20073
  }
19843
20074
  }
19844
20075
 
19845
- GGML_ALIGNED_FREE(ctx->kv);
20076
+ free(ctx->kv);
19846
20077
  }
19847
20078
 
19848
20079
  if (ctx->infos) {
@@ -19854,7 +20085,7 @@ void gguf_free(struct gguf_context * ctx) {
19854
20085
  }
19855
20086
  }
19856
20087
 
19857
- GGML_ALIGNED_FREE(ctx->infos);
20088
+ free(ctx->infos);
19858
20089
  }
19859
20090
 
19860
20091
  GGML_ALIGNED_FREE(ctx);
@@ -19954,6 +20185,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
19954
20185
  return ctx->kv[i].value.float32;
19955
20186
  }
19956
20187
 
20188
+ uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
20189
+ return ctx->kv[i].value.uint64;
20190
+ }
20191
+
20192
+ int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
20193
+ return ctx->kv[i].value.int64;
20194
+ }
20195
+
20196
+ double gguf_get_val_f64(struct gguf_context * ctx, int i) {
20197
+ return ctx->kv[i].value.float64;
20198
+ }
20199
+
19957
20200
  bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
19958
20201
  return ctx->kv[i].value.bool_;
19959
20202
  }
@@ -20000,7 +20243,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
20000
20243
  const int n_kv = gguf_get_n_kv(ctx);
20001
20244
 
20002
20245
  ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
20003
- ctx->kv[n_kv].key.n = strlen(key) + 1;
20246
+ ctx->kv[n_kv].key.n = strlen(key);
20004
20247
  ctx->kv[n_kv].key.data = strdup(key);
20005
20248
  ctx->header.n_kv++;
20006
20249
 
@@ -20056,6 +20299,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
20056
20299
  ctx->kv[idx].value.float32 = val;
20057
20300
  }
20058
20301
 
20302
+ void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
20303
+ const int idx = gguf_get_or_add_key(ctx, key);
20304
+
20305
+ ctx->kv[idx].type = GGUF_TYPE_UINT64;
20306
+ ctx->kv[idx].value.uint64 = val;
20307
+ }
20308
+
20309
+ void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
20310
+ const int idx = gguf_get_or_add_key(ctx, key);
20311
+
20312
+ ctx->kv[idx].type = GGUF_TYPE_INT64;
20313
+ ctx->kv[idx].value.int64 = val;
20314
+ }
20315
+
20316
+ void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
20317
+ const int idx = gguf_get_or_add_key(ctx, key);
20318
+
20319
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
20320
+ ctx->kv[idx].value.float64 = val;
20321
+ }
20322
+
20059
20323
  void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
20060
20324
  const int idx = gguf_get_or_add_key(ctx, key);
20061
20325
 
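The three new value types (u64, i64, f64) round-trip through the same kv machinery as the existing scalars. A small hedged sketch of writing them and reading one back in memory ("ggml.h" assumed included; gguf_find_key is part of the existing gguf API; the key names are illustrative):

    // Sketch: store and read back the new 64-bit / double metadata types.
    struct gguf_context * gctx = gguf_init_empty();

    gguf_set_val_u64(gctx, "example.n_bytes",    123456789012ULL);
    gguf_set_val_i64(gctx, "example.offset",     -42LL);
    gguf_set_val_f64(gctx, "example.rope_scale", 0.25);

    const int kid = gguf_find_key(gctx, "example.n_bytes");
    const uint64_t n_bytes = kid >= 0 ? gguf_get_val_u64(gctx, kid) : 0;
    (void) n_bytes;

    gguf_free(gctx);
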
@@ -20067,7 +20331,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
20067
20331
  const int idx = gguf_get_or_add_key(ctx, key);
20068
20332
 
20069
20333
  ctx->kv[idx].type = GGUF_TYPE_STRING;
20070
- ctx->kv[idx].value.str.n = strlen(val) + 1;
20334
+ ctx->kv[idx].value.str.n = strlen(val);
20071
20335
  ctx->kv[idx].value.str.data = strdup(val);
20072
20336
  }
20073
20337
 
@@ -20090,7 +20354,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
20090
20354
  ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20091
20355
  for (int i = 0; i < n; i++) {
20092
20356
  struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
20093
- str->n = strlen(data[i]) + 1;
20357
+ str->n = strlen(data[i]);
20094
20358
  str->data = strdup(data[i]);
20095
20359
  }
20096
20360
  }
@@ -20106,6 +20370,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
20106
20370
  case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
20107
20371
  case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
20108
20372
  case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
20373
+ case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
20374
+ case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
20375
+ case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
20109
20376
  case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
20110
20377
  case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
20111
20378
  case GGUF_TYPE_ARRAY:
@@ -20134,7 +20401,7 @@ void gguf_add_tensor(
20134
20401
  const int idx = ctx->header.n_tensors;
20135
20402
  ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
20136
20403
 
20137
- ctx->infos[idx].name.n = strlen(tensor->name) + 1;
20404
+ ctx->infos[idx].name.n = strlen(tensor->name);
20138
20405
  ctx->infos[idx].name.data = strdup(tensor->name);
20139
20406
 
20140
20407
  for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -20267,6 +20534,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
20267
20534
  case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
20268
20535
  case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
20269
20536
  case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
20537
+ case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
20538
+ case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
20539
+ case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
20270
20540
  case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
20271
20541
  case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
20272
20542
  case GGUF_TYPE_ARRAY:
@@ -20282,6 +20552,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
20282
20552
  case GGUF_TYPE_UINT32:
20283
20553
  case GGUF_TYPE_INT32:
20284
20554
  case GGUF_TYPE_FLOAT32:
20555
+ case GGUF_TYPE_UINT64:
20556
+ case GGUF_TYPE_INT64:
20557
+ case GGUF_TYPE_FLOAT64:
20285
20558
  case GGUF_TYPE_BOOL:
20286
20559
  {
20287
20560
  gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -20516,6 +20789,14 @@ int ggml_cpu_has_sse3(void) {
20516
20789
  #endif
20517
20790
  }
20518
20791
 
20792
+ int ggml_cpu_has_ssse3(void) {
20793
+ #if defined(__SSSE3__)
20794
+ return 1;
20795
+ #else
20796
+ return 0;
20797
+ #endif
20798
+ }
20799
+
20519
20800
  int ggml_cpu_has_vsx(void) {
20520
20801
  #if defined(__POWER9_VECTOR__)
20521
20802
  return 1;