llama_cpp 0.4.0 → 0.5.1 — diff of the bundled ggml.c

@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
103
103
  #include <sys/stat.h>
104
104
  #include <unistd.h>
105
105
 
106
+ #endif
107
+ #ifdef GGML_USE_CPU_HBM
108
+ #include <hbwmalloc.h>
106
109
  #endif
107
110
 
108
111
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -123,6 +126,8 @@ typedef void * thread_ret_t;
123
126
  #define GGML_GELU_FP16
124
127
  #define GGML_GELU_QUICK_FP16
125
128
  #define GGML_SILU_FP16
129
+ // #define GGML_CROSS_ENTROPY_EXP_FP16
130
+ // #define GGML_FLASH_ATTN_EXP_FP16
126
131
 
127
132
  #define GGML_SOFT_MAX_UNROLL 4
128
133
  #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +162,6 @@ typedef void * thread_ret_t;
157
162
  //#define GGML_SOFT_MAX_ACCELERATE
158
163
  #endif
159
164
 
160
- #if UINTPTR_MAX == 0xFFFFFFFF
161
- #define GGML_MEM_ALIGN 4
162
- #else
163
- #define GGML_MEM_ALIGN 16
164
- #endif
165
-
166
165
  //
167
166
  // logging
168
167
  //
@@ -192,13 +191,19 @@ typedef void * thread_ret_t;
192
191
  //
193
192
 
194
193
  #if defined(_MSC_VER) || defined(__MINGW32__)
195
- #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
196
- #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
194
+ #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
195
+ #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
197
196
  #else
198
197
  inline static void * ggml_aligned_malloc(size_t size) {
198
+ if (size == 0) {
199
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
200
+ return NULL;
201
+ }
199
202
  void * aligned_memory = NULL;
200
- #ifdef GGML_USE_METAL
201
- int result = posix_memalign(&aligned_memory, getpagesize(), size);
203
+ #ifdef GGML_USE_CPU_HBM
204
+ int result = hbw_posix_memalign(&aligned_memory, 16, size);
205
+ #elif GGML_USE_METAL
206
+ int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
202
207
  #else
203
208
  int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
204
209
  #endif
@@ -218,8 +223,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
218
223
  }
219
224
  return aligned_memory;
220
225
  }
221
- #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
222
- #define GGML_ALIGNED_FREE(ptr) free(ptr)
226
+ #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
227
+ #ifdef GGML_USE_CPU_HBM
228
+ #define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
229
+ #else
230
+ #define GGML_ALIGNED_FREE(ptr) free(ptr)
231
+ #endif
223
232
  #endif
224
233
 
225
234
  #define UNUSED GGML_UNUSED
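For reference, a minimal sketch of the allocation pattern the new GGML_USE_CPU_HBM path follows: hbw_posix_memalign/hbw_free from memkind's hbwmalloc.h take the place of posix_memalign/free, and memory obtained from the HBM pool has to be released with hbw_free rather than plain free, which is why GGML_ALIGNED_FREE grows its own #ifdef above. The wrapper names below are illustrative only, not ggml symbols.

// Sketch: HBM-aware aligned allocation, assuming memkind's hbwmalloc.h is
// available when building with -DGGML_USE_CPU_HBM.
#include <stdlib.h>
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif

static void * example_aligned_malloc(size_t size, size_t align) {
    if (size == 0) {
        return NULL; // mirrors the new zero-size warning path above
    }
    void * ptr = NULL;
#ifdef GGML_USE_CPU_HBM
    const int rc = hbw_posix_memalign(&ptr, align, size); // allocate from high-bandwidth memory
#else
    const int rc = posix_memalign(&ptr, align, size);
#endif
    return rc == 0 ? ptr : NULL;
}

static void example_aligned_free(void * ptr) {
#ifdef GGML_USE_CPU_HBM
    if (ptr != NULL) {
        hbw_free(ptr); // HBM allocations must go back to the HBM allocator
    }
#else
    free(ptr);
#endif
}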
@@ -305,6 +314,10 @@ typedef double ggml_float;
305
314
  #endif
306
315
  #endif
307
316
 
317
+ #ifdef __riscv_v_intrinsic
318
+ #include <riscv_vector.h>
319
+ #endif
320
+
308
321
  #ifdef __F16C__
309
322
 
310
323
  #ifdef _MSC_VER
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
817
830
 
818
831
  #if !defined(__aarch64__)
819
832
 
820
- inline static uint16_t vaddvq_u8(uint8x16_t v) {
821
- return
822
- (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
823
- (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
824
- (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
825
- (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
826
- (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
827
- (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
828
- (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
829
- (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
830
- }
831
-
832
- inline static int16_t vaddvq_s8(int8x16_t v) {
833
- return
834
- (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
835
- (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
836
- (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
837
- (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
838
- (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
839
- (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
840
- (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
841
- (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
842
- }
843
-
844
- inline static int32_t vaddvq_s16(int16x8_t v) {
845
- return
846
- (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
847
- (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
848
- (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
849
- (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
850
- }
851
-
852
- inline static uint32_t vaddvq_u16(uint16x8_t v) {
853
- return
854
- (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
855
- (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
856
- (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
857
- (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
858
- }
859
-
860
833
  inline static int32_t vaddvq_s32(int32x4_t v) {
861
834
  return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
862
835
  }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
865
838
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
866
839
  }
867
840
 
868
- inline static float vminvq_f32(float32x4_t v) {
869
- return
870
- MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
871
- MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
872
- }
873
-
874
841
  inline static float vmaxvq_f32(float32x4_t v) {
875
842
  return
876
843
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -2436,7 +2403,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2436
2403
  const int nb = n / qk;
2437
2404
 
2438
2405
  assert(n % qk == 0);
2439
- assert(nb % 2 == 0);
2440
2406
 
2441
2407
  const block_q4_0 * restrict x = vx;
2442
2408
  const block_q8_0 * restrict y = vy;
@@ -2445,6 +2411,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2445
2411
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
2446
2412
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
2447
2413
 
2414
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2448
2415
  for (int i = 0; i < nb; i += 2) {
2449
2416
  const block_q4_0 * restrict x0 = &x[i + 0];
2450
2417
  const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2623,6 +2590,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2623
2590
  }
2624
2591
 
2625
2592
  // Main loop
2593
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2626
2594
  for (int i = 2; i < nb; i+=2) {
2627
2595
  _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
2628
2596
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2680,6 +2648,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2680
2648
  }
2681
2649
 
2682
2650
  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
2651
+ #elif defined(__riscv_v_intrinsic)
2652
+ float sumf = 0.0;
2653
+
2654
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2655
+
2656
+ for (int i = 0; i < nb; i++) {
2657
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2658
+
2659
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2660
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2661
+
2662
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2663
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2664
+
2665
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2666
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2667
+
2668
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
2669
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
2670
+
2671
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2672
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2673
+
2674
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2675
+
2676
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2677
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2678
+
2679
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2680
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2681
+
2682
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2683
+ }
2684
+
2685
+ *s = sumf;
2683
2686
  #else
2684
2687
  // scalar
2685
2688
  float sumf = 0.0;
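As a reading aid for the RISC-V vector branch added above, here is a scalar sketch of what one q4_0 × q8_0 block contributes: each byte of x[i].qs packs two 4-bit quants, which are unpacked, recentered by -8, multiplied against the int8 quants of y, and the integer sum is scaled by the two block scales. The struct layout is simplified for illustration (scales kept as plain float; ggml stores them as fp16).

// Scalar sketch of one q4_0 x q8_0 block dot product (simplified block layout).
#include <stdint.h>

#define QK40 32

typedef struct { float d; uint8_t qs[QK40/2]; } blk_q4_0; // 32 4-bit quants + 1 scale
typedef struct { float d; int8_t  qs[QK40];   } blk_q8_0; // 32 8-bit quants + 1 scale

static float dot_q4_0_q8_0_block(const blk_q4_0 * x, const blk_q8_0 * y) {
    int sumi = 0;
    for (int j = 0; j < QK40/2; ++j) {
        const int v0 = (x->qs[j] & 0x0F) - 8; // low nibble  -> quant j
        const int v1 = (x->qs[j] >>   4) - 8; // high nibble -> quant j + 16
        sumi += v0*y->qs[j] + v1*y->qs[j + QK40/2];
    }
    return x->d * y->d * (float) sumi;       // scale the integer dot product
}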
@@ -2706,7 +2709,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2706
2709
  const int nb = n / qk;
2707
2710
 
2708
2711
  assert(n % qk == 0);
2709
- assert(nb % 2 == 0);
2710
2712
 
2711
2713
  const block_q4_1 * restrict x = vx;
2712
2714
  const block_q8_1 * restrict y = vy;
@@ -2718,6 +2720,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2718
2720
 
2719
2721
  float summs = 0;
2720
2722
 
2723
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2721
2724
  for (int i = 0; i < nb; i += 2) {
2722
2725
  const block_q4_1 * restrict x0 = &x[i + 0];
2723
2726
  const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2806,6 +2809,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2806
2809
  }
2807
2810
 
2808
2811
  *s = hsum_float_8(acc) + summs;
2812
+ #elif defined(__riscv_v_intrinsic)
2813
+ float sumf = 0.0;
2814
+
2815
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2816
+
2817
+ for (int i = 0; i < nb; i++) {
2818
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2819
+
2820
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2821
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2822
+
2823
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2824
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2825
+
2826
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2827
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2828
+
2829
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2830
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2831
+
2832
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2833
+
2834
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2835
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2836
+
2837
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2838
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2839
+
2840
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2841
+ }
2842
+
2843
+ *s = sumf;
2809
2844
  #else
2810
2845
  // scalar
2811
2846
  float sumf = 0.0;
@@ -2832,7 +2867,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2832
2867
  const int nb = n / qk;
2833
2868
 
2834
2869
  assert(n % qk == 0);
2835
- assert(nb % 2 == 0);
2836
2870
  assert(qk == QK5_0);
2837
2871
 
2838
2872
  const block_q5_0 * restrict x = vx;
@@ -2848,6 +2882,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2848
2882
  uint64_t tmp0[4];
2849
2883
  uint64_t tmp1[4];
2850
2884
 
2885
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2851
2886
  for (int i = 0; i < nb; i += 2) {
2852
2887
  const block_q5_0 * restrict x0 = &x[i];
2853
2888
  const block_q5_0 * restrict x1 = &x[i + 1];
@@ -3040,6 +3075,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3040
3075
  }
3041
3076
 
3042
3077
  *s = hsum_float_8(acc);
3078
+ #elif defined(__riscv_v_intrinsic)
3079
+ float sumf = 0.0;
3080
+
3081
+ uint32_t qh;
3082
+
3083
+ // These temp values are for masking and shift operations
3084
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3085
+ uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
3086
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
3087
+
3088
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3089
+
3090
+ for (int i = 0; i < nb; i++) {
3091
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3092
+
3093
+ // temporary registers
3094
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
3095
+ vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
3096
+ vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
3097
+ vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
3098
+
3099
+ // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
3100
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
3101
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
3102
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3103
+
3104
+ // ((qh & (1u << (j + 16))) >> (j + 12));
3105
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
3106
+ vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
3107
+
3108
+ // narrowing
3109
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
3110
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3111
+
3112
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
3113
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3114
+
3115
+ // load
3116
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3117
+
3118
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3119
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3120
+
3121
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3122
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3123
+
3124
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3125
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3126
+
3127
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3128
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3129
+
3130
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
3131
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
3132
+
3133
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3134
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3135
+
3136
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3137
+
3138
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3139
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3140
+
3141
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3142
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3143
+
3144
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
3145
+ }
3146
+
3147
+ *s = sumf;
3043
3148
  #else
3044
3149
  // scalar
3045
3150
  float sumf = 0.0;
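The q5_0 path above additionally reconstructs the fifth bit of every quant from the 32-bit qh field before the same widening multiply-and-reduce. A scalar sketch of that bit reassembly (again a simplified layout, float scale instead of fp16):

// Scalar sketch of one q5_0 x q8_0 block: qs supplies bits 0-3 of each quant,
// qh supplies bit 4, giving signed values in [-16, 15].
#include <stdint.h>
#include <string.h>

#define QK50 32

typedef struct { float d; uint8_t qh[4]; uint8_t qs[QK50/2]; } blk_q5_0;
typedef struct { float d; int8_t  qs[QK50]; }                 blk_q8_0v;

static float dot_q5_0_q8_0_block(const blk_q5_0 * x, const blk_q8_0v * y) {
    uint32_t qh;
    memcpy(&qh, x->qh, sizeof(qh));
    int sumi = 0;
    for (int j = 0; j < QK50/2; ++j) {
        const int h0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; // bit 4 of quant j
        const int h1 = ((qh & (1u << (j + 16))) >> (j + 12));      // bit 4 of quant j + 16, already shifted into place
        const int v0 = ((x->qs[j] & 0x0F) | h0) - 16;
        const int v1 = ((x->qs[j] >>   4) | h1) - 16;
        sumi += v0*y->qs[j] + v1*y->qs[j + QK50/2];
    }
    return x->d * y->d * (float) sumi;
}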
@@ -3072,7 +3177,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3072
3177
  const int nb = n / qk;
3073
3178
 
3074
3179
  assert(n % qk == 0);
3075
- assert(nb % 2 == 0);
3076
3180
  assert(qk == QK5_1);
3077
3181
 
3078
3182
  const block_q5_1 * restrict x = vx;
@@ -3091,6 +3195,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3091
3195
  uint64_t tmp0[4];
3092
3196
  uint64_t tmp1[4];
3093
3197
 
3198
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3094
3199
  for (int i = 0; i < nb; i += 2) {
3095
3200
  const block_q5_1 * restrict x0 = &x[i];
3096
3201
  const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3296,6 +3401,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3296
3401
  }
3297
3402
 
3298
3403
  *s = hsum_float_8(acc) + summs;
3404
+ #elif defined(__riscv_v_intrinsic)
3405
+ float sumf = 0.0;
3406
+
3407
+ uint32_t qh;
3408
+
3409
+ // These temp values are for shift operations
3410
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3411
+
3412
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3413
+
3414
+ for (int i = 0; i < nb; i++) {
3415
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3416
+
3417
+ // temporary registers
3418
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
3419
+ vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
3420
+
3421
+ // load qh
3422
+ vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
3423
+
3424
+ // ((qh >> (j + 0)) << 4) & 0x10;
3425
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
3426
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3427
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
3428
+
3429
+ // ((qh >> (j + 12)) ) & 0x10;
3430
+ vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
3431
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
3432
+
3433
+ // narrowing
3434
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
3435
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3436
+
3437
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
3438
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3439
+
3440
+ // load
3441
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3442
+
3443
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3444
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3445
+
3446
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3447
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3448
+
3449
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3450
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3451
+
3452
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3453
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3454
+
3455
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3456
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3457
+
3458
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3459
+
3460
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3461
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3462
+
3463
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3464
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3465
+
3466
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
3467
+ }
3468
+
3469
+ *s = sumf;
3299
3470
  #else
3300
3471
  // scalar
3301
3472
  float sumf = 0.0;
@@ -3328,7 +3499,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3328
3499
  const int nb = n / qk;
3329
3500
 
3330
3501
  assert(n % qk == 0);
3331
- assert(nb % 2 == 0);
3332
3502
 
3333
3503
  const block_q8_0 * restrict x = vx;
3334
3504
  const block_q8_0 * restrict y = vy;
@@ -3337,6 +3507,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3337
3507
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3338
3508
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3339
3509
 
3510
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3340
3511
  for (int i = 0; i < nb; i += 2) {
3341
3512
  const block_q8_0 * restrict x0 = &x[i + 0];
3342
3513
  const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3407,6 +3578,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3407
3578
  }
3408
3579
 
3409
3580
  *s = hsum_float_8(acc);
3581
+ #elif defined(__riscv_v_intrinsic)
3582
+ float sumf = 0.0;
3583
+ size_t vl = __riscv_vsetvl_e8m1(qk);
3584
+
3585
+ for (int i = 0; i < nb; i++) {
3586
+ // load elements
3587
+ vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
3588
+ vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
3589
+
3590
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
3591
+
3592
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
3593
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
3594
+
3595
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
3596
+
3597
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3598
+ }
3599
+
3600
+ *s = sumf;
3410
3601
  #else
3411
3602
  // scalar
3412
3603
  float sumf = 0.0;
@@ -4107,16 +4298,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
4107
4298
  }
4108
4299
 
4109
4300
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
4110
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4111
-
4112
- // this should handle cases where the tensor is not contiguous in memory
4113
- // probaby just:
4114
- //
4115
- // return tensor->ne[3]*tensor->nb[3]
4116
- //
4117
- // is enough, but just in case, adding the second part
4118
-
4119
- return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
4301
+ size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
4302
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
4303
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
4304
+ }
4305
+ return nbytes;
4120
4306
  }
4121
4307
 
4122
4308
  size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
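The rewritten ggml_nbytes above sums the bytes of one contiguous row plus the stride-weighted distance to the last row, plane and batch, which also covers strided views. A small self-contained sketch of that computation, assuming GGML_MAX_DIMS == 4 and a block size of 1:

// Sketch: byte extent of a (possibly strided) 4-D view, following the new
// ggml_nbytes logic. ne[] are element counts, nb[] are byte strides.
#include <stdint.h>
#include <stdio.h>

static size_t view_extent_bytes(const int64_t ne[4], const size_t nb[4]) {
    size_t nbytes = (size_t) ne[0]*nb[0];     // one contiguous row of the view
    for (int i = 1; i < 4; ++i) {
        nbytes += (size_t)(ne[i] - 1)*nb[i];  // step to the start of the last row/plane/batch
    }
    return nbytes;
}

int main(void) {
    // a 2x3 f32 window into a 10x10 f32 matrix: the parent's row stride (40 bytes) applies
    const int64_t ne[4] = { 3, 2, 1, 1 };
    const size_t  nb[4] = { 4, 40, 400, 400 };
    printf("%zu bytes\n", view_extent_bytes(ne, nb)); // 3*4 + 1*40 = 52
    return 0;
}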
@@ -4393,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4393
4579
  return NULL;
4394
4580
  }
4395
4581
 
4582
+ // allow to call ggml_init with 0 size
4583
+ if (params.mem_size == 0) {
4584
+ params.mem_size = GGML_MEM_ALIGN;
4585
+ }
4586
+
4396
4587
  const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
4397
4588
 
4398
4589
  *ctx = (struct ggml_context) {
@@ -4570,36 +4761,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4570
4761
  enum ggml_type type,
4571
4762
  int n_dims,
4572
4763
  const int64_t * ne,
4573
- void * data) {
4764
+ struct ggml_tensor * view_src,
4765
+ size_t view_offs) {
4574
4766
 
4575
4767
  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4576
4768
 
4577
- size_t data_size = 0;
4769
+ // find the base tensor and absolute offset
4770
+ if (view_src != NULL && view_src->view_src != NULL) {
4771
+ view_offs += view_src->view_offs;
4772
+ view_src = view_src->view_src;
4773
+ }
4578
4774
 
4579
- if (data == NULL && !ctx->no_alloc) {
4580
- data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4581
- for (int i = 1; i < n_dims; i++) {
4582
- data_size *= ne[i];
4583
- }
4775
+ size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4776
+ for (int i = 1; i < n_dims; i++) {
4777
+ data_size *= ne[i];
4584
4778
  }
4585
4779
 
4586
- if (ctx->scratch.data != NULL && data == NULL) {
4587
- // allocate tensor data in the scratch buffer
4588
- if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4589
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4590
- __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4591
- assert(false);
4592
- return NULL;
4593
- }
4780
+ GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
4781
+
4782
+ void * data = view_src != NULL ? view_src->data : NULL;
4783
+ if (data != NULL) {
4784
+ data = (char *) data + view_offs;
4785
+ }
4594
4786
 
4595
- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4787
+ size_t obj_alloc_size = 0;
4788
+
4789
+ if (view_src == NULL && !ctx->no_alloc) {
4790
+ if (ctx->scratch.data != NULL) {
4791
+ // allocate tensor data in the scratch buffer
4792
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4793
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4794
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4795
+ assert(false);
4796
+ return NULL;
4797
+ }
4596
4798
 
4597
- ctx->scratch.offs += data_size;
4799
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4598
4800
 
4599
- data_size = 0;
4801
+ ctx->scratch.offs += data_size;
4802
+ } else {
4803
+ // allocate tensor data in the context's memory pool
4804
+ obj_alloc_size = data_size;
4805
+ }
4600
4806
  }
4601
4807
 
4602
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4808
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
4603
4809
 
4604
4810
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4605
4811
 
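The new view_src/view_offs bookkeeping collapses a chain of views into a single base tensor plus an absolute byte offset before any data pointer is derived. A minimal sketch of that resolution step, with a stripped-down stand-in for struct ggml_tensor (ggml only ever needs one hop, but a loop is shown for clarity):

// Sketch: resolving a chain of views to (base tensor, absolute offset),
// mirroring the "find the base tensor and absolute offset" step above.
#include <stddef.h>

struct toy_tensor {
    struct toy_tensor * view_src;  // tensor this one is a view of (NULL if it owns its data)
    size_t              view_offs; // byte offset into view_src's data
    void              * data;
};

static void resolve_view(struct toy_tensor * src, size_t offs,
                         struct toy_tensor ** base, size_t * abs_offs) {
    while (src != NULL && src->view_src != NULL) {
        offs += src->view_offs;    // accumulate offsets along the chain
        src   = src->view_src;     // walk to the tensor that actually owns the data
    }
    *base     = src;
    *abs_offs = offs;
}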
@@ -4619,7 +4825,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4619
4825
  /*.perf_runs =*/ 0,
4620
4826
  /*.perf_cycles =*/ 0,
4621
4827
  /*.perf_time_us =*/ 0,
4622
- /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4828
+ /*.view_src =*/ view_src,
4829
+ /*.view_offs =*/ view_offs,
4830
+ /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
4623
4831
  /*.name =*/ { 0 },
4624
4832
  /*.extra =*/ NULL,
4625
4833
  /*.padding =*/ { 0 },
@@ -4643,28 +4851,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4643
4851
  return result;
4644
4852
  }
4645
4853
 
4646
- static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4647
- GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4648
- assert(params_size <= GGML_MAX_OP_PARAMS);
4649
- memcpy(tensor->op_params, params, params_size);
4650
- }
4651
-
4652
- static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4653
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4654
- return ((const int32_t *)(tensor->op_params))[i];
4655
- }
4656
-
4657
- static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4658
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4659
- ((int32_t *)(tensor->op_params))[i] = value;
4660
- }
4661
-
4662
4854
  struct ggml_tensor * ggml_new_tensor(
4663
4855
  struct ggml_context * ctx,
4664
4856
  enum ggml_type type,
4665
4857
  int n_dims,
4666
4858
  const int64_t * ne) {
4667
- return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4859
+ return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
4668
4860
  }
4669
4861
 
4670
4862
  struct ggml_tensor * ggml_new_tensor_1d(
@@ -4729,7 +4921,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
4729
4921
  }
4730
4922
 
4731
4923
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
4732
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
4924
+ return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
4925
+ }
4926
+
4927
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4928
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4929
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4930
+ memcpy(tensor->op_params, params, params_size);
4931
+ }
4932
+
4933
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4934
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4935
+ return ((const int32_t *)(tensor->op_params))[i];
4936
+ }
4937
+
4938
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4939
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4940
+ ((int32_t *)(tensor->op_params))[i] = value;
4733
4941
  }
4734
4942
 
4735
4943
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5015,14 +5223,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
5015
5223
 
5016
5224
  struct ggml_tensor * ggml_view_tensor(
5017
5225
  struct ggml_context * ctx,
5018
- const struct ggml_tensor * src) {
5019
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
5226
+ struct ggml_tensor * src) {
5227
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
5020
5228
  ggml_format_name(result, "%s (view)", src->name);
5021
5229
 
5022
- result->nb[0] = src->nb[0];
5023
- result->nb[1] = src->nb[1];
5024
- result->nb[2] = src->nb[2];
5025
- result->nb[3] = src->nb[3];
5230
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
5231
+ result->nb[i] = src->nb[i];
5232
+ }
5026
5233
 
5027
5234
  return result;
5028
5235
  }
@@ -5280,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
5280
5487
  }
5281
5488
 
5282
5489
  if (inplace) {
5283
- GGML_ASSERT(is_node == false);
5490
+ GGML_ASSERT(!is_node);
5284
5491
  }
5285
5492
 
5286
5493
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5323,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
5323
5530
  }
5324
5531
 
5325
5532
  if (inplace) {
5326
- GGML_ASSERT(is_node == false);
5533
+ GGML_ASSERT(!is_node);
5327
5534
  }
5328
5535
 
5329
5536
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5595,7 +5802,7 @@ struct ggml_tensor * ggml_repeat_back(
5595
5802
 
5596
5803
  // ggml_concat
5597
5804
 
5598
- struct ggml_tensor* ggml_concat(
5805
+ struct ggml_tensor * ggml_concat(
5599
5806
  struct ggml_context* ctx,
5600
5807
  struct ggml_tensor* a,
5601
5808
  struct ggml_tensor* b) {
@@ -5862,7 +6069,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
5862
6069
  struct ggml_tensor * ggml_rms_norm_back(
5863
6070
  struct ggml_context * ctx,
5864
6071
  struct ggml_tensor * a,
5865
- struct ggml_tensor * b) {
6072
+ struct ggml_tensor * b,
6073
+ float eps) {
5866
6074
  bool is_node = false;
5867
6075
 
5868
6076
  if (a->grad) {
@@ -5872,6 +6080,8 @@ struct ggml_tensor * ggml_rms_norm_back(
5872
6080
 
5873
6081
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5874
6082
 
6083
+ ggml_set_op_params(result, &eps, sizeof(eps));
6084
+
5875
6085
  result->op = GGML_OP_RMS_NORM_BACK;
5876
6086
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5877
6087
  result->src[0] = a;
@@ -6201,7 +6411,7 @@ struct ggml_tensor * ggml_reshape(
6201
6411
  //GGML_ASSERT(false);
6202
6412
  }
6203
6413
 
6204
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6414
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
6205
6415
  ggml_format_name(result, "%s (reshaped)", a->name);
6206
6416
 
6207
6417
  result->op = GGML_OP_RESHAPE;
@@ -6225,7 +6435,7 @@ struct ggml_tensor * ggml_reshape_1d(
6225
6435
  }
6226
6436
 
6227
6437
  const int64_t ne[1] = { ne0 };
6228
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6438
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
6229
6439
  ggml_format_name(result, "%s (reshaped)", a->name);
6230
6440
 
6231
6441
  result->op = GGML_OP_RESHAPE;
@@ -6250,7 +6460,7 @@ struct ggml_tensor * ggml_reshape_2d(
6250
6460
  }
6251
6461
 
6252
6462
  const int64_t ne[2] = { ne0, ne1 };
6253
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6463
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
6254
6464
  ggml_format_name(result, "%s (reshaped)", a->name);
6255
6465
 
6256
6466
  result->op = GGML_OP_RESHAPE;
@@ -6276,7 +6486,7 @@ struct ggml_tensor * ggml_reshape_3d(
6276
6486
  }
6277
6487
 
6278
6488
  const int64_t ne[3] = { ne0, ne1, ne2 };
6279
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6489
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
6280
6490
  ggml_format_name(result, "%s (reshaped)", a->name);
6281
6491
 
6282
6492
  result->op = GGML_OP_RESHAPE;
@@ -6286,7 +6496,6 @@ struct ggml_tensor * ggml_reshape_3d(
6286
6496
  return result;
6287
6497
  }
6288
6498
 
6289
-
6290
6499
  struct ggml_tensor * ggml_reshape_4d(
6291
6500
  struct ggml_context * ctx,
6292
6501
  struct ggml_tensor * a,
@@ -6304,7 +6513,7 @@ struct ggml_tensor * ggml_reshape_4d(
6304
6513
  }
6305
6514
 
6306
6515
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6307
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6516
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
6308
6517
  ggml_format_name(result, "%s (reshaped)", a->name);
6309
6518
 
6310
6519
  result->op = GGML_OP_RESHAPE;
@@ -6314,46 +6523,40 @@ struct ggml_tensor * ggml_reshape_4d(
6314
6523
  return result;
6315
6524
  }
6316
6525
 
6317
- // ggml_view_1d
6318
-
6319
- static struct ggml_tensor * ggml_view_tensor_offset(
6526
+ static struct ggml_tensor * ggml_view_impl(
6320
6527
  struct ggml_context * ctx,
6321
6528
  struct ggml_tensor * a,
6322
6529
  int n_dims,
6323
6530
  const int64_t * ne,
6324
6531
  size_t offset) {
6325
- // don't calculate an offset from an unallocated tensor
6326
- void * data = NULL;
6327
- if (a->data != NULL) {
6328
- data = (char *) a->data + offset;
6329
- }
6330
6532
 
6331
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6533
+ bool is_node = false;
6534
+
6535
+ if (a->grad) {
6536
+ is_node = true;
6537
+ }
6332
6538
 
6539
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
6333
6540
  ggml_format_name(result, "%s (view)", a->name);
6334
6541
 
6335
6542
  ggml_set_op_params(result, &offset, sizeof(offset));
6336
6543
 
6544
+ result->op = GGML_OP_VIEW;
6545
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6546
+ result->src[0] = a;
6547
+
6337
6548
  return result;
6338
6549
  }
6339
6550
 
6551
+ // ggml_view_1d
6552
+
6340
6553
  struct ggml_tensor * ggml_view_1d(
6341
6554
  struct ggml_context * ctx,
6342
6555
  struct ggml_tensor * a,
6343
6556
  int64_t ne0,
6344
6557
  size_t offset) {
6345
6558
 
6346
- bool is_node = false;
6347
-
6348
- if (a->grad) {
6349
- is_node = true;
6350
- }
6351
-
6352
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
6353
-
6354
- result->op = GGML_OP_VIEW;
6355
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6356
- result->src[0] = a;
6559
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
6357
6560
 
6358
6561
  return result;
6359
6562
  }
@@ -6368,24 +6571,14 @@ struct ggml_tensor * ggml_view_2d(
6368
6571
  size_t nb1,
6369
6572
  size_t offset) {
6370
6573
 
6371
- bool is_node = false;
6372
-
6373
- if (a->grad) {
6374
- is_node = true;
6375
- }
6376
-
6377
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6574
+ const int64_t ne[2] = { ne0, ne1 };
6378
6575
 
6379
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
6576
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
6380
6577
 
6381
6578
  result->nb[1] = nb1;
6382
6579
  result->nb[2] = result->nb[1]*ne1;
6383
6580
  result->nb[3] = result->nb[2];
6384
6581
 
6385
- result->op = GGML_OP_VIEW;
6386
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6387
- result->src[0] = a;
6388
-
6389
6582
  return result;
6390
6583
  }
6391
6584
 
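A usage sketch of the refactored view API: the ggml_view_* helpers now funnel through ggml_view_impl, so a view records its source tensor and byte offset instead of a raw data pointer. The shapes below are arbitrary example values, and a ggml context (ctx) is assumed to have been created in the usual way.

// Sketch: taking a 2-D view of rows 2..5 of a 10x64 f32 matrix.
struct ggml_tensor * mat  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 10);
struct ggml_tensor * rows = ggml_view_2d(ctx, mat,
        /*ne0   =*/ 64,
        /*ne1   =*/ 4,
        /*nb1   =*/ mat->nb[1],        // keep the parent's row stride
        /*offset=*/ 2*mat->nb[1]);     // start at row 2
// After this change, rows->view_src == mat and rows->view_offs == 2*mat->nb[1].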
@@ -6401,24 +6594,14 @@ struct ggml_tensor * ggml_view_3d(
6401
6594
  size_t nb2,
6402
6595
  size_t offset) {
6403
6596
 
6404
- bool is_node = false;
6405
-
6406
- if (a->grad) {
6407
- is_node = true;
6408
- }
6409
-
6410
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6597
+ const int64_t ne[3] = { ne0, ne1, ne2 };
6411
6598
 
6412
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
6599
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
6413
6600
 
6414
6601
  result->nb[1] = nb1;
6415
6602
  result->nb[2] = nb2;
6416
6603
  result->nb[3] = result->nb[2]*ne2;
6417
6604
 
6418
- result->op = GGML_OP_VIEW;
6419
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6420
- result->src[0] = a;
6421
-
6422
6605
  return result;
6423
6606
  }
6424
6607
 
@@ -6436,24 +6619,14 @@ struct ggml_tensor * ggml_view_4d(
6436
6619
  size_t nb3,
6437
6620
  size_t offset) {
6438
6621
 
6439
- bool is_node = false;
6440
-
6441
- if (a->grad) {
6442
- is_node = true;
6443
- }
6444
-
6445
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6622
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6446
6623
 
6447
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
6624
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
6448
6625
 
6449
6626
  result->nb[1] = nb1;
6450
6627
  result->nb[2] = nb2;
6451
6628
  result->nb[3] = nb3;
6452
6629
 
6453
- result->op = GGML_OP_VIEW;
6454
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6455
- result->src[0] = a;
6456
-
6457
6630
  return result;
6458
6631
  }
6459
6632
 
@@ -6640,7 +6813,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6640
6813
 
6641
6814
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6642
6815
 
6643
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6816
+ int32_t params[] = { n_past };
6644
6817
  ggml_set_op_params(result, params, sizeof(params));
6645
6818
 
6646
6819
  result->op = GGML_OP_DIAG_MASK_INF;
@@ -6657,7 +6830,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
6657
6830
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
6658
6831
  }
6659
6832
 
6660
-
6661
6833
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
6662
6834
  struct ggml_context * ctx,
6663
6835
  struct ggml_tensor * a,
@@ -6680,7 +6852,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6680
6852
 
6681
6853
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6682
6854
 
6683
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6855
+ int32_t params[] = { n_past };
6684
6856
  ggml_set_op_params(result, params, sizeof(params));
6685
6857
 
6686
6858
  result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -7097,11 +7269,13 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
7097
7269
  };
7098
7270
 
7099
7271
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7272
+
7273
+ ggml_set_op_params_i32(result, 0, stride);
7274
+
7100
7275
  result->op = GGML_OP_CONV_TRANSPOSE_2D;
7101
7276
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7102
7277
  result->src[0] = a;
7103
7278
  result->src[1] = b;
7104
- result->src[2] = ggml_new_i32(ctx, stride);
7105
7279
 
7106
7280
  return result;
7107
7281
  }
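Several hunks in this release move small scalars out of extra source tensors and into the op_params block of the result tensor: the transposed-convolution stride here (read back later with ggml_get_op_params_i32) and eps for ggml_rms_norm_back a few hunks earlier (packed with memcpy). A toy sketch of the pattern; toy_op_tensor and the 32-byte size are stand-ins, not ggml definitions.

// Sketch of the op_params pattern: per-op scalars ride along with the tensor
// instead of occupying a src[] slot.
#include <stdint.h>
#include <string.h>

#define TOY_MAX_OP_PARAMS 32  // illustrative; ggml has its own GGML_MAX_OP_PARAMS

struct toy_op_tensor { int32_t op_params[TOY_MAX_OP_PARAMS / sizeof(int32_t)]; };

static void set_op_param_i32(struct toy_op_tensor * t, int i, int32_t v) {
    t->op_params[i] = v;                 // e.g. the conv-transpose stride at index 0
}

static void set_op_param_f32(struct toy_op_tensor * t, float v) {
    memcpy(t->op_params, &v, sizeof(v)); // e.g. eps for rms_norm_back
}

static int32_t get_op_param_i32(const struct toy_op_tensor * t, int i) {
    return t->op_params[i];              // read back inside the compute kernel
}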
@@ -9446,6 +9620,8 @@ static void ggml_compute_forward_div_f32(
9446
9620
 
9447
9621
 
9448
9622
  #ifdef GGML_USE_ACCELERATE
9623
+ UNUSED(ggml_vec_div_f32);
9624
+
9449
9625
  vDSP_vdiv(
9450
9626
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
9451
9627
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -10752,7 +10928,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
10752
10928
 
10753
10929
  GGML_TENSOR_BINARY_OP_LOCALS;
10754
10930
 
10755
- const float eps = 1e-6f; // TODO: make this a parameter
10931
+ float eps;
10932
+ memcpy(&eps, dst->op_params, sizeof(float));
10756
10933
 
10757
10934
  // TODO: optimize
10758
10935
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -11930,8 +12107,8 @@ static void ggml_compute_forward_diag_mask_f32(
11930
12107
  const int ith = params->ith;
11931
12108
  const int nth = params->nth;
11932
12109
 
11933
- const int n_past = ((int32_t *) dst->op_params)[0];
11934
- const bool inplace = (bool)((int32_t *) dst->op_params)[1];
12110
+ const int n_past = ((int32_t *) dst->op_params)[0];
12111
+ const bool inplace = src0->data == dst->data;
11935
12112
 
11936
12113
  GGML_ASSERT(n_past >= 0);
11937
12114
 
@@ -12142,6 +12319,7 @@ static void ggml_compute_forward_soft_max_back_f32(
12142
12319
  // dx = J * dy
12143
12320
  // dxk = sum_i(Jki * dyi)
12144
12321
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
12322
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
12145
12323
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
12146
12324
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
12147
12325
  // dxk = -yk * dot(y, dy) + yk*dyk
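The added comment line fills in the intermediate step of the softmax Jacobian contraction. Restated in equation form (the same derivation as the comments, with y = softmax(x) and dy the incoming gradient):

\[
dx_k = \sum_i J_{ki}\,dy_i, \qquad
J_{ki} = \frac{\partial y_k}{\partial x_i} = y_k\,(\delta_{ki} - y_i)
\]
\[
dx_k = \sum_i(-y_k y_i)\,dy_i + y_k y_k\,dy_k + y_k\,dy_k - y_k y_k\,dy_k
     = y_k\Bigl(dy_k - \sum_i y_i\,dy_i\Bigr)
     = y_k\,(dy_k - y\cdot dy)
\]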
@@ -13497,7 +13675,6 @@ static void ggml_compute_forward_conv_transpose_2d(
13497
13675
  const struct ggml_compute_params * params,
13498
13676
  const struct ggml_tensor * src0,
13499
13677
  const struct ggml_tensor * src1,
13500
- const struct ggml_tensor * opt0,
13501
13678
  struct ggml_tensor * dst) {
13502
13679
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13503
13680
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13557,7 +13734,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13557
13734
  return;
13558
13735
  }
13559
13736
 
13560
- const int32_t stride = ((const int32_t*)(opt0->data))[0];
13737
+ const int32_t stride = ggml_get_op_params_i32(dst, 0);
13561
13738
 
13562
13739
  // total patches in dst
13563
13740
  const int np = ne2;
@@ -13570,7 +13747,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13570
13747
  const int ip1 = MIN(ip0 + dp, np);
13571
13748
 
13572
13749
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13573
- ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
13750
+ ggml_fp16_t * const wdata_src = wdata + nk;
13574
13751
 
13575
13752
  for (int i2 = ip0; i2 < ip1; i2++) { // Cout
13576
13753
  float * dst_data = (float *)((char *) dst->data + i2*nb2);
@@ -13582,9 +13759,8 @@ static void ggml_compute_forward_conv_transpose_2d(
13582
13759
  for (int i00 = 0; i00 < ne00; i00++) {
13583
13760
  float v = 0;
13584
13761
  ggml_vec_dot_f16(ne03, &v,
13585
- (ggml_fp16_t *) wdata_src + i1n,
13586
- (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
13587
-
13762
+ wdata_src + i1n,
13763
+ wdata_kernel + i01*ne00*ne03 + i00*ne03);
13588
13764
  dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
13589
13765
  }
13590
13766
  }
@@ -13934,7 +14110,7 @@ static void ggml_compute_forward_flash_attn_f32(
13934
14110
  vvexpf(S, S, &Mup);
13935
14111
  ggml_vec_sum_f32(Mup, &sum, S);
13936
14112
  #else
13937
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14113
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
13938
14114
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13939
14115
 
13940
14116
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13944,9 +14120,13 @@ static void ggml_compute_forward_flash_attn_f32(
13944
14120
  if (SS[j] == -INFINITY) {
13945
14121
  SS[j] = 0.0f;
13946
14122
  } else {
14123
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14124
+ const float val = expf(SS[j] - max);
14125
+ #else
13947
14126
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
13948
14127
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13949
14128
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14129
+ #endif
13950
14130
  sump[j] += (ggml_float)val;
13951
14131
  SS[j] = val;
13952
14132
  }
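For context on what the new GGML_FLASH_ATTN_EXP_FP16 / GGML_CROSS_ENTROPY_EXP_FP16 guards toggle: the previous default rounded the exponent argument to fp16 and looked the result up in a 64K-entry table, while the new default calls expf directly and keeps the table path as an opt-in. A sketch of the idea; the fp16 conversion helpers are assumed (declarations only) and the table is simplified to store floats rather than ggml's fp16 entries.

// Sketch of the fp16 exp-table trick kept behind the *_EXP_FP16 guards:
// exp() is looked up by the 16-bit pattern of the half-precision argument.
#include <math.h>
#include <stdint.h>

extern uint16_t fp32_to_fp16(float x);  // assumed conversion helpers (not defined here)
extern float    fp16_to_fp32(uint16_t x);

static float table_exp[1 << 16];        // filled once at startup

static void init_exp_table(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_exp[i] = expf(fp16_to_fp32((uint16_t) i));
    }
}

static float fast_exp(float x) {
#ifdef USE_EXP_FP16                     // stands in for the GGML_*_EXP_FP16 guards
    return table_exp[fp32_to_fp16(x)];  // one rounding to fp16, then a table lookup
#else
    return expf(x);                     // new default: full-precision expf
#endif
}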
@@ -14524,7 +14704,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
14524
14704
  vvexpf(SM, SM, &Mup);
14525
14705
  ggml_vec_sum_f32(Mup, &sum, SM);
14526
14706
  #else
14527
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14707
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
14528
14708
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
14529
14709
 
14530
14710
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -14535,9 +14715,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
14535
14715
  if (SR[j] == -INFINITY) {
14536
14716
  SW[j] = 0.0f;
14537
14717
  } else {
14718
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14719
+ const float val = expf(SR[j] - max);
14720
+ #else
14538
14721
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
14539
14722
  memcpy(&scvt[j], &s, sizeof(uint16_t));
14540
14723
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14724
+ #endif
14541
14725
  sump[j] += (ggml_float)val;
14542
14726
  SW[j] = val;
14543
14727
  }
@@ -15275,6 +15459,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15275
15459
  const int nc = src0->ne[0];
15276
15460
  const int nr = ggml_nrows(src0);
15277
15461
 
15462
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15463
+
15278
15464
  if (params->type == GGML_TASK_INIT) {
15279
15465
  if (ith == 0) {
15280
15466
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -15286,7 +15472,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15286
15472
  if (ith == 0) {
15287
15473
  float * dp = (float *) dst->data;
15288
15474
  ggml_vec_sum_f32(nth, dp, sums);
15289
- dp[0] *= -1.0f;
15475
+ dp[0] *= -1.0f / (float) nr;
15290
15476
  }
15291
15477
  return;
15292
15478
  }
@@ -15303,7 +15489,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15303
15489
  for (int i1 = ir0; i1 < ir1; i1++) {
15304
15490
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15305
15491
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15306
- float * st = (float *) params->wdata + nth + ith*nc;
15492
+ float * st = ((float *) params->wdata) + nth + ith*nc;
15307
15493
 
15308
15494
  #ifndef NDEBUG
15309
15495
  for (int i = 0; i < nc; ++i) {
@@ -15318,15 +15504,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15318
15504
  float max = -INFINITY;
15319
15505
  ggml_vec_max_f32(nc, &max, s0);
15320
15506
 
15321
- uint16_t scvt;
15507
+ uint16_t scvt; UNUSED(scvt);
15322
15508
  for (int i = 0; i < nc; i++) {
15323
15509
  if (s0[i] == -INFINITY) {
15324
15510
  st[i] = 0.0f;
15325
15511
  } else {
15326
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15512
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15513
+ const float s = s0[i] - max;
15514
+ const float val = expf(s);
15515
+ #else
15327
15516
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15328
15517
  memcpy(&scvt, &s, sizeof(scvt));
15329
15518
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15519
+ #endif
15330
15520
  sum += (ggml_float)val;
15331
15521
  st[i] = val;
15332
15522
  }
@@ -15342,7 +15532,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15342
15532
  ggml_vec_log_f32(nc, st, st);
15343
15533
  ggml_vec_mul_f32(nc, st, st, s1);
15344
15534
 
15345
- ggml_vec_sum_f32(nc, sums + ith, st);
15535
+ float st_sum = 0;
15536
+ ggml_vec_sum_f32(nc, &st_sum, st);
15537
+ sums[ith] += st_sum;
15346
15538
 
15347
15539
  #ifndef NDEBUG
15348
15540
  for (int i = 0; i < nc; ++i) {
@@ -15392,7 +15584,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15392
15584
  return;
15393
15585
  }
15394
15586
 
15395
- const float eps = 1e-9f;
15587
+ const double eps = 1e-9;
15396
15588
 
15397
15589
  // TODO: handle transposed/permuted matrices
15398
15590
  const int64_t nc = src0->ne[0];
@@ -15411,7 +15603,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15411
15603
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
15412
15604
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15413
15605
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15414
- float * sm = (float *) params->wdata + ith*nc;
15415
15606
 
15416
15607
  #ifndef NDEBUG
15417
15608
  for (int i = 0; i < nc; ++i) {
@@ -15420,54 +15611,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15420
15611
  assert(!isnan(s1[i]));
15421
15612
  }
15422
15613
  #endif
15423
- // step by step explanation:
15424
- {
15425
- //float * sums = (float *) params->wdata;
15426
-
15427
- // forward pass with annotated gradients from backward pass
15428
- // (built by going in reverse operation order, adding to gradients of current operation args)
15429
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
15430
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15431
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
15432
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
15433
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
15434
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
15435
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
15436
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
15437
-
15438
- // substitute into grad[st1], because we can reuse softmax_back from this point on
15439
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
15440
- // postorder:
15441
- // grad[st1] := softmax(s0)
15442
- // grad[st1] := grad[st1]*(1.0 - eps)
15443
- // grad[st1] := grad[st1] + eps
15444
- // grad[st1] := s1 / grad[st1]
15445
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
15446
-
15447
- // src0 gradients by going through softmax_back
15448
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15449
- // from softmax_back:
15450
- // dxk = yk * (dyk - dot(y, dy))
15451
- // dot_y_dy := dot(y, dy)
15452
- // dx := dy
15453
- // dx := dx - dot_y_dy
15454
- // dx := dx * y
15455
- // postorder:
15456
- // dot_st1_dst1 := dot(st1, grad[st1])
15457
- // grad[s0] := grad[st1]
15458
- // grad[s0] := grad[s0] - dot_st1_dst1
15459
- // grad[s0] := grad[s0] * st1
15460
-
15461
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
15462
- // sm := softmax(s0)
15463
- // grad[s0] := sm*(1.0 - eps)
15464
- // grad[s0] := grad[s0] + eps
15465
- // grad[s0] := s1 / grad[s0]
15466
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
15467
- // dot_st1_dst1 := dot(sm, grad[s0])
15468
- // grad[s0] := grad[s0] - dot_st1_dst1
15469
- // grad[s0] := grad[s0] * sm
15470
- }
15471
15614
 
15472
15615
  // soft_max
15473
15616
  ggml_float sum = 0.0;
@@ -15475,39 +15618,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15475
15618
  float max = -INFINITY;
15476
15619
  ggml_vec_max_f32(nc, &max, s0);
15477
15620
 
15478
- uint16_t scvt;
15621
+ uint16_t scvt; UNUSED(scvt);
15479
15622
  for (int i = 0; i < nc; i++) {
15480
15623
  if (s0[i] == -INFINITY) {
15481
- sm[i] = 0.0f;
15624
+ ds0[i] = 0.0f;
15482
15625
  } else {
15483
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15626
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15627
+ const float s = s0[i] - max;
15628
+ const float val = expf(s);
15629
+ #else
15484
15630
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15485
15631
  memcpy(&scvt, &s, sizeof(scvt));
15486
15632
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15633
+ #endif
15487
15634
  sum += (ggml_float)val;
15488
- sm[i] = val;
15635
+ ds0[i] = val;
15489
15636
  }
15490
15637
  }
15491
15638
 
15492
15639
  assert(sum > 0.0);
15493
- sum = 1.0/sum;
15640
+ sum = (1.0 - eps)/sum;
15494
15641
  }
15495
15642
 
15496
- float dot_st1_dst1 = 0;
15497
- ggml_vec_scale_f32(nc, sm, sum);
15498
- ggml_vec_cpy_f32 (nc, ds0, sm);
15499
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
15500
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
15501
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
15502
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
15503
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
15504
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
15505
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
15643
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
15644
+ ggml_vec_scale_f32(nc, ds0, sum);
15645
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
15646
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
15647
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
15648
+
15506
15649
 
15507
15650
  #ifndef NDEBUG
15508
15651
  for (int i = 0; i < nc; ++i) {
15509
- assert(!isnan(sm[i]));
15510
- assert(!isinf(sm[i]));
15511
15652
  assert(!isnan(ds0[i]));
15512
15653
  assert(!isinf(ds0[i]));
15513
15654
  }
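In equation form, the rewritten backward pass above is the standard softmax-cross-entropy gradient (per the added comment), written directly into ds0 instead of a scratch buffer. d is the incoming gradient of the scalar loss, nr the number of rows (the forward pass now also averages the loss over nr), and eps the same small smoothing constant used under the log:

\[
\frac{\partial L}{\partial s_{0,i}}
  = \frac{d}{nr}\,\Bigl((1-\varepsilon)\,\mathrm{softmax}(s_0)_i + \varepsilon - s_{1,i}\Bigr)
\]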
@@ -15731,7 +15872,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15731
15872
  } break;
15732
15873
  case GGML_OP_CONV_TRANSPOSE_2D:
15733
15874
  {
15734
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15875
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
15735
15876
  } break;
15736
15877
  case GGML_OP_POOL_1D:
15737
15878
  {
@@ -16062,9 +16203,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16062
16203
  {
16063
16204
  // necessary for llama
16064
16205
  if (src0->grad) {
16206
+ float eps;
16207
+ memcpy(&eps, tensor->op_params, sizeof(float));
16208
+
16065
16209
  src0->grad = ggml_add_impl(ctx,
16066
16210
  src0->grad,
16067
- ggml_rms_norm_back(ctx, src0, tensor->grad),
16211
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
16068
16212
  inplace);
16069
16213
  }
16070
16214
  } break;
@@ -16832,9 +16976,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16832
16976
  return result;
16833
16977
  }
16834
16978
 
16835
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
16836
- struct ggml_cgraph result = *gf;
16837
-
16979
+ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
16838
16980
  GGML_ASSERT(gf->n_nodes > 0);
16839
16981
 
16840
16982
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16858,15 +17000,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16858
17000
  }
16859
17001
  }
16860
17002
 
16861
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
17003
+ for (int i = 0; i < gf->n_nodes; i++) {
16862
17004
  struct ggml_tensor * node = gf->nodes[i];
16863
17005
 
16864
17006
  if (node->is_param) {
16865
17007
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16866
- ggml_build_forward_expand(&result, node->grad);
17008
+ ggml_build_forward_expand(gb, node->grad);
16867
17009
  }
16868
17010
  }
17011
+ }
16869
17012
 
17013
+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
17014
+ struct ggml_cgraph result = *gf;
17015
+ ggml_build_backward_expand(ctx, gf, &result, keep);
16870
17016
  return result;
16871
17017
  }
16872
17018
 
@@ -17542,10 +17688,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
17542
17688
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
17543
17689
  {
17544
17690
  n_tasks = n_threads;
17545
-
17546
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
17547
-
17548
- work_size = MAX(work_size, cur);
17549
17691
  } break;
17550
17692
  case GGML_OP_NONE:
17551
17693
  {
@@ -18423,14 +18565,16 @@ static enum ggml_opt_result ggml_opt_adam(
18423
18565
  struct ggml_opt_params params,
18424
18566
  struct ggml_tensor * f,
18425
18567
  struct ggml_cgraph * gf,
18426
- struct ggml_cgraph * gb) {
18568
+ struct ggml_cgraph * gb,
18569
+ ggml_opt_callback callback,
18570
+ void * callback_data) {
18427
18571
  GGML_ASSERT(ggml_is_scalar(f));
18428
18572
 
18429
18573
  // these will store the parameters we want to optimize
18430
18574
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
18431
18575
 
18432
18576
  int np = 0;
18433
- int nx = 0;
18577
+ int64_t nx = 0;
18434
18578
  for (int i = 0; i < gf->n_nodes; ++i) {
18435
18579
  if (gf->nodes[i]->is_param) {
18436
18580
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -18449,31 +18593,32 @@ static enum ggml_opt_result ggml_opt_adam(
18449
18593
  }
18450
18594
 
18451
18595
  // constants
18452
- const float sched = params.adam.sched;
18453
- const float decay = params.adam.decay * sched;
18454
- const float alpha = params.adam.alpha * sched;
18596
+ float sched = params.adam.sched;
18597
+ const float alpha = params.adam.alpha;
18598
+ const float decay = params.adam.decay * alpha;
18455
18599
  const float beta1 = params.adam.beta1;
18456
18600
  const float beta2 = params.adam.beta2;
18457
18601
  const float eps = params.adam.eps;
18602
+ const float gclip = params.adam.gclip;
18603
+ const int decay_min_ndim = params.adam.decay_min_ndim;
18458
18604
 
18459
- float * x = opt->adam.x->data; // view of the parameters
18460
- float * g1 = opt->adam.g1->data; // gradient
18461
- float * g2 = opt->adam.g2->data; // gradient squared
18462
18605
  float * m = opt->adam.m->data; // first moment
18463
18606
  float * v = opt->adam.v->data; // second moment
18464
- float * mh = opt->adam.mh->data; // first moment hat
18465
- float * vh = opt->adam.vh->data; // second moment hat
18466
18607
 
18467
18608
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18468
18609
 
18469
- // update view
18470
- ggml_opt_get_params(np, ps, x);
18610
+ if (callback) {
18611
+ callback(callback_data, &sched);
18612
+ }
18471
18613
 
18472
18614
  // compute the function value
18473
18615
  ggml_graph_reset (gf);
18474
18616
  ggml_set_f32 (f->grad, 1.0f);
18475
18617
 
18476
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18618
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18619
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18620
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18621
+ ggml_graph_compute(gb, &cplan);
18477
18622
 
18478
18623
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
18479
18624
  opt->adam.fx_best = opt->adam.fx_prev;
@@ -18481,6 +18626,9 @@ static enum ggml_opt_result ggml_opt_adam(
  pf[opt->iter % params.past] = opt->adam.fx_prev;
  }

+ opt->loss_before = opt->adam.fx_prev;
+ opt->loss_after = opt->adam.fx_prev;
+
  // initialize
  if (opt->just_initialized) {
  opt->adam.n_no_improvement = 0;
@@ -18513,50 +18661,55 @@ static enum ggml_opt_result ggml_opt_adam(
  UNUSED(t_start_cpu);

  {
- // update the gradient
- ggml_opt_get_grad(np, ps, g1);
-
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
- ggml_vec_scale_f32(nx, m, beta1);
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
-
- // g2 = g1^2
- ggml_vec_sqr_f32 (nx, g2, g1);
-
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
- ggml_vec_scale_f32(nx, v, beta2);
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
-
- // m^hat = m_t / (1 - beta1^t)
- // v^hat = v_t / (1 - beta2^t)
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
- ggml_vec_cpy_f32 (nx, mh, m);
- ggml_vec_cpy_f32 (nx, vh, v);
-
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
-
- ggml_vec_sqrt_f32 (nx, vh, vh);
- ggml_vec_acc1_f32 (nx, vh, eps);
-
- ggml_vec_div_f32 (nx, mh, mh, vh);
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
- ggml_vec_sub_f32 (nx, x, x, mh);
+ float gnorm = 1.0f;
+ if (gclip > 0.0f) {
+ // gradient clipping
+ ggml_float sum = 0.0;
+ for (int p = 0; p < np; ++p) {
+ const int64_t ne = ggml_nelements(ps[p]);
+ for (int64_t j = 0; j < ne; ++j) {
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
+ sum += (ggml_float)(g*g);
+ }
+ }
+ ggml_float norm = sqrt(sum);
+ if (norm > (ggml_float) gclip) {
+ gnorm = (float) ((ggml_float) gclip / norm);
+ }
+ }
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
+ int64_t i = 0;
+ for (int p = 0; p < np; ++p) {
+ const int64_t ne = ggml_nelements(ps[p]);
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+ for (int64_t j = 0; j < ne; ++j) {
+ float x = ggml_get_f32_1d(ps[p], j);
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
+ float mh = m[i]*beta1h;
+ float vh = v[i]*beta2h;
+ vh = sqrtf(vh) + eps;
+ x = x*(1.0f - p_decay) - mh/vh;
+ ggml_set_f32_1d(ps[p], j, x);
+ ++i;
+ }
+ }
+ }

- // update the parameters
- ggml_opt_set_params(np, ps, x);
+ if (callback) {
+ callback(callback_data, &sched);
  }

  ggml_graph_reset (gf);
  ggml_set_f32 (f->grad, 1.0f);

- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
+ ggml_graph_compute(gb, &cplan);

  const float fx = ggml_get_f32_1d(f, 0);
+ opt->loss_after = fx;
+

  // check convergence
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
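The block of ggml_vec_* calls above is replaced by a single scalar loop that folds in bias correction, optional global gradient clipping by L2 norm, and AdamW-style decoupled weight decay applied only to tensors with at least decay_min_ndim dimensions. The following self-contained restatement of that per-element update on plain float arrays may help read the diff; the function name and the flat-array layout are illustrative, not ggml API.

    #include <math.h>
    #include <stdint.h>

    // One Adam step over n parameters, mirroring the new loop above.
    static void adamw_step(int64_t n, float * x, const float * grad,
                           float * m, float * v, int iter,
                           float alpha, float beta1, float beta2, float eps,
                           float decay, float gclip, float sched) {
        // optional global gradient clipping by L2 norm
        float gnorm = 1.0f;
        if (gclip > 0.0f) {
            double sum = 0.0;
            for (int64_t i = 0; i < n; ++i) {
                sum += (double) grad[i]*grad[i];
            }
            const double norm = sqrt(sum);
            if (norm > (double) gclip) {
                gnorm = (float) (gclip / norm);
            }
        }

        const float beta1h = alpha*sched/(1.0f - powf(beta1, iter));  // bias-corrected step size
        const float beta2h =       1.0f/(1.0f - powf(beta2, iter));

        for (int64_t i = 0; i < n; ++i) {
            const float g = grad[i]*gnorm;
            m[i] = m[i]*beta1 + g*(1.0f - beta1);
            v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
            const float mh = m[i]*beta1h;
            const float vh = sqrtf(v[i]*beta2h) + eps;
            x[i] = x[i]*(1.0f - decay*sched) - mh/vh;  // decoupled weight decay, then the Adam step
        }
    }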
@@ -18625,7 +18778,6 @@ struct ggml_lbfgs_iteration_data {
  };

  static enum ggml_opt_result linesearch_backtracking(
- struct ggml_context * ctx,
  const struct ggml_opt_params * params,
  int nx,
  float * x,
@@ -18637,8 +18789,11 @@ static enum ggml_opt_result linesearch_backtracking(
  struct ggml_tensor * f,
  struct ggml_cgraph * gf,
  struct ggml_cgraph * gb,
+ struct ggml_cplan * cplan,
  const int np,
- struct ggml_tensor * ps[]) {
+ struct ggml_tensor * ps[],
+ ggml_opt_callback callback,
+ void * callback_data) {
  int count = 0;

  float width = 0.0f;
@@ -18667,6 +18822,12 @@ static enum ggml_opt_result linesearch_backtracking(
  dgtest = params->lbfgs.ftol*dginit;

  while (true) {
+ if (callback) {
+ // LBFG-S does not support learning rate -> ignore learning schedule
+ float sched = 0;
+ callback(callback_data, &sched);
+ }
+
  ggml_vec_cpy_f32(nx, x, xp);
  ggml_vec_mad_f32(nx, x, d, *step);

@@ -18677,7 +18838,7 @@ static enum ggml_opt_result linesearch_backtracking(
  ggml_graph_reset (gf);
  ggml_set_f32 (f->grad, 1.0f);

- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
+ ggml_graph_compute(gb, cplan);

  ggml_opt_get_grad(np, ps, g);

@@ -18737,7 +18898,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  struct ggml_opt_params params,
  struct ggml_tensor * f,
  struct ggml_cgraph * gf,
- struct ggml_cgraph * gb) {
+ struct ggml_cgraph * gb,
+ ggml_opt_callback callback,
+ void * callback_data) {
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -18769,6 +18932,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  opt->iter = iter;
  }

+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+
  float * x = opt->lbfgs.x->data; // current parameters
  float * xp = opt->lbfgs.xp->data; // previous parameters
  float * g = opt->lbfgs.g->data; // current gradient
@@ -18790,6 +18957,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  float * lm_s = opt->lbfgs.lms->data;
  float * lm_y = opt->lbfgs.lmy->data;

+ if (callback) {
+ // LBFG-S does not support learning rate -> ignore learning schedule
+ float sched = 0;
+ callback(callback_data, &sched);
+ }
+
  // evaluate the function value and its gradient
  {
  ggml_opt_set_params(np, ps, x);
@@ -18797,11 +18970,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  ggml_graph_reset (gf);
  ggml_set_f32 (f->grad, 1.0f);

- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
+ ggml_graph_compute(gb, &cplan);

  ggml_opt_get_grad(np, ps, g);

  fx = ggml_get_f32_1d(f, 0);
+
+ opt->loss_before = fx;
+ opt->loss_after = fx;
  }

  // search direction = -gradient
@@ -18856,7 +19032,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  ggml_vec_cpy_f32(nx, xp, x);
  ggml_vec_cpy_f32(nx, gp, g);

- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);

  if (ls < 0) {
  // linesearch failed - go back to the previous point and return
@@ -18866,6 +19042,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  return ls;
  }

+ opt->loss_after = fx;
+
  ggml_vec_norm_f32(nx, &xnorm, x);
  ggml_vec_norm_f32(nx, &gnorm, g);

@@ -18923,7 +19101,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  // ys = y^t \cdot s -> 1 / \rho.
  // yy = y^t \cdot y.
  //
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);

  lm_ys[end[0]] = ys;
@@ -18986,13 +19164,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
  .adam = {
  .n_iter = 10000,
  .sched = 1.000f,
- .decay = 0.001f,
+ .decay = 0.0f,
+ .decay_min_ndim = 2,
  .alpha = 0.001f,
  .beta1 = 0.9f,
  .beta2 = 0.999f,
  .eps = 1e-8f,
  .eps_f = 1e-5f,
  .eps_g = 1e-3f,
+ .gclip = 0.0f,
  },
  };
  } break;
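Weight decay is now off by default (decay = 0.0f) and the new gclip and decay_min_ndim fields default to no clipping and matrices-only decay, so callers that want regularization have to opt in. A hedged sketch of how that might look; ctx and loss are assumed to be a ggml context and a scalar loss tensor whose parameters were registered with ggml_set_param:

    #include "ggml.h"

    static enum ggml_opt_result run_adam(struct ggml_context * ctx, struct ggml_tensor * loss) {
        struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);

        params.adam.n_iter         = 100;    // far fewer iterations than the 10000 default
        params.adam.gclip          = 1.0f;   // clip gradients to unit L2 norm (0.0f disables clipping)
        params.adam.decay          = 0.01f;  // AdamW weight decay, scaled internally by alpha
        params.adam.decay_min_ndim = 2;      // only decay tensors with at least two dimensions

        return ggml_opt(ctx, params, loss);
    }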
@@ -19042,23 +19222,13 @@ GGML_API void ggml_opt_init(
  switch (opt->params.type) {
  case GGML_OPT_ADAM:
  {
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
  opt->adam.pf = params.past > 0
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
  : NULL;
- ggml_set_zero(opt->adam.x);
- ggml_set_zero(opt->adam.g1);
- ggml_set_zero(opt->adam.g2);
  ggml_set_zero(opt->adam.m);
  ggml_set_zero(opt->adam.v);
- ggml_set_zero(opt->adam.mh);
- ggml_set_zero(opt->adam.vh);
  if (opt->adam.pf) {
  ggml_set_zero(opt->adam.pf);
  }
@@ -19142,7 +19312,7 @@ enum ggml_opt_result ggml_opt_resume(
  *gf = ggml_build_forward (f);
  *gb = ggml_build_backward(ctx, gf, true);

- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
  }

  enum ggml_opt_result ggml_opt_resume_g(
@@ -19150,7 +19320,9 @@ enum ggml_opt_result ggml_opt_resume_g(
  struct ggml_opt_context * opt,
  struct ggml_tensor * f,
  struct ggml_cgraph * gf,
- struct ggml_cgraph * gb) {
+ struct ggml_cgraph * gb,
+ ggml_opt_callback callback,
+ void * callback_data) {

  // build forward + backward compute graphs
  enum ggml_opt_result result = GGML_OPT_OK;
@@ -19158,11 +19330,11 @@ enum ggml_opt_result ggml_opt_resume_g(
  switch (opt->params.type) {
  case GGML_OPT_ADAM:
  {
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
  } break;
  case GGML_OPT_LBFGS:
  {
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
  } break;
  }
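ggml_opt_resume() keeps its old behavior by forwarding NULL for both new arguments; to actually receive the per-iteration hook, a caller goes through ggml_opt_resume_g() directly. A small sketch, reusing the illustrative my_opt_callback and my_sched_state from earlier; opt, f, gf and gb are assumed to have been prepared with ggml_opt_init, ggml_build_forward and ggml_build_backward:

    static enum ggml_opt_result resume_with_schedule(
            struct ggml_context     * ctx,
            struct ggml_opt_context * opt,
            struct ggml_tensor      * f,
            struct ggml_cgraph      * gf,
            struct ggml_cgraph      * gb) {
        struct my_sched_state st = { /*.step =*/ 0, /*.decay_per_call =*/ 0.999f };

        // passing NULL, NULL instead reproduces the pre-callback behavior
        return ggml_opt_resume_g(ctx, opt, f, gf, gb, my_opt_callback, &st);
    }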
 
@@ -19394,7 +19566,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
  ////////////////////////////////////////////////////////////////////////////////

  struct gguf_str {
- uint32_t n;
+ uint64_t n; // GGUFv2
  char * data;
  };

@@ -19408,9 +19580,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
  [GGUF_TYPE_FLOAT32] = sizeof(float),
  [GGUF_TYPE_BOOL] = sizeof(bool),
  [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
+ [GGUF_TYPE_UINT64] = sizeof(uint64_t),
+ [GGUF_TYPE_INT64] = sizeof(int64_t),
+ [GGUF_TYPE_FLOAT64] = sizeof(double),
  [GGUF_TYPE_ARRAY] = 0, // undefined
  };
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

  static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
  [GGUF_TYPE_UINT8] = "u8",
@@ -19423,8 +19598,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
  [GGUF_TYPE_BOOL] = "bool",
  [GGUF_TYPE_STRING] = "str",
  [GGUF_TYPE_ARRAY] = "arr",
+ [GGUF_TYPE_UINT64] = "u64",
+ [GGUF_TYPE_INT64] = "i64",
+ [GGUF_TYPE_FLOAT64] = "f64",
  };
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

  union gguf_value {
  uint8_t uint8;
@@ -19434,6 +19612,9 @@ union gguf_value {
  uint32_t uint32;
  int32_t int32;
  float float32;
+ uint64_t uint64;
+ int64_t int64;
+ double float64;
  bool bool_;

  struct gguf_str str;
@@ -19441,7 +19622,7 @@ union gguf_value {
  struct {
  enum gguf_type type;

- uint32_t n;
+ uint64_t n; // GGUFv2
  void * data;
  } arr;
  };
@@ -19449,8 +19630,6 @@ union gguf_value {
  struct gguf_kv {
  struct gguf_str key;

- uint32_t n_bytes; // TODO: is this actually needed?
-
  enum gguf_type type;
  union gguf_value value;
  };
@@ -19458,15 +19637,15 @@ struct gguf_kv {
  struct gguf_header {
  uint32_t magic;
  uint32_t version;
- uint32_t n_tensors;
- uint32_t n_kv;
+ uint64_t n_tensors; // GGUFv2
+ uint64_t n_kv; // GGUFv2
  };

  struct gguf_tensor_info {
  struct gguf_str name;

  uint32_t n_dims;
- uint32_t ne[GGML_MAX_DIMS];
+ uint64_t ne[GGML_MAX_DIMS];

  enum ggml_type type;

@@ -19497,19 +19676,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
  return n == size;
  }

- static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
  p->n = 0;
  p->data = NULL;

  bool ok = true;

- // TODO: how to avoid mallocs for strings?
  ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
  ok = ok && gguf_fread_el(file, p->data, p->n, offset);

  return ok;
  }

+ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
+ p->n = 0;
+ p->data = NULL;
+
+ bool ok = true;
+
+ uint32_t n = 0;
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
+
+ return ok;
+ }
+
  struct gguf_context * gguf_init_empty(void) {
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

@@ -19565,8 +19757,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  ctx->data = NULL;

  ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
- ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
- ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
+
+ if (ctx->header.version == 1) {
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+ uint32_t n_tensors = 0;
+ uint32_t n_kv = 0;
+
+ ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
+ ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
+
+ ctx->header.n_tensors = n_tensors;
+ ctx->header.n_kv = n_kv;
+ } else {
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
+ }

  if (!ok) {
  fprintf(stderr, "%s: failed to read header\n", __func__);
@@ -19576,18 +19781,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  }
  }

+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+ bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
+ if (ctx->header.version == 1) {
+ gguf_fread_str = gguf_fread_str_v1;
+ }
+
  // read the kv pairs
  {
- ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

  for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
  struct gguf_kv * kv = &ctx->kv[i];

  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);

- ok = ok && gguf_fread_str(file, &kv->key, &offset);
- //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
- ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);

  //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);

@@ -19599,12 +19809,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
  case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
  case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
+ case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
+ case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
+ case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
  case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
  case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
  case GGUF_TYPE_ARRAY:
  {
  ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
+
+ if (ctx->header.version == 1) {
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+ uint32_t n = 0;
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
+ kv->value.arr.n = n;
+ } else {
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
+ }

  switch (kv->value.arr.type) {
  case GGUF_TYPE_UINT8:
@@ -19614,6 +19835,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  case GGUF_TYPE_UINT32:
  case GGUF_TYPE_INT32:
  case GGUF_TYPE_FLOAT32:
+ case GGUF_TYPE_UINT64:
+ case GGUF_TYPE_INT64:
+ case GGUF_TYPE_FLOAT64:
  case GGUF_TYPE_BOOL:
  {
  kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -19648,7 +19872,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

  // read the tensor infos
  {
- ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

  for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19660,7 +19884,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  ok = ok && gguf_fread_str(file, &info->name, &offset);
  ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
  for (uint32_t j = 0; j < info->n_dims; ++j) {
- ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+ if (ctx->header.version == 1) {
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+ uint32_t t = 0;
+ ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
+ info->ne[j] = t;
+ } else {
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+ }
  }
  ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
  ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
@@ -19744,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

  struct ggml_tensor * data = NULL;

- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
  data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

  ok = ok && data != NULL;
@@ -19785,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  }

  // point the data member to the appropriate location in the binary blob using the tensor infos
- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
  //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
  cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
  }
@@ -19842,7 +20073,7 @@ void gguf_free(struct gguf_context * ctx) {
  }
  }

- GGML_ALIGNED_FREE(ctx->kv);
+ free(ctx->kv);
  }

  if (ctx->infos) {
@@ -19854,7 +20085,7 @@ void gguf_free(struct gguf_context * ctx) {
  }
  }

- GGML_ALIGNED_FREE(ctx->infos);
+ free(ctx->infos);
  }

  GGML_ALIGNED_FREE(ctx);
@@ -19954,6 +20185,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.float32;
  }

+ uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+ return ctx->kv[i].value.uint64;
+ }
+
+ int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+ return ctx->kv[i].value.int64;
+ }
+
+ double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+ return ctx->kv[i].value.float64;
+ }
+
  bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.bool_;
  }
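The new getters expose the GGUFv2 64-bit scalar types through the public API. A hedged usage sketch; the file name and key are placeholders, and it relies on gguf_init_params with no_alloc, gguf_find_key and gguf_get_kv_type, which already exist elsewhere in this API:

    #include <stdbool.h>
    #include <stdint.h>
    #include "ggml.h"

    // Returns 0 and stores the value in *out on success, -1 otherwise.
    static int read_u64_kv(const char * fname, const char * key, uint64_t * out) {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };

        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (ctx == NULL) {
            return -1;
        }

        int ret = -1;
        const int i = gguf_find_key(ctx, key);
        if (i >= 0 && gguf_get_kv_type(ctx, i) == GGUF_TYPE_UINT64) {
            *out = gguf_get_val_u64(ctx, i);
            ret = 0;
        }

        gguf_free(ctx);
        return ret;
    }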
@@ -20000,7 +20243,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
  const int n_kv = gguf_get_n_kv(ctx);

  ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
- ctx->kv[n_kv].key.n = strlen(key) + 1;
+ ctx->kv[n_kv].key.n = strlen(key);
  ctx->kv[n_kv].key.data = strdup(key);
  ctx->header.n_kv++;

@@ -20056,6 +20299,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
  ctx->kv[idx].value.float32 = val;
  }

+ void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
+ const int idx = gguf_get_or_add_key(ctx, key);
+
+ ctx->kv[idx].type = GGUF_TYPE_UINT64;
+ ctx->kv[idx].value.uint64 = val;
+ }
+
+ void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
+ const int idx = gguf_get_or_add_key(ctx, key);
+
+ ctx->kv[idx].type = GGUF_TYPE_INT64;
+ ctx->kv[idx].value.int64 = val;
+ }
+
+ void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
+ const int idx = gguf_get_or_add_key(ctx, key);
+
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
+ ctx->kv[idx].value.float64 = val;
+ }
+
  void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
  const int idx = gguf_get_or_add_key(ctx, key);

@@ -20067,7 +20331,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
  const int idx = gguf_get_or_add_key(ctx, key);

  ctx->kv[idx].type = GGUF_TYPE_STRING;
- ctx->kv[idx].value.str.n = strlen(val) + 1;
+ ctx->kv[idx].value.str.n = strlen(val);
  ctx->kv[idx].value.str.data = strdup(val);
  }
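The matching setters mirror the existing 32-bit ones, and string lengths stored in gguf_str no longer count the trailing NUL. A hedged sketch of writing the new value types out; the key names and output path are placeholders, and gguf_write_to_file(ctx, fname, only_meta) is assumed to be the writer that accompanies this API:

    #include "ggml.h"

    static void write_example_kv(const char * fname) {
        struct gguf_context * ctx = gguf_init_empty();

        gguf_set_val_u64(ctx, "example.token_count", 1234567890123ULL);
        gguf_set_val_i64(ctx, "example.offset",      -42);
        gguf_set_val_f64(ctx, "example.scale",       0.125);
        gguf_set_val_str(ctx, "example.note",        "GGUFv2 sample");

        gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
        gguf_free(ctx);
    }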
 
@@ -20090,7 +20354,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
  ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
  for (int i = 0; i < n; i++) {
  struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
- str->n = strlen(data[i]) + 1;
+ str->n = strlen(data[i]);
  str->data = strdup(data[i]);
  }
  }
@@ -20106,6 +20370,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
  case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
  case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
  case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
+ case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
+ case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
+ case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
  case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
  case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
  case GGUF_TYPE_ARRAY:
@@ -20134,7 +20401,7 @@ void gguf_add_tensor(
  const int idx = ctx->header.n_tensors;
  ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));

- ctx->infos[idx].name.n = strlen(tensor->name) + 1;
+ ctx->infos[idx].name.n = strlen(tensor->name);
  ctx->infos[idx].name.data = strdup(tensor->name);

  for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -20267,6 +20534,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
  case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
  case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
  case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+ case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
+ case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
+ case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
  case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
  case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
  case GGUF_TYPE_ARRAY:
@@ -20282,6 +20552,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
  case GGUF_TYPE_UINT32:
  case GGUF_TYPE_INT32:
  case GGUF_TYPE_FLOAT32:
+ case GGUF_TYPE_UINT64:
+ case GGUF_TYPE_INT64:
+ case GGUF_TYPE_FLOAT64:
  case GGUF_TYPE_BOOL:
  {
  gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -20516,6 +20789,14 @@ int ggml_cpu_has_sse3(void) {
  #endif
  }

+ int ggml_cpu_has_ssse3(void) {
+ #if defined(__SSSE3__)
+ return 1;
+ #else
+ return 0;
+ #endif
+ }
+
  int ggml_cpu_has_vsx(void) {
  #if defined(__POWER9_VECTOR__)
  return 1;
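The new ggml_cpu_has_ssse3() probe sits alongside the existing feature checks and is queried the same way; a tiny sketch:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        printf("SSE3  = %d\n", ggml_cpu_has_sse3());
        printf("SSSE3 = %d\n", ggml_cpu_has_ssse3());
        printf("VSX   = %d\n", ggml_cpu_has_vsx());
        return 0;
    }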