llama_cpp 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
123
123
  #define GGML_GELU_FP16
124
124
  #define GGML_GELU_QUICK_FP16
125
125
  #define GGML_SILU_FP16
126
+ // #define GGML_CROSS_ENTROPY_EXP_FP16
127
+ // #define GGML_FLASH_ATTN_EXP_FP16
126
128
 
127
129
  #define GGML_SOFT_MAX_UNROLL 4
128
130
  #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +159,6 @@ typedef void * thread_ret_t;
157
159
  //#define GGML_SOFT_MAX_ACCELERATE
158
160
  #endif
159
161
 
160
- #if UINTPTR_MAX == 0xFFFFFFFF
161
- #define GGML_MEM_ALIGN 4
162
- #else
163
- #define GGML_MEM_ALIGN 16
164
- #endif
165
-
166
162
  //
167
163
  // logging
168
164
  //
@@ -192,8 +188,8 @@ typedef void * thread_ret_t;
192
188
  //
193
189
 
194
190
  #if defined(_MSC_VER) || defined(__MINGW32__)
195
- #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
196
- #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
191
+ #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
192
+ #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
197
193
  #else
198
194
  inline static void * ggml_aligned_malloc(size_t size) {
199
195
  void * aligned_memory = NULL;
@@ -218,8 +214,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
218
214
  }
219
215
  return aligned_memory;
220
216
  }
221
- #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
222
- #define GGML_ALIGNED_FREE(ptr) free(ptr)
217
+ #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
218
+ #define GGML_ALIGNED_FREE(ptr) free(ptr)
223
219
  #endif
224
220
 
225
221
  #define UNUSED GGML_UNUSED
@@ -305,6 +301,10 @@ typedef double ggml_float;
305
301
  #endif
306
302
  #endif
307
303
 
304
+ #ifdef __riscv_v_intrinsic
305
+ #include <riscv_vector.h>
306
+ #endif
307
+
308
308
  #ifdef __F16C__
309
309
 
310
310
  #ifdef _MSC_VER
@@ -2436,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2436
2436
  const int nb = n / qk;
2437
2437
 
2438
2438
  assert(n % qk == 0);
2439
- assert(nb % 2 == 0);
2440
2439
 
2441
2440
  const block_q4_0 * restrict x = vx;
2442
2441
  const block_q8_0 * restrict y = vy;
@@ -2445,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2445
2444
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
2446
2445
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
2447
2446
 
2447
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2448
2448
  for (int i = 0; i < nb; i += 2) {
2449
2449
  const block_q4_0 * restrict x0 = &x[i + 0];
2450
2450
  const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2623,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2623
2623
  }
2624
2624
 
2625
2625
  // Main loop
2626
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2626
2627
  for (int i = 2; i < nb; i+=2) {
2627
2628
  _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
2628
2629
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2680,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2680
2681
  }
2681
2682
 
2682
2683
  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
2684
+ #elif defined(__riscv_v_intrinsic)
2685
+ float sumf = 0.0;
2686
+
2687
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2688
+
2689
+ for (int i = 0; i < nb; i++) {
2690
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2691
+
2692
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2693
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2694
+
2695
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2696
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2697
+
2698
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2699
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2700
+
2701
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
2702
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
2703
+
2704
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2705
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2706
+
2707
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2708
+
2709
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2710
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2711
+
2712
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2713
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2714
+
2715
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2716
+ }
2717
+
2718
+ *s = sumf;
2683
2719
  #else
2684
2720
  // scalar
2685
2721
  float sumf = 0.0;
@@ -2706,7 +2742,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2706
2742
  const int nb = n / qk;
2707
2743
 
2708
2744
  assert(n % qk == 0);
2709
- assert(nb % 2 == 0);
2710
2745
 
2711
2746
  const block_q4_1 * restrict x = vx;
2712
2747
  const block_q8_1 * restrict y = vy;
@@ -2718,6 +2753,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2718
2753
 
2719
2754
  float summs = 0;
2720
2755
 
2756
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2721
2757
  for (int i = 0; i < nb; i += 2) {
2722
2758
  const block_q4_1 * restrict x0 = &x[i + 0];
2723
2759
  const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2806,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2806
2842
  }
2807
2843
 
2808
2844
  *s = hsum_float_8(acc) + summs;
2845
+ #elif defined(__riscv_v_intrinsic)
2846
+ float sumf = 0.0;
2847
+
2848
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2849
+
2850
+ for (int i = 0; i < nb; i++) {
2851
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2852
+
2853
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2854
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2855
+
2856
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2857
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2858
+
2859
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2860
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2861
+
2862
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2863
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2864
+
2865
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2866
+
2867
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2868
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2869
+
2870
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2871
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2872
+
2873
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2874
+ }
2875
+
2876
+ *s = sumf;
2809
2877
  #else
2810
2878
  // scalar
2811
2879
  float sumf = 0.0;
@@ -2832,7 +2900,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2832
2900
  const int nb = n / qk;
2833
2901
 
2834
2902
  assert(n % qk == 0);
2835
- assert(nb % 2 == 0);
2836
2903
  assert(qk == QK5_0);
2837
2904
 
2838
2905
  const block_q5_0 * restrict x = vx;
@@ -2848,6 +2915,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2848
2915
  uint64_t tmp0[4];
2849
2916
  uint64_t tmp1[4];
2850
2917
 
2918
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2851
2919
  for (int i = 0; i < nb; i += 2) {
2852
2920
  const block_q5_0 * restrict x0 = &x[i];
2853
2921
  const block_q5_0 * restrict x1 = &x[i + 1];
@@ -3040,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3040
3108
  }
3041
3109
 
3042
3110
  *s = hsum_float_8(acc);
3111
+ #elif defined(__riscv_v_intrinsic)
3112
+ float sumf = 0.0;
3113
+
3114
+ uint32_t qh;
3115
+
3116
+ // These temp values are for masking and shift operations
3117
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3118
+ uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
3119
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
3120
+
3121
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3122
+
3123
+ for (int i = 0; i < nb; i++) {
3124
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3125
+
3126
+ // temporary registers
3127
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
3128
+ vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
3129
+ vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
3130
+ vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
3131
+
3132
+ // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
3133
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
3134
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
3135
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3136
+
3137
+ // ((qh & (1u << (j + 16))) >> (j + 12));
3138
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
3139
+ vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
3140
+
3141
+ // narrowing
3142
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
3143
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3144
+
3145
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
3146
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3147
+
3148
+ // load
3149
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3150
+
3151
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3152
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3153
+
3154
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3155
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3156
+
3157
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3158
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3159
+
3160
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3161
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3162
+
3163
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
3164
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
3165
+
3166
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3167
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3168
+
3169
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3170
+
3171
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3172
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3173
+
3174
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3175
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3176
+
3177
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
3178
+ }
3179
+
3180
+ *s = sumf;
3043
3181
  #else
3044
3182
  // scalar
3045
3183
  float sumf = 0.0;
@@ -3072,7 +3210,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3072
3210
  const int nb = n / qk;
3073
3211
 
3074
3212
  assert(n % qk == 0);
3075
- assert(nb % 2 == 0);
3076
3213
  assert(qk == QK5_1);
3077
3214
 
3078
3215
  const block_q5_1 * restrict x = vx;
@@ -3091,6 +3228,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3091
3228
  uint64_t tmp0[4];
3092
3229
  uint64_t tmp1[4];
3093
3230
 
3231
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3094
3232
  for (int i = 0; i < nb; i += 2) {
3095
3233
  const block_q5_1 * restrict x0 = &x[i];
3096
3234
  const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3296,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3296
3434
  }
3297
3435
 
3298
3436
  *s = hsum_float_8(acc) + summs;
3437
+ #elif defined(__riscv_v_intrinsic)
3438
+ float sumf = 0.0;
3439
+
3440
+ uint32_t qh;
3441
+
3442
+ // These temp values are for shift operations
3443
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3444
+
3445
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3446
+
3447
+ for (int i = 0; i < nb; i++) {
3448
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3449
+
3450
+ // temporary registers
3451
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
3452
+ vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
3453
+
3454
+ // load qh
3455
+ vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
3456
+
3457
+ // ((qh >> (j + 0)) << 4) & 0x10;
3458
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
3459
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3460
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
3461
+
3462
+ // ((qh >> (j + 12)) ) & 0x10;
3463
+ vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
3464
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
3465
+
3466
+ // narrowing
3467
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
3468
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3469
+
3470
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
3471
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3472
+
3473
+ // load
3474
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3475
+
3476
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3477
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3478
+
3479
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3480
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3481
+
3482
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3483
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3484
+
3485
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3486
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3487
+
3488
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3489
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3490
+
3491
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3492
+
3493
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3494
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3495
+
3496
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3497
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3498
+
3499
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
3500
+ }
3501
+
3502
+ *s = sumf;
3299
3503
  #else
3300
3504
  // scalar
3301
3505
  float sumf = 0.0;
@@ -3328,7 +3532,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3328
3532
  const int nb = n / qk;
3329
3533
 
3330
3534
  assert(n % qk == 0);
3331
- assert(nb % 2 == 0);
3332
3535
 
3333
3536
  const block_q8_0 * restrict x = vx;
3334
3537
  const block_q8_0 * restrict y = vy;
@@ -3337,6 +3540,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3337
3540
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3338
3541
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3339
3542
 
3543
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3340
3544
  for (int i = 0; i < nb; i += 2) {
3341
3545
  const block_q8_0 * restrict x0 = &x[i + 0];
3342
3546
  const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3407,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3407
3611
  }
3408
3612
 
3409
3613
  *s = hsum_float_8(acc);
3614
+ #elif defined(__riscv_v_intrinsic)
3615
+ float sumf = 0.0;
3616
+ size_t vl = __riscv_vsetvl_e8m1(qk);
3617
+
3618
+ for (int i = 0; i < nb; i++) {
3619
+ // load elements
3620
+ vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
3621
+ vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
3622
+
3623
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
3624
+
3625
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
3626
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
3627
+
3628
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
3629
+
3630
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3631
+ }
3632
+
3633
+ *s = sumf;
3410
3634
  #else
3411
3635
  // scalar
3412
3636
  float sumf = 0.0;
@@ -4107,16 +4331,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
4107
4331
  }
4108
4332
 
4109
4333
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
4110
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4111
-
4112
- // this should handle cases where the tensor is not contiguous in memory
4113
- // probaby just:
4114
- //
4115
- // return tensor->ne[3]*tensor->nb[3]
4116
- //
4117
- // is enough, but just in case, adding the second part
4118
-
4119
- return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
4334
+ size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
4335
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
4336
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
4337
+ }
4338
+ return nbytes;
4120
4339
  }
4121
4340
 
4122
4341
  size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
@@ -4570,36 +4789,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4570
4789
  enum ggml_type type,
4571
4790
  int n_dims,
4572
4791
  const int64_t * ne,
4573
- void * data) {
4792
+ struct ggml_tensor * view_src,
4793
+ size_t view_offs) {
4574
4794
 
4575
4795
  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4576
4796
 
4577
- size_t data_size = 0;
4797
+ // find the base tensor and absolute offset
4798
+ if (view_src != NULL && view_src->view_src != NULL) {
4799
+ view_offs += view_src->view_offs;
4800
+ view_src = view_src->view_src;
4801
+ }
4578
4802
 
4579
- if (data == NULL && !ctx->no_alloc) {
4580
- data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4581
- for (int i = 1; i < n_dims; i++) {
4582
- data_size *= ne[i];
4583
- }
4803
+ size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4804
+ for (int i = 1; i < n_dims; i++) {
4805
+ data_size *= ne[i];
4584
4806
  }
4585
4807
 
4586
- if (ctx->scratch.data != NULL && data == NULL) {
4587
- // allocate tensor data in the scratch buffer
4588
- if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4589
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4590
- __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4591
- assert(false);
4592
- return NULL;
4593
- }
4808
+ GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
4594
4809
 
4595
- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4810
+ void * data = view_src != NULL ? view_src->data : NULL;
4811
+ if (data != NULL) {
4812
+ data = (char *) data + view_offs;
4813
+ }
4596
4814
 
4597
- ctx->scratch.offs += data_size;
4815
+ size_t obj_alloc_size = 0;
4598
4816
 
4599
- data_size = 0;
4817
+ if (view_src == NULL && ctx->no_alloc == false) {
4818
+ if (ctx->scratch.data != NULL) {
4819
+ // allocate tensor data in the scratch buffer
4820
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4821
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4822
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4823
+ assert(false);
4824
+ return NULL;
4825
+ }
4826
+
4827
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4828
+
4829
+ ctx->scratch.offs += data_size;
4830
+ } else {
4831
+ // allocate tensor data in the context's memory pool
4832
+ obj_alloc_size = data_size;
4833
+ }
4600
4834
  }
4601
4835
 
4602
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4836
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
4603
4837
 
4604
4838
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4605
4839
 
@@ -4619,7 +4853,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4619
4853
  /*.perf_runs =*/ 0,
4620
4854
  /*.perf_cycles =*/ 0,
4621
4855
  /*.perf_time_us =*/ 0,
4622
- /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4856
+ /*.view_src =*/ view_src,
4857
+ /*.view_offs =*/ view_offs,
4858
+ /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
4623
4859
  /*.name =*/ { 0 },
4624
4860
  /*.extra =*/ NULL,
4625
4861
  /*.padding =*/ { 0 },
@@ -4643,28 +4879,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4643
4879
  return result;
4644
4880
  }
4645
4881
 
4646
- static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4647
- GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4648
- assert(params_size <= GGML_MAX_OP_PARAMS);
4649
- memcpy(tensor->op_params, params, params_size);
4650
- }
4651
-
4652
- static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4653
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4654
- return ((const int32_t *)(tensor->op_params))[i];
4655
- }
4656
-
4657
- static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4658
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4659
- ((int32_t *)(tensor->op_params))[i] = value;
4660
- }
4661
-
4662
4882
  struct ggml_tensor * ggml_new_tensor(
4663
4883
  struct ggml_context * ctx,
4664
4884
  enum ggml_type type,
4665
4885
  int n_dims,
4666
4886
  const int64_t * ne) {
4667
- return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4887
+ return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
4668
4888
  }
4669
4889
 
4670
4890
  struct ggml_tensor * ggml_new_tensor_1d(
@@ -4729,7 +4949,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
4729
4949
  }
4730
4950
 
4731
4951
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
4732
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
4952
+ return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
4953
+ }
4954
+
4955
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4956
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4957
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4958
+ memcpy(tensor->op_params, params, params_size);
4959
+ }
4960
+
4961
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4962
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4963
+ return ((const int32_t *)(tensor->op_params))[i];
4964
+ }
4965
+
4966
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4967
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4968
+ ((int32_t *)(tensor->op_params))[i] = value;
4733
4969
  }
4734
4970
 
4735
4971
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5015,14 +5251,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
5015
5251
 
5016
5252
  struct ggml_tensor * ggml_view_tensor(
5017
5253
  struct ggml_context * ctx,
5018
- const struct ggml_tensor * src) {
5019
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
5254
+ struct ggml_tensor * src) {
5255
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
5020
5256
  ggml_format_name(result, "%s (view)", src->name);
5021
5257
 
5022
- result->nb[0] = src->nb[0];
5023
- result->nb[1] = src->nb[1];
5024
- result->nb[2] = src->nb[2];
5025
- result->nb[3] = src->nb[3];
5258
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
5259
+ result->nb[i] = src->nb[i];
5260
+ }
5026
5261
 
5027
5262
  return result;
5028
5263
  }
@@ -5595,7 +5830,7 @@ struct ggml_tensor * ggml_repeat_back(
5595
5830
 
5596
5831
  // ggml_concat
5597
5832
 
5598
- struct ggml_tensor* ggml_concat(
5833
+ struct ggml_tensor * ggml_concat(
5599
5834
  struct ggml_context* ctx,
5600
5835
  struct ggml_tensor* a,
5601
5836
  struct ggml_tensor* b) {
@@ -5862,7 +6097,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
5862
6097
  struct ggml_tensor * ggml_rms_norm_back(
5863
6098
  struct ggml_context * ctx,
5864
6099
  struct ggml_tensor * a,
5865
- struct ggml_tensor * b) {
6100
+ struct ggml_tensor * b,
6101
+ float eps) {
5866
6102
  bool is_node = false;
5867
6103
 
5868
6104
  if (a->grad) {
@@ -5872,6 +6108,8 @@ struct ggml_tensor * ggml_rms_norm_back(
5872
6108
 
5873
6109
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5874
6110
 
6111
+ ggml_set_op_params(result, &eps, sizeof(eps));
6112
+
5875
6113
  result->op = GGML_OP_RMS_NORM_BACK;
5876
6114
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5877
6115
  result->src[0] = a;
@@ -6201,7 +6439,7 @@ struct ggml_tensor * ggml_reshape(
6201
6439
  //GGML_ASSERT(false);
6202
6440
  }
6203
6441
 
6204
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6442
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
6205
6443
  ggml_format_name(result, "%s (reshaped)", a->name);
6206
6444
 
6207
6445
  result->op = GGML_OP_RESHAPE;
@@ -6225,7 +6463,7 @@ struct ggml_tensor * ggml_reshape_1d(
6225
6463
  }
6226
6464
 
6227
6465
  const int64_t ne[1] = { ne0 };
6228
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6466
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
6229
6467
  ggml_format_name(result, "%s (reshaped)", a->name);
6230
6468
 
6231
6469
  result->op = GGML_OP_RESHAPE;
@@ -6250,7 +6488,7 @@ struct ggml_tensor * ggml_reshape_2d(
6250
6488
  }
6251
6489
 
6252
6490
  const int64_t ne[2] = { ne0, ne1 };
6253
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6491
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
6254
6492
  ggml_format_name(result, "%s (reshaped)", a->name);
6255
6493
 
6256
6494
  result->op = GGML_OP_RESHAPE;
@@ -6276,7 +6514,7 @@ struct ggml_tensor * ggml_reshape_3d(
6276
6514
  }
6277
6515
 
6278
6516
  const int64_t ne[3] = { ne0, ne1, ne2 };
6279
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6517
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
6280
6518
  ggml_format_name(result, "%s (reshaped)", a->name);
6281
6519
 
6282
6520
  result->op = GGML_OP_RESHAPE;
@@ -6286,7 +6524,6 @@ struct ggml_tensor * ggml_reshape_3d(
6286
6524
  return result;
6287
6525
  }
6288
6526
 
6289
-
6290
6527
  struct ggml_tensor * ggml_reshape_4d(
6291
6528
  struct ggml_context * ctx,
6292
6529
  struct ggml_tensor * a,
@@ -6304,7 +6541,7 @@ struct ggml_tensor * ggml_reshape_4d(
6304
6541
  }
6305
6542
 
6306
6543
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6307
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6544
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
6308
6545
  ggml_format_name(result, "%s (reshaped)", a->name);
6309
6546
 
6310
6547
  result->op = GGML_OP_RESHAPE;
@@ -6314,46 +6551,40 @@ struct ggml_tensor * ggml_reshape_4d(
6314
6551
  return result;
6315
6552
  }
6316
6553
 
6317
- // ggml_view_1d
6318
-
6319
- static struct ggml_tensor * ggml_view_tensor_offset(
6554
+ static struct ggml_tensor * ggml_view_impl(
6320
6555
  struct ggml_context * ctx,
6321
6556
  struct ggml_tensor * a,
6322
6557
  int n_dims,
6323
6558
  const int64_t * ne,
6324
6559
  size_t offset) {
6325
- // don't calculate an offset from an unallocated tensor
6326
- void * data = NULL;
6327
- if (a->data != NULL) {
6328
- data = (char *) a->data + offset;
6329
- }
6330
6560
 
6331
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6561
+ bool is_node = false;
6562
+
6563
+ if (a->grad) {
6564
+ is_node = true;
6565
+ }
6332
6566
 
6567
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
6333
6568
  ggml_format_name(result, "%s (view)", a->name);
6334
6569
 
6335
6570
  ggml_set_op_params(result, &offset, sizeof(offset));
6336
6571
 
6572
+ result->op = GGML_OP_VIEW;
6573
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6574
+ result->src[0] = a;
6575
+
6337
6576
  return result;
6338
6577
  }
6339
6578
 
6579
+ // ggml_view_1d
6580
+
6340
6581
  struct ggml_tensor * ggml_view_1d(
6341
6582
  struct ggml_context * ctx,
6342
6583
  struct ggml_tensor * a,
6343
6584
  int64_t ne0,
6344
6585
  size_t offset) {
6345
6586
 
6346
- bool is_node = false;
6347
-
6348
- if (a->grad) {
6349
- is_node = true;
6350
- }
6351
-
6352
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
6353
-
6354
- result->op = GGML_OP_VIEW;
6355
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6356
- result->src[0] = a;
6587
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
6357
6588
 
6358
6589
  return result;
6359
6590
  }
@@ -6368,24 +6599,14 @@ struct ggml_tensor * ggml_view_2d(
6368
6599
  size_t nb1,
6369
6600
  size_t offset) {
6370
6601
 
6371
- bool is_node = false;
6372
-
6373
- if (a->grad) {
6374
- is_node = true;
6375
- }
6376
-
6377
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6602
+ const int64_t ne[2] = { ne0, ne1 };
6378
6603
 
6379
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
6604
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
6380
6605
 
6381
6606
  result->nb[1] = nb1;
6382
6607
  result->nb[2] = result->nb[1]*ne1;
6383
6608
  result->nb[3] = result->nb[2];
6384
6609
 
6385
- result->op = GGML_OP_VIEW;
6386
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6387
- result->src[0] = a;
6388
-
6389
6610
  return result;
6390
6611
  }
6391
6612
 
@@ -6401,24 +6622,14 @@ struct ggml_tensor * ggml_view_3d(
6401
6622
  size_t nb2,
6402
6623
  size_t offset) {
6403
6624
 
6404
- bool is_node = false;
6405
-
6406
- if (a->grad) {
6407
- is_node = true;
6408
- }
6409
-
6410
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6625
+ const int64_t ne[3] = { ne0, ne1, ne2 };
6411
6626
 
6412
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
6627
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
6413
6628
 
6414
6629
  result->nb[1] = nb1;
6415
6630
  result->nb[2] = nb2;
6416
6631
  result->nb[3] = result->nb[2]*ne2;
6417
6632
 
6418
- result->op = GGML_OP_VIEW;
6419
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6420
- result->src[0] = a;
6421
-
6422
6633
  return result;
6423
6634
  }
6424
6635
 
@@ -6436,24 +6647,14 @@ struct ggml_tensor * ggml_view_4d(
6436
6647
  size_t nb3,
6437
6648
  size_t offset) {
6438
6649
 
6439
- bool is_node = false;
6440
-
6441
- if (a->grad) {
6442
- is_node = true;
6443
- }
6444
-
6445
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6650
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6446
6651
 
6447
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
6652
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
6448
6653
 
6449
6654
  result->nb[1] = nb1;
6450
6655
  result->nb[2] = nb2;
6451
6656
  result->nb[3] = nb3;
6452
6657
 
6453
- result->op = GGML_OP_VIEW;
6454
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6455
- result->src[0] = a;
6456
-
6457
6658
  return result;
6458
6659
  }
6459
6660
 
@@ -6640,7 +6841,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6640
6841
 
6641
6842
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6642
6843
 
6643
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6844
+ int32_t params[] = { n_past };
6644
6845
  ggml_set_op_params(result, params, sizeof(params));
6645
6846
 
6646
6847
  result->op = GGML_OP_DIAG_MASK_INF;
@@ -6657,7 +6858,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
6657
6858
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
6658
6859
  }
6659
6860
 
6660
-
6661
6861
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
6662
6862
  struct ggml_context * ctx,
6663
6863
  struct ggml_tensor * a,
@@ -6680,7 +6880,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6680
6880
 
6681
6881
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6682
6882
 
6683
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6883
+ int32_t params[] = { n_past };
6684
6884
  ggml_set_op_params(result, params, sizeof(params));
6685
6885
 
6686
6886
  result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -7097,11 +7297,13 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
7097
7297
  };
7098
7298
 
7099
7299
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7300
+
7301
+ ggml_set_op_params_i32(result, 0, stride);
7302
+
7100
7303
  result->op = GGML_OP_CONV_TRANSPOSE_2D;
7101
7304
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7102
7305
  result->src[0] = a;
7103
7306
  result->src[1] = b;
7104
- result->src[2] = ggml_new_i32(ctx, stride);
7105
7307
 
7106
7308
  return result;
7107
7309
  }
@@ -9446,6 +9648,8 @@ static void ggml_compute_forward_div_f32(
9446
9648
 
9447
9649
 
9448
9650
  #ifdef GGML_USE_ACCELERATE
9651
+ UNUSED(ggml_vec_div_f32);
9652
+
9449
9653
  vDSP_vdiv(
9450
9654
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
9451
9655
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -10752,7 +10956,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
10752
10956
 
10753
10957
  GGML_TENSOR_BINARY_OP_LOCALS;
10754
10958
 
10755
- const float eps = 1e-6f; // TODO: make this a parameter
10959
+ float eps;
10960
+ memcpy(&eps, dst->op_params, sizeof(float));
10756
10961
 
10757
10962
  // TODO: optimize
10758
10963
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -11930,8 +12135,8 @@ static void ggml_compute_forward_diag_mask_f32(
11930
12135
  const int ith = params->ith;
11931
12136
  const int nth = params->nth;
11932
12137
 
11933
- const int n_past = ((int32_t *) dst->op_params)[0];
11934
- const bool inplace = (bool)((int32_t *) dst->op_params)[1];
12138
+ const int n_past = ((int32_t *) dst->op_params)[0];
12139
+ const bool inplace = src0->data == dst->data;
11935
12140
 
11936
12141
  GGML_ASSERT(n_past >= 0);
11937
12142
 
@@ -12142,6 +12347,7 @@ static void ggml_compute_forward_soft_max_back_f32(
12142
12347
  // dx = J * dy
12143
12348
  // dxk = sum_i(Jki * dyi)
12144
12349
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
12350
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
12145
12351
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
12146
12352
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
12147
12353
  // dxk = -yk * dot(y, dy) + yk*dyk
@@ -13497,7 +13703,6 @@ static void ggml_compute_forward_conv_transpose_2d(
13497
13703
  const struct ggml_compute_params * params,
13498
13704
  const struct ggml_tensor * src0,
13499
13705
  const struct ggml_tensor * src1,
13500
- const struct ggml_tensor * opt0,
13501
13706
  struct ggml_tensor * dst) {
13502
13707
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13503
13708
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13557,7 +13762,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13557
13762
  return;
13558
13763
  }
13559
13764
 
13560
- const int32_t stride = ((const int32_t*)(opt0->data))[0];
13765
+ const int32_t stride = ggml_get_op_params_i32(dst, 0);
13561
13766
 
13562
13767
  // total patches in dst
13563
13768
  const int np = ne2;
@@ -13570,7 +13775,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13570
13775
  const int ip1 = MIN(ip0 + dp, np);
13571
13776
 
13572
13777
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13573
- ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
13778
+ ggml_fp16_t * const wdata_src = wdata + nk;
13574
13779
 
13575
13780
  for (int i2 = ip0; i2 < ip1; i2++) { // Cout
13576
13781
  float * dst_data = (float *)((char *) dst->data + i2*nb2);
@@ -13582,9 +13787,8 @@ static void ggml_compute_forward_conv_transpose_2d(
13582
13787
  for (int i00 = 0; i00 < ne00; i00++) {
13583
13788
  float v = 0;
13584
13789
  ggml_vec_dot_f16(ne03, &v,
13585
- (ggml_fp16_t *) wdata_src + i1n,
13586
- (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
13587
-
13790
+ wdata_src + i1n,
13791
+ wdata_kernel + i01*ne00*ne03 + i00*ne03);
13588
13792
  dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
13589
13793
  }
13590
13794
  }
@@ -13934,7 +14138,7 @@ static void ggml_compute_forward_flash_attn_f32(
13934
14138
  vvexpf(S, S, &Mup);
13935
14139
  ggml_vec_sum_f32(Mup, &sum, S);
13936
14140
  #else
13937
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14141
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
13938
14142
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13939
14143
 
13940
14144
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13944,9 +14148,13 @@ static void ggml_compute_forward_flash_attn_f32(
13944
14148
  if (SS[j] == -INFINITY) {
13945
14149
  SS[j] = 0.0f;
13946
14150
  } else {
14151
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14152
+ const float val = expf(SS[j] - max);
14153
+ #else
13947
14154
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
13948
14155
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13949
14156
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14157
+ #endif
13950
14158
  sump[j] += (ggml_float)val;
13951
14159
  SS[j] = val;
13952
14160
  }
@@ -14524,7 +14732,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
14524
14732
  vvexpf(SM, SM, &Mup);
14525
14733
  ggml_vec_sum_f32(Mup, &sum, SM);
14526
14734
  #else
14527
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14735
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
14528
14736
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
14529
14737
 
14530
14738
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -14535,9 +14743,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
14535
14743
  if (SR[j] == -INFINITY) {
14536
14744
  SW[j] = 0.0f;
14537
14745
  } else {
14746
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14747
+ const float val = expf(SR[j] - max);
14748
+ #else
14538
14749
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
14539
14750
  memcpy(&scvt[j], &s, sizeof(uint16_t));
14540
14751
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14752
+ #endif
14541
14753
  sump[j] += (ggml_float)val;
14542
14754
  SW[j] = val;
14543
14755
  }
@@ -15275,6 +15487,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15275
15487
  const int nc = src0->ne[0];
15276
15488
  const int nr = ggml_nrows(src0);
15277
15489
 
15490
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15491
+
15278
15492
  if (params->type == GGML_TASK_INIT) {
15279
15493
  if (ith == 0) {
15280
15494
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -15286,7 +15500,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15286
15500
  if (ith == 0) {
15287
15501
  float * dp = (float *) dst->data;
15288
15502
  ggml_vec_sum_f32(nth, dp, sums);
15289
- dp[0] *= -1.0f;
15503
+ dp[0] *= -1.0f / (float) nr;
15290
15504
  }
15291
15505
  return;
15292
15506
  }
@@ -15303,7 +15517,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15303
15517
  for (int i1 = ir0; i1 < ir1; i1++) {
15304
15518
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15305
15519
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15306
- float * st = (float *) params->wdata + nth + ith*nc;
15520
+ float * st = ((float *) params->wdata) + nth + ith*nc;
15307
15521
 
15308
15522
  #ifndef NDEBUG
15309
15523
  for (int i = 0; i < nc; ++i) {
@@ -15318,15 +15532,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15318
15532
  float max = -INFINITY;
15319
15533
  ggml_vec_max_f32(nc, &max, s0);
15320
15534
 
15321
- uint16_t scvt;
15535
+ uint16_t scvt; UNUSED(scvt);
15322
15536
  for (int i = 0; i < nc; i++) {
15323
15537
  if (s0[i] == -INFINITY) {
15324
15538
  st[i] = 0.0f;
15325
15539
  } else {
15326
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15540
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15541
+ const float s = s0[i] - max;
15542
+ const float val = expf(s);
15543
+ #else
15327
15544
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15328
15545
  memcpy(&scvt, &s, sizeof(scvt));
15329
15546
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15547
+ #endif
15330
15548
  sum += (ggml_float)val;
15331
15549
  st[i] = val;
15332
15550
  }
@@ -15342,7 +15560,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15342
15560
  ggml_vec_log_f32(nc, st, st);
15343
15561
  ggml_vec_mul_f32(nc, st, st, s1);
15344
15562
 
15345
- ggml_vec_sum_f32(nc, sums + ith, st);
15563
+ float st_sum = 0;
15564
+ ggml_vec_sum_f32(nc, &st_sum, st);
15565
+ sums[ith] += st_sum;
15346
15566
 
15347
15567
  #ifndef NDEBUG
15348
15568
  for (int i = 0; i < nc; ++i) {
@@ -15392,7 +15612,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15392
15612
  return;
15393
15613
  }
15394
15614
 
15395
- const float eps = 1e-9f;
15615
+ const double eps = 1e-9;
15396
15616
 
15397
15617
  // TODO: handle transposed/permuted matrices
15398
15618
  const int64_t nc = src0->ne[0];
@@ -15411,7 +15631,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15411
15631
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
15412
15632
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15413
15633
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15414
- float * sm = (float *) params->wdata + ith*nc;
15415
15634
 
15416
15635
  #ifndef NDEBUG
15417
15636
  for (int i = 0; i < nc; ++i) {
@@ -15420,54 +15639,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15420
15639
  assert(!isnan(s1[i]));
15421
15640
  }
15422
15641
  #endif
15423
- // step by step explanation:
15424
- {
15425
- //float * sums = (float *) params->wdata;
15426
-
15427
- // forward pass with annotated gradients from backward pass
15428
- // (built by going in reverse operation order, adding to gradients of current operation args)
15429
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
15430
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15431
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
15432
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
15433
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
15434
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
15435
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
15436
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
15437
-
15438
- // substitute into grad[st1], because we can reuse softmax_back from this point on
15439
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
15440
- // postorder:
15441
- // grad[st1] := softmax(s0)
15442
- // grad[st1] := grad[st1]*(1.0 - eps)
15443
- // grad[st1] := grad[st1] + eps
15444
- // grad[st1] := s1 / grad[st1]
15445
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
15446
-
15447
- // src0 gradients by going through softmax_back
15448
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15449
- // from softmax_back:
15450
- // dxk = yk * (dyk - dot(y, dy))
15451
- // dot_y_dy := dot(y, dy)
15452
- // dx := dy
15453
- // dx := dx - dot_y_dy
15454
- // dx := dx * y
15455
- // postorder:
15456
- // dot_st1_dst1 := dot(st1, grad[st1])
15457
- // grad[s0] := grad[st1]
15458
- // grad[s0] := grad[s0] - dot_st1_dst1
15459
- // grad[s0] := grad[s0] * st1
15460
-
15461
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
15462
- // sm := softmax(s0)
15463
- // grad[s0] := sm*(1.0 - eps)
15464
- // grad[s0] := grad[s0] + eps
15465
- // grad[s0] := s1 / grad[s0]
15466
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
15467
- // dot_st1_dst1 := dot(sm, grad[s0])
15468
- // grad[s0] := grad[s0] - dot_st1_dst1
15469
- // grad[s0] := grad[s0] * sm
15470
- }
15471
15642
 
15472
15643
  // soft_max
15473
15644
  ggml_float sum = 0.0;
@@ -15475,39 +15646,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15475
15646
  float max = -INFINITY;
15476
15647
  ggml_vec_max_f32(nc, &max, s0);
15477
15648
 
15478
- uint16_t scvt;
15649
+ uint16_t scvt; UNUSED(scvt);
15479
15650
  for (int i = 0; i < nc; i++) {
15480
15651
  if (s0[i] == -INFINITY) {
15481
- sm[i] = 0.0f;
15652
+ ds0[i] = 0.0f;
15482
15653
  } else {
15483
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15654
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15655
+ const float s = s0[i] - max;
15656
+ const float val = expf(s);
15657
+ #else
15484
15658
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15485
15659
  memcpy(&scvt, &s, sizeof(scvt));
15486
15660
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15661
+ #endif
15487
15662
  sum += (ggml_float)val;
15488
- sm[i] = val;
15663
+ ds0[i] = val;
15489
15664
  }
15490
15665
  }
15491
15666
 
15492
15667
  assert(sum > 0.0);
15493
- sum = 1.0/sum;
15668
+ sum = (1.0 - eps)/sum;
15494
15669
  }
15495
15670
 
15496
- float dot_st1_dst1 = 0;
15497
- ggml_vec_scale_f32(nc, sm, sum);
15498
- ggml_vec_cpy_f32 (nc, ds0, sm);
15499
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
15500
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
15501
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
15502
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
15503
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
15504
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
15505
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
15671
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
15672
+ ggml_vec_scale_f32(nc, ds0, sum);
15673
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
15674
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
15675
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
15676
+
15506
15677
 
15507
15678
  #ifndef NDEBUG
15508
15679
  for (int i = 0; i < nc; ++i) {
15509
- assert(!isnan(sm[i]));
15510
- assert(!isinf(sm[i]));
15511
15680
  assert(!isnan(ds0[i]));
15512
15681
  assert(!isinf(ds0[i]));
15513
15682
  }
@@ -15731,7 +15900,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15731
15900
  } break;
15732
15901
  case GGML_OP_CONV_TRANSPOSE_2D:
15733
15902
  {
15734
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15903
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
15735
15904
  } break;
15736
15905
  case GGML_OP_POOL_1D:
15737
15906
  {
@@ -16062,9 +16231,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16062
16231
  {
16063
16232
  // necessary for llama
16064
16233
  if (src0->grad) {
16234
+ float eps;
16235
+ memcpy(&eps, tensor->op_params, sizeof(float));
16236
+
16065
16237
  src0->grad = ggml_add_impl(ctx,
16066
16238
  src0->grad,
16067
- ggml_rms_norm_back(ctx, src0, tensor->grad),
16239
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
16068
16240
  inplace);
16069
16241
  }
16070
16242
  } break;
@@ -16832,9 +17004,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16832
17004
  return result;
16833
17005
  }
16834
17006
 
16835
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
16836
- struct ggml_cgraph result = *gf;
16837
-
17007
+ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
16838
17008
  GGML_ASSERT(gf->n_nodes > 0);
16839
17009
 
16840
17010
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16858,15 +17028,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16858
17028
  }
16859
17029
  }
16860
17030
 
16861
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
17031
+ for (int i = 0; i < gf->n_nodes; i++) {
16862
17032
  struct ggml_tensor * node = gf->nodes[i];
16863
17033
 
16864
17034
  if (node->is_param) {
16865
17035
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16866
- ggml_build_forward_expand(&result, node->grad);
17036
+ ggml_build_forward_expand(gb, node->grad);
16867
17037
  }
16868
17038
  }
17039
+ }
16869
17040
 
17041
+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
17042
+ struct ggml_cgraph result = *gf;
17043
+ ggml_build_backward_expand(ctx, gf, &result, keep);
16870
17044
  return result;
16871
17045
  }
16872
17046
 
@@ -17542,10 +17716,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
17542
17716
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
17543
17717
  {
17544
17718
  n_tasks = n_threads;
17545
-
17546
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
17547
-
17548
- work_size = MAX(work_size, cur);
17549
17719
  } break;
17550
17720
  case GGML_OP_NONE:
17551
17721
  {
@@ -18423,14 +18593,16 @@ static enum ggml_opt_result ggml_opt_adam(
18423
18593
  struct ggml_opt_params params,
18424
18594
  struct ggml_tensor * f,
18425
18595
  struct ggml_cgraph * gf,
18426
- struct ggml_cgraph * gb) {
18596
+ struct ggml_cgraph * gb,
18597
+ ggml_opt_callback callback,
18598
+ void * callback_data) {
18427
18599
  GGML_ASSERT(ggml_is_scalar(f));
18428
18600
 
18429
18601
  // these will store the parameters we want to optimize
18430
18602
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
18431
18603
 
18432
18604
  int np = 0;
18433
- int nx = 0;
18605
+ int64_t nx = 0;
18434
18606
  for (int i = 0; i < gf->n_nodes; ++i) {
18435
18607
  if (gf->nodes[i]->is_param) {
18436
18608
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -18449,31 +18621,32 @@ static enum ggml_opt_result ggml_opt_adam(
18449
18621
  }
18450
18622
 
18451
18623
  // constants
18452
- const float sched = params.adam.sched;
18453
- const float decay = params.adam.decay * sched;
18454
- const float alpha = params.adam.alpha * sched;
18624
+ float sched = params.adam.sched;
18625
+ const float alpha = params.adam.alpha;
18626
+ const float decay = params.adam.decay * alpha;
18455
18627
  const float beta1 = params.adam.beta1;
18456
18628
  const float beta2 = params.adam.beta2;
18457
18629
  const float eps = params.adam.eps;
18630
+ const float gclip = params.adam.gclip;
18631
+ const int decay_min_ndim = params.adam.decay_min_ndim;
18458
18632
 
18459
- float * x = opt->adam.x->data; // view of the parameters
18460
- float * g1 = opt->adam.g1->data; // gradient
18461
- float * g2 = opt->adam.g2->data; // gradient squared
18462
18633
  float * m = opt->adam.m->data; // first moment
18463
18634
  float * v = opt->adam.v->data; // second moment
18464
- float * mh = opt->adam.mh->data; // first moment hat
18465
- float * vh = opt->adam.vh->data; // second moment hat
18466
18635
 
18467
18636
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18468
18637
 
18469
- // update view
18470
- ggml_opt_get_params(np, ps, x);
18638
+ if (callback) {
18639
+ callback(callback_data, &sched);
18640
+ }
18471
18641
 
18472
18642
  // compute the function value
18473
18643
  ggml_graph_reset (gf);
18474
18644
  ggml_set_f32 (f->grad, 1.0f);
18475
18645
 
18476
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18646
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18647
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18648
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18649
+ ggml_graph_compute(gb, &cplan);
18477
18650
 
18478
18651
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
18479
18652
  opt->adam.fx_best = opt->adam.fx_prev;
@@ -18481,6 +18654,9 @@ static enum ggml_opt_result ggml_opt_adam(
18481
18654
  pf[opt->iter % params.past] = opt->adam.fx_prev;
18482
18655
  }
18483
18656
 
18657
+ opt->loss_before = opt->adam.fx_prev;
18658
+ opt->loss_after = opt->adam.fx_prev;
18659
+
18484
18660
  // initialize
18485
18661
  if (opt->just_initialized) {
18486
18662
  opt->adam.n_no_improvement = 0;
@@ -18513,50 +18689,55 @@ static enum ggml_opt_result ggml_opt_adam(
18513
18689
  UNUSED(t_start_cpu);
18514
18690
 
18515
18691
  {
18516
- // update the gradient
18517
- ggml_opt_get_grad(np, ps, g1);
18518
-
18519
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
18520
- ggml_vec_scale_f32(nx, m, beta1);
18521
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
18522
-
18523
- // g2 = g1^2
18524
- ggml_vec_sqr_f32 (nx, g2, g1);
18525
-
18526
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
18527
- ggml_vec_scale_f32(nx, v, beta2);
18528
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
18529
-
18530
- // m^hat = m_t / (1 - beta1^t)
18531
- // v^hat = v_t / (1 - beta2^t)
18532
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
18533
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
18534
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
18535
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
18536
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
18537
- ggml_vec_cpy_f32 (nx, mh, m);
18538
- ggml_vec_cpy_f32 (nx, vh, v);
18539
-
18540
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
18541
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
18542
-
18543
- ggml_vec_sqrt_f32 (nx, vh, vh);
18544
- ggml_vec_acc1_f32 (nx, vh, eps);
18545
-
18546
- ggml_vec_div_f32 (nx, mh, mh, vh);
18547
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
18548
- ggml_vec_sub_f32 (nx, x, x, mh);
18692
+ float gnorm = 1.0f;
18693
+ if (gclip > 0.0f) {
18694
+ // gradient clipping
18695
+ ggml_float sum = 0.0;
18696
+ for (int p = 0; p < np; ++p) {
18697
+ const int64_t ne = ggml_nelements(ps[p]);
18698
+ for (int64_t j = 0; j < ne; ++j) {
18699
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
18700
+ sum += (ggml_float)(g*g);
18701
+ }
18702
+ }
18703
+ ggml_float norm = sqrt(sum);
18704
+ if (norm > (ggml_float) gclip) {
18705
+ gnorm = (float) ((ggml_float) gclip / norm);
18706
+ }
18707
+ }
18708
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
18709
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
18710
+ int64_t i = 0;
18711
+ for (int p = 0; p < np; ++p) {
18712
+ const int64_t ne = ggml_nelements(ps[p]);
18713
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
18714
+ for (int64_t j = 0; j < ne; ++j) {
18715
+ float x = ggml_get_f32_1d(ps[p], j);
18716
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
18717
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
18718
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
18719
+ float mh = m[i]*beta1h;
18720
+ float vh = v[i]*beta2h;
18721
+ vh = sqrtf(vh) + eps;
18722
+ x = x*(1.0f - p_decay) - mh/vh;
18723
+ ggml_set_f32_1d(ps[p], j, x);
18724
+ ++i;
18725
+ }
18726
+ }
18727
+ }
18549
18728
 
18550
- // update the parameters
18551
- ggml_opt_set_params(np, ps, x);
18729
+ if (callback) {
18730
+ callback(callback_data, &sched);
18552
18731
  }
18553
18732
 
18554
18733
  ggml_graph_reset (gf);
18555
18734
  ggml_set_f32 (f->grad, 1.0f);
18556
18735
 
18557
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18736
+ ggml_graph_compute(gb, &cplan);
18558
18737
 
18559
18738
  const float fx = ggml_get_f32_1d(f, 0);
18739
+ opt->loss_after = fx;
18740
+
18560
18741
 
18561
18742
  // check convergence
18562
18743
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
@@ -18625,7 +18806,6 @@ struct ggml_lbfgs_iteration_data {
18625
18806
  };
18626
18807
 
18627
18808
  static enum ggml_opt_result linesearch_backtracking(
18628
- struct ggml_context * ctx,
18629
18809
  const struct ggml_opt_params * params,
18630
18810
  int nx,
18631
18811
  float * x,
@@ -18637,8 +18817,11 @@ static enum ggml_opt_result linesearch_backtracking(
18637
18817
  struct ggml_tensor * f,
18638
18818
  struct ggml_cgraph * gf,
18639
18819
  struct ggml_cgraph * gb,
18820
+ struct ggml_cplan * cplan,
18640
18821
  const int np,
18641
- struct ggml_tensor * ps[]) {
18822
+ struct ggml_tensor * ps[],
18823
+ ggml_opt_callback callback,
18824
+ void * callback_data) {
18642
18825
  int count = 0;
18643
18826
 
18644
18827
  float width = 0.0f;
@@ -18667,6 +18850,12 @@ static enum ggml_opt_result linesearch_backtracking(
18667
18850
  dgtest = params->lbfgs.ftol*dginit;
18668
18851
 
18669
18852
  while (true) {
18853
+ if (callback) {
18854
+ // LBFG-S does not support learning rate -> ignore learning schedule
18855
+ float sched = 0;
18856
+ callback(callback_data, &sched);
18857
+ }
18858
+
18670
18859
  ggml_vec_cpy_f32(nx, x, xp);
18671
18860
  ggml_vec_mad_f32(nx, x, d, *step);
18672
18861
 
@@ -18677,7 +18866,7 @@ static enum ggml_opt_result linesearch_backtracking(
18677
18866
  ggml_graph_reset (gf);
18678
18867
  ggml_set_f32 (f->grad, 1.0f);
18679
18868
 
18680
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
18869
+ ggml_graph_compute(gb, cplan);
18681
18870
 
18682
18871
  ggml_opt_get_grad(np, ps, g);
18683
18872
 
@@ -18737,7 +18926,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18737
18926
  struct ggml_opt_params params,
18738
18927
  struct ggml_tensor * f,
18739
18928
  struct ggml_cgraph * gf,
18740
- struct ggml_cgraph * gb) {
18929
+ struct ggml_cgraph * gb,
18930
+ ggml_opt_callback callback,
18931
+ void * callback_data) {
18741
18932
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18742
18933
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18743
18934
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -18769,6 +18960,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18769
18960
  opt->iter = iter;
18770
18961
  }
18771
18962
 
18963
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18964
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18965
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18966
+
18772
18967
  float * x = opt->lbfgs.x->data; // current parameters
18773
18968
  float * xp = opt->lbfgs.xp->data; // previous parameters
18774
18969
  float * g = opt->lbfgs.g->data; // current gradient
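The hunk above shows the new execution pattern used by both optimizers: plan the graph once with ggml_graph_plan, point cplan.work_data at a scratch buffer (here carved out of the ggml context as a GGML_OBJECT_WORK_BUFFER object), and then call ggml_graph_compute for every evaluation instead of ggml_graph_compute_with_ctx. From caller code, where the context internals are not visible, the same pattern looks roughly like this self-contained sketch (tensor setup and buffer sizes are illustrative, assuming ggml.h from this release):

    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(a, 2.0f);
        ggml_set_f32(b, 3.0f);
        struct ggml_tensor * c = ggml_mul(ctx, a, b);

        struct ggml_cgraph gf = ggml_build_forward(c);

        // plan once, allocate the scratch ("work") buffer the plan asks for, then compute
        struct ggml_cplan cplan = ggml_graph_plan(&gf, /*n_threads=*/ 1);
        uint8_t * work = cplan.work_size > 0 ? malloc(cplan.work_size) : NULL;
        cplan.work_data = work;

        ggml_graph_compute(&gf, &cplan);

        printf("c[0] = %f\n", ggml_get_f32_1d(c, 0));

        free(work);
        ggml_free(ctx);
        return 0;
    }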
@@ -18790,6 +18985,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18790
18985
  float * lm_s = opt->lbfgs.lms->data;
18791
18986
  float * lm_y = opt->lbfgs.lmy->data;
18792
18987
 
18988
+ if (callback) {
18989
+ // L-BFGS does not support learning rate -> ignore learning schedule
18990
+ float sched = 0;
18991
+ callback(callback_data, &sched);
18992
+ }
18993
+
18793
18994
  // evaluate the function value and its gradient
18794
18995
  {
18795
18996
  ggml_opt_set_params(np, ps, x);
@@ -18797,11 +18998,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18797
18998
  ggml_graph_reset (gf);
18798
18999
  ggml_set_f32 (f->grad, 1.0f);
18799
19000
 
18800
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
19001
+ ggml_graph_compute(gb, &cplan);
18801
19002
 
18802
19003
  ggml_opt_get_grad(np, ps, g);
18803
19004
 
18804
19005
  fx = ggml_get_f32_1d(f, 0);
19006
+
19007
+ opt->loss_before = fx;
19008
+ opt->loss_after = fx;
18805
19009
  }
18806
19010
 
18807
19011
  // search direction = -gradient
@@ -18856,7 +19060,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18856
19060
  ggml_vec_cpy_f32(nx, xp, x);
18857
19061
  ggml_vec_cpy_f32(nx, gp, g);
18858
19062
 
18859
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
19063
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
18860
19064
 
18861
19065
  if (ls < 0) {
18862
19066
  // linesearch failed - go back to the previous point and return
@@ -18866,6 +19070,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18866
19070
  return ls;
18867
19071
  }
18868
19072
 
19073
+ opt->loss_after = fx;
19074
+
18869
19075
  ggml_vec_norm_f32(nx, &xnorm, x);
18870
19076
  ggml_vec_norm_f32(nx, &gnorm, g);
18871
19077
 
@@ -18923,7 +19129,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18923
19129
  // ys = y^t \cdot s -> 1 / \rho.
18924
19130
  // yy = y^t \cdot y.
18925
19131
  //
18926
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
19132
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
18927
19133
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
18928
19134
 
18929
19135
  lm_ys[end[0]] = ys;
@@ -18986,13 +19192,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18986
19192
  .adam = {
18987
19193
  .n_iter = 10000,
18988
19194
  .sched = 1.000f,
18989
- .decay = 0.001f,
19195
+ .decay = 0.0f,
19196
+ .decay_min_ndim = 2,
18990
19197
  .alpha = 0.001f,
18991
19198
  .beta1 = 0.9f,
18992
19199
  .beta2 = 0.999f,
18993
19200
  .eps = 1e-8f,
18994
19201
  .eps_f = 1e-5f,
18995
19202
  .eps_g = 1e-3f,
19203
+ .gclip = 0.0f,
18996
19204
  },
18997
19205
  };
18998
19206
  } break;
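The Adam defaults above change in three ways: decay now defaults to 0 (weight decay off), the new decay_min_ndim restricts decay to tensors with at least that many dimensions, and the new gclip enables global gradient-norm clipping when non-zero. A hedged sketch of overriding them from caller code (values are illustrative):

    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt_params.adam.n_iter         = 256;
    opt_params.adam.decay          = 0.1f;   // default is 0.0f, i.e. no weight decay
    opt_params.adam.decay_min_ndim = 2;      // skip decay for 1-D tensors (biases, norms)
    opt_params.adam.gclip          = 1.0f;   // clip the global gradient norm at 1.0 (0.0f = off)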
@@ -19042,23 +19250,13 @@ GGML_API void ggml_opt_init(
19042
19250
  switch (opt->params.type) {
19043
19251
  case GGML_OPT_ADAM:
19044
19252
  {
19045
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19046
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19047
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19048
19253
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19049
19254
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19050
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19051
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19052
19255
  opt->adam.pf = params.past > 0
19053
19256
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
19054
19257
  : NULL;
19055
- ggml_set_zero(opt->adam.x);
19056
- ggml_set_zero(opt->adam.g1);
19057
- ggml_set_zero(opt->adam.g2);
19058
19258
  ggml_set_zero(opt->adam.m);
19059
19259
  ggml_set_zero(opt->adam.v);
19060
- ggml_set_zero(opt->adam.mh);
19061
- ggml_set_zero(opt->adam.vh);
19062
19260
  if (opt->adam.pf) {
19063
19261
  ggml_set_zero(opt->adam.pf);
19064
19262
  }
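With the scalar update, the Adam state kept by ggml_opt_init shrinks to the first and second moments m and v (plus the optional past-loss buffer pf); the x, g1, g2, mh and vh scratch tensors are gone. A hedged sketch of preparing a reusable, caller-owned optimizer context, assuming the ggml_opt_init signature shown in the hunk header above:

    // ctx is an existing ggml_context, nx the total number of trainable parameters (assumed to exist).
    struct ggml_opt_context opt_ctx = { 0 };
    struct ggml_opt_params  opt_params = ggml_opt_default_params(GGML_OPT_ADAM);

    ggml_opt_init(ctx, &opt_ctx, opt_params, nx);   // allocates adam.m, adam.v (and pf when past > 0)

    // every resume continues from the stored m/v state, e.g.:
    // ggml_opt_resume_g(ctx, &opt_ctx, loss, &gf, &gb, /*callback=*/ NULL, /*callback_data=*/ NULL);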
@@ -19142,7 +19340,7 @@ enum ggml_opt_result ggml_opt_resume(
19142
19340
  *gf = ggml_build_forward (f);
19143
19341
  *gb = ggml_build_backward(ctx, gf, true);
19144
19342
 
19145
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
19343
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
19146
19344
  }
19147
19345
 
19148
19346
  enum ggml_opt_result ggml_opt_resume_g(
@@ -19150,7 +19348,9 @@ enum ggml_opt_result ggml_opt_resume_g(
19150
19348
  struct ggml_opt_context * opt,
19151
19349
  struct ggml_tensor * f,
19152
19350
  struct ggml_cgraph * gf,
19153
- struct ggml_cgraph * gb) {
19351
+ struct ggml_cgraph * gb,
19352
+ ggml_opt_callback callback,
19353
+ void * callback_data) {
19154
19354
 
19155
19355
  // build forward + backward compute graphs
19156
19356
  enum ggml_opt_result result = GGML_OPT_OK;
@@ -19158,11 +19358,11 @@ enum ggml_opt_result ggml_opt_resume_g(
19158
19358
  switch (opt->params.type) {
19159
19359
  case GGML_OPT_ADAM:
19160
19360
  {
19161
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
19361
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19162
19362
  } break;
19163
19363
  case GGML_OPT_LBFGS:
19164
19364
  {
19165
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
19365
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19166
19366
  } break;
19167
19367
  }
19168
19368
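ggml_opt_resume_g and both optimizer entry points now take an optional progress callback plus user data: Adam passes a pointer to its schedule multiplier so the callback can adjust it every step, while L-BFGS invokes it with sched fixed at 0 (see the hunks above). Continuing the optimizer sketch above, a hedged example assuming the public ggml_opt_callback typedef matches the call sites, i.e. void (*)(void * data, float * sched):

    // Illustrative callback that applies a simple 1/t decay to the Adam schedule.
    struct my_cb_data { int step; };

    static void my_opt_callback(void * data, float * sched) {
        struct my_cb_data * d = (struct my_cb_data *) data;
        d->step++;
        *sched = 1.0f / (1.0f + 0.01f * (float) d->step);
    }

    // ...
    struct my_cb_data cbd = { 0 };
    ggml_opt_resume_g(ctx, &opt_ctx, loss, &gf, &gb, my_opt_callback, &cbd);   // returns enum ggml_opt_result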
 
@@ -19394,7 +19594,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19394
19594
  ////////////////////////////////////////////////////////////////////////////////
19395
19595
 
19396
19596
  struct gguf_str {
19397
- uint32_t n;
19597
+ uint64_t n; // GGUFv2
19398
19598
  char * data;
19399
19599
  };
19400
19600
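gguf_str now carries a 64-bit length, and (per the strlen changes further down) the stored count no longer includes a trailing NUL; on disk a GGUFv2 string is an 8-byte little-endian length followed by the raw bytes, with no terminator. A hedged helper sketch for the write side (illustrative, not part of the gguf API; assumes a little-endian host):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Serialize one GGUFv2 string: uint64 byte count, then the bytes (no '\0' on disk).
    static int write_gguf_v2_str(FILE * f, const char * s) {
        const uint64_t n = strlen(s);
        return fwrite(&n, sizeof(n), 1, f) == 1 &&
               fwrite(s, 1, (size_t) n, f) == (size_t) n;
    }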
 
@@ -19408,9 +19608,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
19408
19608
  [GGUF_TYPE_FLOAT32] = sizeof(float),
19409
19609
  [GGUF_TYPE_BOOL] = sizeof(bool),
19410
19610
  [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
19611
+ [GGUF_TYPE_UINT64] = sizeof(uint64_t),
19612
+ [GGUF_TYPE_INT64] = sizeof(int64_t),
19613
+ [GGUF_TYPE_FLOAT64] = sizeof(double),
19411
19614
  [GGUF_TYPE_ARRAY] = 0, // undefined
19412
19615
  };
19413
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19616
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19414
19617
 
19415
19618
  static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19416
19619
  [GGUF_TYPE_UINT8] = "u8",
@@ -19423,8 +19626,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19423
19626
  [GGUF_TYPE_BOOL] = "bool",
19424
19627
  [GGUF_TYPE_STRING] = "str",
19425
19628
  [GGUF_TYPE_ARRAY] = "arr",
19629
+ [GGUF_TYPE_UINT64] = "u64",
19630
+ [GGUF_TYPE_INT64] = "i64",
19631
+ [GGUF_TYPE_FLOAT64] = "f64",
19426
19632
  };
19427
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19633
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19428
19634
 
19429
19635
  union gguf_value {
19430
19636
  uint8_t uint8;
@@ -19434,6 +19640,9 @@ union gguf_value {
19434
19640
  uint32_t uint32;
19435
19641
  int32_t int32;
19436
19642
  float float32;
19643
+ uint64_t uint64;
19644
+ int64_t int64;
19645
+ double float64;
19437
19646
  bool bool_;
19438
19647
 
19439
19648
  struct gguf_str str;
@@ -19441,7 +19650,7 @@ union gguf_value {
19441
19650
  struct {
19442
19651
  enum gguf_type type;
19443
19652
 
19444
- uint32_t n;
19653
+ uint64_t n; // GGUFv2
19445
19654
  void * data;
19446
19655
  } arr;
19447
19656
  };
@@ -19449,8 +19658,6 @@ union gguf_value {
19449
19658
  struct gguf_kv {
19450
19659
  struct gguf_str key;
19451
19660
 
19452
- uint32_t n_bytes; // TODO: is this actually needed?
19453
-
19454
19661
  enum gguf_type type;
19455
19662
  union gguf_value value;
19456
19663
  };
@@ -19458,15 +19665,15 @@ struct gguf_kv {
19458
19665
  struct gguf_header {
19459
19666
  uint32_t magic;
19460
19667
  uint32_t version;
19461
- uint32_t n_tensors;
19462
- uint32_t n_kv;
19668
+ uint64_t n_tensors; // GGUFv2
19669
+ uint64_t n_kv; // GGUFv2
19463
19670
  };
19464
19671
 
19465
19672
  struct gguf_tensor_info {
19466
19673
  struct gguf_str name;
19467
19674
 
19468
19675
  uint32_t n_dims;
19469
- uint32_t ne[GGML_MAX_DIMS];
19676
+ uint64_t ne[GGML_MAX_DIMS];
19470
19677
 
19471
19678
  enum ggml_type type;
19472
19679
 
@@ -19497,19 +19704,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
19497
19704
  return n == size;
19498
19705
  }
19499
19706
 
19500
- static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
19707
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19708
+ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
19501
19709
  p->n = 0;
19502
19710
  p->data = NULL;
19503
19711
 
19504
19712
  bool ok = true;
19505
19713
 
19506
- // TODO: how to avoid mallocs for strings?
19507
19714
  ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19508
19715
  ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19509
19716
 
19510
19717
  return ok;
19511
19718
  }
19512
19719
 
19720
+ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
19721
+ p->n = 0;
19722
+ p->data = NULL;
19723
+
19724
+ bool ok = true;
19725
+
19726
+ uint32_t n = 0;
19727
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
19728
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19729
+
19730
+ return ok;
19731
+ }
19732
+
19513
19733
  struct gguf_context * gguf_init_empty(void) {
19514
19734
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19515
19735
 
@@ -19565,8 +19785,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19565
19785
  ctx->data = NULL;
19566
19786
 
19567
19787
  ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
19568
- ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19569
- ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19788
+
19789
+ if (ctx->header.version == 1) {
19790
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19791
+ uint32_t n_tensors = 0;
19792
+ uint32_t n_kv = 0;
19793
+
19794
+ ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
19795
+ ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
19796
+
19797
+ ctx->header.n_tensors = n_tensors;
19798
+ ctx->header.n_kv = n_kv;
19799
+ } else {
19800
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19801
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19802
+ }
19570
19803
 
19571
19804
  if (!ok) {
19572
19805
  fprintf(stderr, "%s: failed to read header\n", __func__);
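The loader keeps accepting the old layout: for version 1 it reads 32-bit tensor/KV counts (and, below, 32-bit string lengths and array sizes) and widens them, otherwise it reads the 64-bit GGUFv2 fields directly. Callers are unaffected; a hedged usage sketch, assuming the gguf_init_params layout and accessors from ggml.h of this release:

    #include <stdio.h>
    #include "ggml.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }

        struct ggml_context * meta = NULL;
        struct gguf_init_params ip = {
            /*.no_alloc =*/ true,    // read metadata only, do not allocate tensor data
            /*.ctx      =*/ &meta,
        };

        struct gguf_context * gctx = gguf_init_from_file(argv[1], ip);
        if (!gctx) {
            fprintf(stderr, "failed to load %s\n", argv[1]);
            return 1;
        }

        // works for both GGUFv1 and GGUFv2 files after this change
        printf("version:   %d\n", gguf_get_version(gctx));
        printf("n_tensors: %d\n", gguf_get_n_tensors(gctx));
        printf("n_kv:      %d\n", gguf_get_n_kv(gctx));

        for (int i = 0; i < gguf_get_n_tensors(gctx); ++i) {
            printf("  %-40s offset=%zu\n", gguf_get_tensor_name(gctx, i), gguf_get_tensor_offset(gctx, i));
        }

        gguf_free(gctx);
        ggml_free(meta);
        return 0;
    }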
@@ -19576,18 +19809,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19576
19809
  }
19577
19810
  }
19578
19811
 
19812
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19813
+ bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
19814
+ if (ctx->header.version == 1) {
19815
+ gguf_fread_str = gguf_fread_str_v1;
19816
+ }
19817
+
19579
19818
  // read the kv pairs
19580
19819
  {
19581
- ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19820
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19582
19821
 
19583
19822
  for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19584
19823
  struct gguf_kv * kv = &ctx->kv[i];
19585
19824
 
19586
19825
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
19587
19826
 
19588
- ok = ok && gguf_fread_str(file, &kv->key, &offset);
19589
- //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
19590
- ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19827
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
19828
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19591
19829
 
19592
19830
  //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
19593
19831
 
@@ -19599,12 +19837,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19599
19837
  case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
19600
19838
  case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
19601
19839
  case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
19840
+ case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
19841
+ case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
19842
+ case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
19602
19843
  case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
19603
19844
  case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
19604
19845
  case GGUF_TYPE_ARRAY:
19605
19846
  {
19606
19847
  ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19607
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19848
+
19849
+ if (ctx->header.version == 1) {
19850
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19851
+ uint32_t n = 0;
19852
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
19853
+ kv->value.arr.n = n;
19854
+ } else {
19855
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19856
+ }
19608
19857
 
19609
19858
  switch (kv->value.arr.type) {
19610
19859
  case GGUF_TYPE_UINT8:
@@ -19614,6 +19863,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19614
19863
  case GGUF_TYPE_UINT32:
19615
19864
  case GGUF_TYPE_INT32:
19616
19865
  case GGUF_TYPE_FLOAT32:
19866
+ case GGUF_TYPE_UINT64:
19867
+ case GGUF_TYPE_INT64:
19868
+ case GGUF_TYPE_FLOAT64:
19617
19869
  case GGUF_TYPE_BOOL:
19618
19870
  {
19619
19871
  kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -19648,7 +19900,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19648
19900
 
19649
19901
  // read the tensor infos
19650
19902
  {
19651
- ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19903
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19652
19904
 
19653
19905
  for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19654
19906
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19660,7 +19912,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19660
19912
  ok = ok && gguf_fread_str(file, &info->name, &offset);
19661
19913
  ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19662
19914
  for (uint32_t j = 0; j < info->n_dims; ++j) {
19663
- ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19915
+ if (ctx->header.version == 1) {
19916
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19917
+ uint32_t t = 0;
19918
+ ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
19919
+ info->ne[j] = t;
19920
+ } else {
19921
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19922
+ }
19664
19923
  }
19665
19924
  ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19666
19925
  ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
@@ -19842,7 +20101,7 @@ void gguf_free(struct gguf_context * ctx) {
19842
20101
  }
19843
20102
  }
19844
20103
 
19845
- GGML_ALIGNED_FREE(ctx->kv);
20104
+ free(ctx->kv);
19846
20105
  }
19847
20106
 
19848
20107
  if (ctx->infos) {
@@ -19854,7 +20113,7 @@ void gguf_free(struct gguf_context * ctx) {
19854
20113
  }
19855
20114
  }
19856
20115
 
19857
- GGML_ALIGNED_FREE(ctx->infos);
20116
+ free(ctx->infos);
19858
20117
  }
19859
20118
 
19860
20119
  GGML_ALIGNED_FREE(ctx);
@@ -19954,6 +20213,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
19954
20213
  return ctx->kv[i].value.float32;
19955
20214
  }
19956
20215
 
20216
+ uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
20217
+ return ctx->kv[i].value.uint64;
20218
+ }
20219
+
20220
+ int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
20221
+ return ctx->kv[i].value.int64;
20222
+ }
20223
+
20224
+ double gguf_get_val_f64(struct gguf_context * ctx, int i) {
20225
+ return ctx->kv[i].value.float64;
20226
+ }
20227
+
19957
20228
  bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
19958
20229
  return ctx->kv[i].value.bool_;
19959
20230
  }
@@ -20000,7 +20271,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
20000
20271
  const int n_kv = gguf_get_n_kv(ctx);
20001
20272
 
20002
20273
  ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
20003
- ctx->kv[n_kv].key.n = strlen(key) + 1;
20274
+ ctx->kv[n_kv].key.n = strlen(key);
20004
20275
  ctx->kv[n_kv].key.data = strdup(key);
20005
20276
  ctx->header.n_kv++;
20006
20277
 
@@ -20056,6 +20327,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
20056
20327
  ctx->kv[idx].value.float32 = val;
20057
20328
  }
20058
20329
 
20330
+ void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
20331
+ const int idx = gguf_get_or_add_key(ctx, key);
20332
+
20333
+ ctx->kv[idx].type = GGUF_TYPE_UINT64;
20334
+ ctx->kv[idx].value.uint64 = val;
20335
+ }
20336
+
20337
+ void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
20338
+ const int idx = gguf_get_or_add_key(ctx, key);
20339
+
20340
+ ctx->kv[idx].type = GGUF_TYPE_INT64;
20341
+ ctx->kv[idx].value.int64 = val;
20342
+ }
20343
+
20344
+ void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
20345
+ const int idx = gguf_get_or_add_key(ctx, key);
20346
+
20347
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
20348
+ ctx->kv[idx].value.float64 = val;
20349
+ }
20350
+
20059
20351
  void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
20060
20352
  const int idx = gguf_get_or_add_key(ctx, key);
20061
20353
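The write side gets the corresponding setters. A hedged sketch of storing 64-bit and double-precision metadata when producing a GGUF file (key names are invented; gguf_init_empty appears earlier in this diff and gguf_write_to_file is the existing writer entry point):

    struct gguf_context * out = gguf_init_empty();

    gguf_set_val_u64(out, "example.n_tokens_trained", 1200000000ULL);
    gguf_set_val_i64(out, "example.created_unix",     1693526400LL);
    gguf_set_val_f64(out, "example.rope_freq_base",   10000.0);

    gguf_write_to_file(out, "out.gguf", /*only_meta=*/ true);
    gguf_free(out);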
 
@@ -20067,7 +20359,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
20067
20359
  const int idx = gguf_get_or_add_key(ctx, key);
20068
20360
 
20069
20361
  ctx->kv[idx].type = GGUF_TYPE_STRING;
20070
- ctx->kv[idx].value.str.n = strlen(val) + 1;
20362
+ ctx->kv[idx].value.str.n = strlen(val);
20071
20363
  ctx->kv[idx].value.str.data = strdup(val);
20072
20364
  }
20073
20365
 
@@ -20090,7 +20382,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
20090
20382
  ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20091
20383
  for (int i = 0; i < n; i++) {
20092
20384
  struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
20093
- str->n = strlen(data[i]) + 1;
20385
+ str->n = strlen(data[i]);
20094
20386
  str->data = strdup(data[i]);
20095
20387
  }
20096
20388
  }
@@ -20106,6 +20398,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
20106
20398
  case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
20107
20399
  case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
20108
20400
  case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
20401
+ case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
20402
+ case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
20403
+ case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
20109
20404
  case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
20110
20405
  case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
20111
20406
  case GGUF_TYPE_ARRAY:
@@ -20134,7 +20429,7 @@ void gguf_add_tensor(
20134
20429
  const int idx = ctx->header.n_tensors;
20135
20430
  ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
20136
20431
 
20137
- ctx->infos[idx].name.n = strlen(tensor->name) + 1;
20432
+ ctx->infos[idx].name.n = strlen(tensor->name);
20138
20433
  ctx->infos[idx].name.data = strdup(tensor->name);
20139
20434
 
20140
20435
  for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -20267,6 +20562,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
20267
20562
  case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
20268
20563
  case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
20269
20564
  case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
20565
+ case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
20566
+ case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
20567
+ case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
20270
20568
  case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
20271
20569
  case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
20272
20570
  case GGUF_TYPE_ARRAY:
@@ -20282,6 +20580,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
20282
20580
  case GGUF_TYPE_UINT32:
20283
20581
  case GGUF_TYPE_INT32:
20284
20582
  case GGUF_TYPE_FLOAT32:
20583
+ case GGUF_TYPE_UINT64:
20584
+ case GGUF_TYPE_INT64:
20585
+ case GGUF_TYPE_FLOAT64:
20285
20586
  case GGUF_TYPE_BOOL:
20286
20587
  {
20287
20588
  gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -20516,6 +20817,14 @@ int ggml_cpu_has_sse3(void) {
20516
20817
  #endif
20517
20818
  }
20518
20819
 
20820
+ int ggml_cpu_has_ssse3(void) {
20821
+ #if defined(__SSSE3__)
20822
+ return 1;
20823
+ #else
20824
+ return 0;
20825
+ #endif
20826
+ }
20827
+
20519
20828
  int ggml_cpu_has_vsx(void) {
20520
20829
  #if defined(__POWER9_VECTOR__)
20521
20830
  return 1;