llama_cpp 0.4.0 → 0.5.0

This diff compares the content of publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
123
123
  #define GGML_GELU_FP16
124
124
  #define GGML_GELU_QUICK_FP16
125
125
  #define GGML_SILU_FP16
126
+ // #define GGML_CROSS_ENTROPY_EXP_FP16
127
+ // #define GGML_FLASH_ATTN_EXP_FP16
126
128
 
127
129
  #define GGML_SOFT_MAX_UNROLL 4
128
130
  #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +159,6 @@ typedef void * thread_ret_t;
157
159
  //#define GGML_SOFT_MAX_ACCELERATE
158
160
  #endif
159
161
 
160
- #if UINTPTR_MAX == 0xFFFFFFFF
161
- #define GGML_MEM_ALIGN 4
162
- #else
163
- #define GGML_MEM_ALIGN 16
164
- #endif
165
-
166
162
  //
167
163
  // logging
168
164
  //
@@ -192,8 +188,8 @@ typedef void * thread_ret_t;
192
188
  //
193
189
 
194
190
  #if defined(_MSC_VER) || defined(__MINGW32__)
195
- #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
196
- #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
191
+ #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
192
+ #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
197
193
  #else
198
194
  inline static void * ggml_aligned_malloc(size_t size) {
199
195
  void * aligned_memory = NULL;
@@ -218,8 +214,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
218
214
  }
219
215
  return aligned_memory;
220
216
  }
221
- #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
222
- #define GGML_ALIGNED_FREE(ptr) free(ptr)
217
+ #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
218
+ #define GGML_ALIGNED_FREE(ptr) free(ptr)
223
219
  #endif
224
220
 
225
221
  #define UNUSED GGML_UNUSED
@@ -305,6 +301,10 @@ typedef double ggml_float;
305
301
  #endif
306
302
  #endif
307
303
 
304
+ #ifdef __riscv_v_intrinsic
305
+ #include <riscv_vector.h>
306
+ #endif
307
+
308
308
  #ifdef __F16C__
309
309
 
310
310
  #ifdef _MSC_VER
@@ -2436,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2436
2436
  const int nb = n / qk;
2437
2437
 
2438
2438
  assert(n % qk == 0);
2439
- assert(nb % 2 == 0);
2440
2439
 
2441
2440
  const block_q4_0 * restrict x = vx;
2442
2441
  const block_q8_0 * restrict y = vy;
@@ -2445,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2445
2444
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
2446
2445
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
2447
2446
 
2447
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2448
2448
  for (int i = 0; i < nb; i += 2) {
2449
2449
  const block_q4_0 * restrict x0 = &x[i + 0];
2450
2450
  const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2623,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2623
2623
  }
2624
2624
 
2625
2625
  // Main loop
2626
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2626
2627
  for (int i = 2; i < nb; i+=2) {
2627
2628
  _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
2628
2629
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2680,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2680
2681
  }
2681
2682
 
2682
2683
  *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
2684
+ #elif defined(__riscv_v_intrinsic)
2685
+ float sumf = 0.0;
2686
+
2687
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2688
+
2689
+ for (int i = 0; i < nb; i++) {
2690
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2691
+
2692
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2693
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2694
+
2695
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2696
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2697
+
2698
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2699
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2700
+
2701
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
2702
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
2703
+
2704
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2705
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2706
+
2707
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2708
+
2709
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2710
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2711
+
2712
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2713
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2714
+
2715
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2716
+ }
2717
+
2718
+ *s = sumf;
2683
2719
  #else
2684
2720
  // scalar
2685
2721
  float sumf = 0.0;
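
Note: the new RISC-V vector branch above processes one Q4_0/Q8_0 block per iteration with vl = qk/2 lanes, and computes the same result as the scalar fallback: each byte of x[i].qs packs two 4-bit quants (low nibble pairs with y element j, high nibble with y element j + qk/2), each offset by 8 before multiplication. A minimal scalar sketch, with simplified float scales in place of ggml_fp16_t and hypothetical type/function names:

    // Scalar sketch of a Q4_0 x Q8_0 block dot product (simplified types, hypothetical names).
    #include <stdint.h>

    #define QK8_0 32

    typedef struct { float d; uint8_t qs[QK8_0/2]; } blk_q4_0;  // d = scale, qs = packed 4-bit quants
    typedef struct { float d; int8_t  qs[QK8_0];   } blk_q8_0;  // d = scale, qs = 8-bit quants

    float dot_q4_0_q8_0(const blk_q4_0 *x, const blk_q8_0 *y, int nb) {
        float sumf = 0.0f;
        for (int i = 0; i < nb; i++) {
            int sumi = 0;
            for (int j = 0; j < QK8_0/2; j++) {
                const int v0 = (x[i].qs[j] & 0x0F) - 8;   // low nibble  -> element j
                const int v1 = (x[i].qs[j] >>   4) - 8;   // high nibble -> element j + qk/2
                sumi += v0*y[i].qs[j] + v1*y[i].qs[j + QK8_0/2];
            }
            sumf += sumi * x[i].d * y[i].d;               // apply the two per-block scales
        }
        return sumf;
    }
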
@@ -2706,7 +2742,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2706
2742
  const int nb = n / qk;
2707
2743
 
2708
2744
  assert(n % qk == 0);
2709
- assert(nb % 2 == 0);
2710
2745
 
2711
2746
  const block_q4_1 * restrict x = vx;
2712
2747
  const block_q8_1 * restrict y = vy;
@@ -2718,6 +2753,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2718
2753
 
2719
2754
  float summs = 0;
2720
2755
 
2756
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2721
2757
  for (int i = 0; i < nb; i += 2) {
2722
2758
  const block_q4_1 * restrict x0 = &x[i + 0];
2723
2759
  const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2806,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2806
2842
  }
2807
2843
 
2808
2844
  *s = hsum_float_8(acc) + summs;
2845
+ #elif defined(__riscv_v_intrinsic)
2846
+ float sumf = 0.0;
2847
+
2848
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
2849
+
2850
+ for (int i = 0; i < nb; i++) {
2851
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
2852
+
2853
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
2854
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
2855
+
2856
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
2857
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
2858
+
2859
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
2860
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
2861
+
2862
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
2863
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
2864
+
2865
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
2866
+
2867
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
2868
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
2869
+
2870
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
2871
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
2872
+
2873
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2874
+ }
2875
+
2876
+ *s = sumf;
2809
2877
  #else
2810
2878
  // scalar
2811
2879
  float sumf = 0.0;
@@ -2832,7 +2900,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2832
2900
  const int nb = n / qk;
2833
2901
 
2834
2902
  assert(n % qk == 0);
2835
- assert(nb % 2 == 0);
2836
2903
  assert(qk == QK5_0);
2837
2904
 
2838
2905
  const block_q5_0 * restrict x = vx;
@@ -2848,6 +2915,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2848
2915
  uint64_t tmp0[4];
2849
2916
  uint64_t tmp1[4];
2850
2917
 
2918
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
2851
2919
  for (int i = 0; i < nb; i += 2) {
2852
2920
  const block_q5_0 * restrict x0 = &x[i];
2853
2921
  const block_q5_0 * restrict x1 = &x[i + 1];
@@ -3040,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3040
3108
  }
3041
3109
 
3042
3110
  *s = hsum_float_8(acc);
3111
+ #elif defined(__riscv_v_intrinsic)
3112
+ float sumf = 0.0;
3113
+
3114
+ uint32_t qh;
3115
+
3116
+ // These temp values are for masking and shift operations
3117
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3118
+ uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
3119
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
3120
+
3121
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3122
+
3123
+ for (int i = 0; i < nb; i++) {
3124
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3125
+
3126
+ // temporary registers
3127
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
3128
+ vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
3129
+ vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
3130
+ vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
3131
+
3132
+ // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
3133
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
3134
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
3135
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3136
+
3137
+ // ((qh & (1u << (j + 16))) >> (j + 12));
3138
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
3139
+ vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
3140
+
3141
+ // narrowing
3142
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
3143
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3144
+
3145
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
3146
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3147
+
3148
+ // load
3149
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3150
+
3151
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3152
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3153
+
3154
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3155
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3156
+
3157
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3158
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3159
+
3160
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3161
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3162
+
3163
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
3164
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
3165
+
3166
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3167
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3168
+
3169
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3170
+
3171
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3172
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3173
+
3174
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3175
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3176
+
3177
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
3178
+ }
3179
+
3180
+ *s = sumf;
3043
3181
  #else
3044
3182
  // scalar
3045
3183
  float sumf = 0.0;
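
Note: the Q5_0 branch above additionally reconstructs the fifth bit of each quant from the packed 32-bit qh field, following the formulas quoted in its comments. A scalar sketch of that bit extraction, with simplified types and hypothetical names:

    // Scalar sketch of the Q5_0 high-bit reconstruction mirrored by the vector code above.
    #include <stdint.h>
    #include <string.h>

    // Rebuild the two 5-bit quants stored in byte j of qs, given the 32 packed high bits in qh.
    void q5_0_unpack(const uint8_t *qs, const uint8_t *qh_bytes, int j, int *v0, int *v1) {
        uint32_t qh;
        memcpy(&qh, qh_bytes, sizeof(uint32_t));

        const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; // bit j      -> bit 4 of element j
        const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));      // bit j + 16 -> bit 4 of element j + 16

        *v0 = ((qs[j] & 0x0F) | xh_0) - 16;  // low nibble plus high bit, centered at 0
        *v1 = ((qs[j] >>   4) | xh_1) - 16;  // high nibble plus high bit, centered at 0
    }
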
@@ -3072,7 +3210,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3072
3210
  const int nb = n / qk;
3073
3211
 
3074
3212
  assert(n % qk == 0);
3075
- assert(nb % 2 == 0);
3076
3213
  assert(qk == QK5_1);
3077
3214
 
3078
3215
  const block_q5_1 * restrict x = vx;
@@ -3091,6 +3228,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3091
3228
  uint64_t tmp0[4];
3092
3229
  uint64_t tmp1[4];
3093
3230
 
3231
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3094
3232
  for (int i = 0; i < nb; i += 2) {
3095
3233
  const block_q5_1 * restrict x0 = &x[i];
3096
3234
  const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3296,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3296
3434
  }
3297
3435
 
3298
3436
  *s = hsum_float_8(acc) + summs;
3437
+ #elif defined(__riscv_v_intrinsic)
3438
+ float sumf = 0.0;
3439
+
3440
+ uint32_t qh;
3441
+
3442
+ // These temp values are for shift operations
3443
+ uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3444
+
3445
+ size_t vl = __riscv_vsetvl_e8m1(qk/2);
3446
+
3447
+ for (int i = 0; i < nb; i++) {
3448
+ memcpy(&qh, x[i].qh, sizeof(uint32_t));
3449
+
3450
+ // temporary registers
3451
+ vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
3452
+ vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
3453
+
3454
+ // load qh
3455
+ vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
3456
+
3457
+ // ((qh >> (j + 0)) << 4) & 0x10;
3458
+ vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
3459
+ vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
3460
+ vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
3461
+
3462
+ // ((qh >> (j + 12)) ) & 0x10;
3463
+ vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
3464
+ vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
3465
+
3466
+ // narrowing
3467
+ vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
3468
+ vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
3469
+
3470
+ vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
3471
+ vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
3472
+
3473
+ // load
3474
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
3475
+
3476
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
3477
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
3478
+
3479
+ vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
3480
+ vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
3481
+
3482
+ vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
3483
+ vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
3484
+
3485
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
3486
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
3487
+
3488
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
3489
+ vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
3490
+
3491
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
3492
+
3493
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
3494
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
3495
+
3496
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
3497
+ sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
3498
+
3499
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
3500
+ }
3501
+
3502
+ *s = sumf;
3299
3503
  #else
3300
3504
  // scalar
3301
3505
  float sumf = 0.0;
@@ -3328,7 +3532,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3328
3532
  const int nb = n / qk;
3329
3533
 
3330
3534
  assert(n % qk == 0);
3331
- assert(nb % 2 == 0);
3332
3535
 
3333
3536
  const block_q8_0 * restrict x = vx;
3334
3537
  const block_q8_0 * restrict y = vy;
@@ -3337,6 +3540,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3337
3540
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3338
3541
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3339
3542
 
3543
+ GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
3340
3544
  for (int i = 0; i < nb; i += 2) {
3341
3545
  const block_q8_0 * restrict x0 = &x[i + 0];
3342
3546
  const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3407,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3407
3611
  }
3408
3612
 
3409
3613
  *s = hsum_float_8(acc);
3614
+ #elif defined(__riscv_v_intrinsic)
3615
+ float sumf = 0.0;
3616
+ size_t vl = __riscv_vsetvl_e8m1(qk);
3617
+
3618
+ for (int i = 0; i < nb; i++) {
3619
+ // load elements
3620
+ vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
3621
+ vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
3622
+
3623
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
3624
+
3625
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
3626
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
3627
+
3628
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
3629
+
3630
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3631
+ }
3632
+
3633
+ *s = sumf;
3410
3634
  #else
3411
3635
  // scalar
3412
3636
  float sumf = 0.0;
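
Note: the Q8_0 x Q8_0 case above is the simplest of the new RISC-V branches: no unpacking, just a widening multiply (vwmul) and reduction (vwredsum) per block. The equivalent scalar computation, with simplified float scales and hypothetical names:

    // Scalar sketch of a Q8_0 x Q8_0 block dot product.
    #include <stdint.h>

    #define QK8_0 32
    typedef struct { float d; int8_t qs[QK8_0]; } blk_q8_0;

    float dot_q8_0_q8_0(const blk_q8_0 *x, const blk_q8_0 *y, int nb) {
        float sumf = 0.0f;
        for (int i = 0; i < nb; i++) {
            int sumi = 0;
            for (int j = 0; j < QK8_0; j++) {
                sumi += x[i].qs[j] * y[i].qs[j];   // int8 * int8 accumulated in int
            }
            sumf += sumi * x[i].d * y[i].d;        // apply the two per-block scales
        }
        return sumf;
    }
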
@@ -4107,16 +4331,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
4107
4331
  }
4108
4332
 
4109
4333
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
4110
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4111
-
4112
- // this should handle cases where the tensor is not contiguous in memory
4113
- // probaby just:
4114
- //
4115
- // return tensor->ne[3]*tensor->nb[3]
4116
- //
4117
- // is enough, but just in case, adding the second part
4118
-
4119
- return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
4334
+ size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
4335
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
4336
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
4337
+ }
4338
+ return nbytes;
4120
4339
  }
4121
4340
 
4122
4341
  size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
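
Note: the rewritten ggml_nbytes derives the size directly from the strides rather than special-casing 4 dimensions. With ne_i the element count and nb_i the byte stride along dimension i, the loop above computes the byte offset just past the last element:

    \text{nbytes} = \frac{ne_0 \, nb_0}{\text{blck\_size}(type)} + \sum_{i=1}^{\text{GGML\_MAX\_DIMS}-1} (ne_i - 1)\, nb_i

For a contiguous f32 tensor this reduces to ne_0 ne_1 ne_2 ne_3 \cdot 4 bytes, and it stays meaningful for permuted or otherwise non-contiguous layouts, which the previous MAX(...) expression handled less directly.
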
@@ -4570,36 +4789,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4570
4789
  enum ggml_type type,
4571
4790
  int n_dims,
4572
4791
  const int64_t * ne,
4573
- void * data) {
4792
+ struct ggml_tensor * view_src,
4793
+ size_t view_offs) {
4574
4794
 
4575
4795
  assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4576
4796
 
4577
- size_t data_size = 0;
4797
+ // find the base tensor and absolute offset
4798
+ if (view_src != NULL && view_src->view_src != NULL) {
4799
+ view_offs += view_src->view_offs;
4800
+ view_src = view_src->view_src;
4801
+ }
4578
4802
 
4579
- if (data == NULL && !ctx->no_alloc) {
4580
- data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4581
- for (int i = 1; i < n_dims; i++) {
4582
- data_size *= ne[i];
4583
- }
4803
+ size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4804
+ for (int i = 1; i < n_dims; i++) {
4805
+ data_size *= ne[i];
4584
4806
  }
4585
4807
 
4586
- if (ctx->scratch.data != NULL && data == NULL) {
4587
- // allocate tensor data in the scratch buffer
4588
- if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4589
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4590
- __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4591
- assert(false);
4592
- return NULL;
4593
- }
4808
+ GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
4594
4809
 
4595
- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4810
+ void * data = view_src != NULL ? view_src->data : NULL;
4811
+ if (data != NULL) {
4812
+ data = (char *) data + view_offs;
4813
+ }
4596
4814
 
4597
- ctx->scratch.offs += data_size;
4815
+ size_t obj_alloc_size = 0;
4598
4816
 
4599
- data_size = 0;
4817
+ if (view_src == NULL && ctx->no_alloc == false) {
4818
+ if (ctx->scratch.data != NULL) {
4819
+ // allocate tensor data in the scratch buffer
4820
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4821
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4822
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4823
+ assert(false);
4824
+ return NULL;
4825
+ }
4826
+
4827
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4828
+
4829
+ ctx->scratch.offs += data_size;
4830
+ } else {
4831
+ // allocate tensor data in the context's memory pool
4832
+ obj_alloc_size = data_size;
4833
+ }
4600
4834
  }
4601
4835
 
4602
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4836
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
4603
4837
 
4604
4838
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4605
4839
 
@@ -4619,7 +4853,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4619
4853
  /*.perf_runs =*/ 0,
4620
4854
  /*.perf_cycles =*/ 0,
4621
4855
  /*.perf_time_us =*/ 0,
4622
- /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4856
+ /*.view_src =*/ view_src,
4857
+ /*.view_offs =*/ view_offs,
4858
+ /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
4623
4859
  /*.name =*/ { 0 },
4624
4860
  /*.extra =*/ NULL,
4625
4861
  /*.padding =*/ { 0 },
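
Note: ggml_new_tensor_impl now takes a (view_src, view_offs) pair instead of a raw data pointer, and the tensor struct records both. Because a view created this way always stores the ultimate base tensor, a view of a view needs only a single hop to resolve, which is what the "find the base tensor and absolute offset" block above does; views then reserve no object storage of their own (obj_alloc_size stays 0) and simply point into base->data + view_offs once the base is allocated. A minimal sketch of the resolution step, using a stripped-down stand-in for ggml_tensor:

    // Sketch of view flattening: every view stores the base tensor plus an absolute byte offset.
    #include <stddef.h>

    struct toy_tensor {
        struct toy_tensor * view_src;   // NULL for a base tensor
        size_t              view_offs;  // absolute offset into the base tensor's data
        void              * data;
    };

    void resolve_view(struct toy_tensor ** src, size_t * offs) {
        // one step suffices: the view_src of an existing view is already the base tensor
        if (*src != NULL && (*src)->view_src != NULL) {
            *offs += (*src)->view_offs;
            *src   = (*src)->view_src;
        }
    }
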
@@ -4643,28 +4879,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4643
4879
  return result;
4644
4880
  }
4645
4881
 
4646
- static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4647
- GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4648
- assert(params_size <= GGML_MAX_OP_PARAMS);
4649
- memcpy(tensor->op_params, params, params_size);
4650
- }
4651
-
4652
- static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4653
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4654
- return ((const int32_t *)(tensor->op_params))[i];
4655
- }
4656
-
4657
- static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4658
- assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4659
- ((int32_t *)(tensor->op_params))[i] = value;
4660
- }
4661
-
4662
4882
  struct ggml_tensor * ggml_new_tensor(
4663
4883
  struct ggml_context * ctx,
4664
4884
  enum ggml_type type,
4665
4885
  int n_dims,
4666
4886
  const int64_t * ne) {
4667
- return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4887
+ return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
4668
4888
  }
4669
4889
 
4670
4890
  struct ggml_tensor * ggml_new_tensor_1d(
@@ -4729,7 +4949,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
4729
4949
  }
4730
4950
 
4731
4951
  struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
4732
- return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
4952
+ return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
4953
+ }
4954
+
4955
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4956
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4957
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4958
+ memcpy(tensor->op_params, params, params_size);
4959
+ }
4960
+
4961
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4962
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4963
+ return ((const int32_t *)(tensor->op_params))[i];
4964
+ }
4965
+
4966
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4967
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4968
+ ((int32_t *)(tensor->op_params))[i] = value;
4733
4969
  }
4734
4970
 
4735
4971
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5015,14 +5251,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
5015
5251
 
5016
5252
  struct ggml_tensor * ggml_view_tensor(
5017
5253
  struct ggml_context * ctx,
5018
- const struct ggml_tensor * src) {
5019
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
5254
+ struct ggml_tensor * src) {
5255
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
5020
5256
  ggml_format_name(result, "%s (view)", src->name);
5021
5257
 
5022
- result->nb[0] = src->nb[0];
5023
- result->nb[1] = src->nb[1];
5024
- result->nb[2] = src->nb[2];
5025
- result->nb[3] = src->nb[3];
5258
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
5259
+ result->nb[i] = src->nb[i];
5260
+ }
5026
5261
 
5027
5262
  return result;
5028
5263
  }
@@ -5595,7 +5830,7 @@ struct ggml_tensor * ggml_repeat_back(
5595
5830
 
5596
5831
  // ggml_concat
5597
5832
 
5598
- struct ggml_tensor* ggml_concat(
5833
+ struct ggml_tensor * ggml_concat(
5599
5834
  struct ggml_context* ctx,
5600
5835
  struct ggml_tensor* a,
5601
5836
  struct ggml_tensor* b) {
@@ -5862,7 +6097,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
5862
6097
  struct ggml_tensor * ggml_rms_norm_back(
5863
6098
  struct ggml_context * ctx,
5864
6099
  struct ggml_tensor * a,
5865
- struct ggml_tensor * b) {
6100
+ struct ggml_tensor * b,
6101
+ float eps) {
5866
6102
  bool is_node = false;
5867
6103
 
5868
6104
  if (a->grad) {
@@ -5872,6 +6108,8 @@ struct ggml_tensor * ggml_rms_norm_back(
5872
6108
 
5873
6109
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
5874
6110
 
6111
+ ggml_set_op_params(result, &eps, sizeof(eps));
6112
+
5875
6113
  result->op = GGML_OP_RMS_NORM_BACK;
5876
6114
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5877
6115
  result->src[0] = a;
@@ -6201,7 +6439,7 @@ struct ggml_tensor * ggml_reshape(
6201
6439
  //GGML_ASSERT(false);
6202
6440
  }
6203
6441
 
6204
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6442
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
6205
6443
  ggml_format_name(result, "%s (reshaped)", a->name);
6206
6444
 
6207
6445
  result->op = GGML_OP_RESHAPE;
@@ -6225,7 +6463,7 @@ struct ggml_tensor * ggml_reshape_1d(
6225
6463
  }
6226
6464
 
6227
6465
  const int64_t ne[1] = { ne0 };
6228
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6466
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
6229
6467
  ggml_format_name(result, "%s (reshaped)", a->name);
6230
6468
 
6231
6469
  result->op = GGML_OP_RESHAPE;
@@ -6250,7 +6488,7 @@ struct ggml_tensor * ggml_reshape_2d(
6250
6488
  }
6251
6489
 
6252
6490
  const int64_t ne[2] = { ne0, ne1 };
6253
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6491
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
6254
6492
  ggml_format_name(result, "%s (reshaped)", a->name);
6255
6493
 
6256
6494
  result->op = GGML_OP_RESHAPE;
@@ -6276,7 +6514,7 @@ struct ggml_tensor * ggml_reshape_3d(
6276
6514
  }
6277
6515
 
6278
6516
  const int64_t ne[3] = { ne0, ne1, ne2 };
6279
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6517
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
6280
6518
  ggml_format_name(result, "%s (reshaped)", a->name);
6281
6519
 
6282
6520
  result->op = GGML_OP_RESHAPE;
@@ -6286,7 +6524,6 @@ struct ggml_tensor * ggml_reshape_3d(
6286
6524
  return result;
6287
6525
  }
6288
6526
 
6289
-
6290
6527
  struct ggml_tensor * ggml_reshape_4d(
6291
6528
  struct ggml_context * ctx,
6292
6529
  struct ggml_tensor * a,
@@ -6304,7 +6541,7 @@ struct ggml_tensor * ggml_reshape_4d(
6304
6541
  }
6305
6542
 
6306
6543
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6307
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6544
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
6308
6545
  ggml_format_name(result, "%s (reshaped)", a->name);
6309
6546
 
6310
6547
  result->op = GGML_OP_RESHAPE;
@@ -6314,46 +6551,40 @@ struct ggml_tensor * ggml_reshape_4d(
6314
6551
  return result;
6315
6552
  }
6316
6553
 
6317
- // ggml_view_1d
6318
-
6319
- static struct ggml_tensor * ggml_view_tensor_offset(
6554
+ static struct ggml_tensor * ggml_view_impl(
6320
6555
  struct ggml_context * ctx,
6321
6556
  struct ggml_tensor * a,
6322
6557
  int n_dims,
6323
6558
  const int64_t * ne,
6324
6559
  size_t offset) {
6325
- // don't calculate an offset from an unallocated tensor
6326
- void * data = NULL;
6327
- if (a->data != NULL) {
6328
- data = (char *) a->data + offset;
6329
- }
6330
6560
 
6331
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6561
+ bool is_node = false;
6562
+
6563
+ if (a->grad) {
6564
+ is_node = true;
6565
+ }
6332
6566
 
6567
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
6333
6568
  ggml_format_name(result, "%s (view)", a->name);
6334
6569
 
6335
6570
  ggml_set_op_params(result, &offset, sizeof(offset));
6336
6571
 
6572
+ result->op = GGML_OP_VIEW;
6573
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6574
+ result->src[0] = a;
6575
+
6337
6576
  return result;
6338
6577
  }
6339
6578
 
6579
+ // ggml_view_1d
6580
+
6340
6581
  struct ggml_tensor * ggml_view_1d(
6341
6582
  struct ggml_context * ctx,
6342
6583
  struct ggml_tensor * a,
6343
6584
  int64_t ne0,
6344
6585
  size_t offset) {
6345
6586
 
6346
- bool is_node = false;
6347
-
6348
- if (a->grad) {
6349
- is_node = true;
6350
- }
6351
-
6352
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
6353
-
6354
- result->op = GGML_OP_VIEW;
6355
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6356
- result->src[0] = a;
6587
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
6357
6588
 
6358
6589
  return result;
6359
6590
  }
@@ -6368,24 +6599,14 @@ struct ggml_tensor * ggml_view_2d(
6368
6599
  size_t nb1,
6369
6600
  size_t offset) {
6370
6601
 
6371
- bool is_node = false;
6372
-
6373
- if (a->grad) {
6374
- is_node = true;
6375
- }
6376
-
6377
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6602
+ const int64_t ne[2] = { ne0, ne1 };
6378
6603
 
6379
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
6604
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
6380
6605
 
6381
6606
  result->nb[1] = nb1;
6382
6607
  result->nb[2] = result->nb[1]*ne1;
6383
6608
  result->nb[3] = result->nb[2];
6384
6609
 
6385
- result->op = GGML_OP_VIEW;
6386
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6387
- result->src[0] = a;
6388
-
6389
6610
  return result;
6390
6611
  }
6391
6612
 
@@ -6401,24 +6622,14 @@ struct ggml_tensor * ggml_view_3d(
6401
6622
  size_t nb2,
6402
6623
  size_t offset) {
6403
6624
 
6404
- bool is_node = false;
6405
-
6406
- if (a->grad) {
6407
- is_node = true;
6408
- }
6409
-
6410
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6625
+ const int64_t ne[3] = { ne0, ne1, ne2 };
6411
6626
 
6412
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
6627
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
6413
6628
 
6414
6629
  result->nb[1] = nb1;
6415
6630
  result->nb[2] = nb2;
6416
6631
  result->nb[3] = result->nb[2]*ne2;
6417
6632
 
6418
- result->op = GGML_OP_VIEW;
6419
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6420
- result->src[0] = a;
6421
-
6422
6633
  return result;
6423
6634
  }
6424
6635
 
@@ -6436,24 +6647,14 @@ struct ggml_tensor * ggml_view_4d(
6436
6647
  size_t nb3,
6437
6648
  size_t offset) {
6438
6649
 
6439
- bool is_node = false;
6440
-
6441
- if (a->grad) {
6442
- is_node = true;
6443
- }
6444
-
6445
- const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6650
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6446
6651
 
6447
- struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
6652
+ struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
6448
6653
 
6449
6654
  result->nb[1] = nb1;
6450
6655
  result->nb[2] = nb2;
6451
6656
  result->nb[3] = nb3;
6452
6657
 
6453
- result->op = GGML_OP_VIEW;
6454
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6455
- result->src[0] = a;
6456
-
6457
6658
  return result;
6458
6659
  }
6459
6660
 
@@ -6640,7 +6841,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6640
6841
 
6641
6842
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6642
6843
 
6643
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6844
+ int32_t params[] = { n_past };
6644
6845
  ggml_set_op_params(result, params, sizeof(params));
6645
6846
 
6646
6847
  result->op = GGML_OP_DIAG_MASK_INF;
@@ -6657,7 +6858,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
6657
6858
  return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
6658
6859
  }
6659
6860
 
6660
-
6661
6861
  struct ggml_tensor * ggml_diag_mask_inf_inplace(
6662
6862
  struct ggml_context * ctx,
6663
6863
  struct ggml_tensor * a,
@@ -6680,7 +6880,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6680
6880
 
6681
6881
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6682
6882
 
6683
- int32_t params[] = { n_past, inplace ? 1 : 0 };
6883
+ int32_t params[] = { n_past };
6684
6884
  ggml_set_op_params(result, params, sizeof(params));
6685
6885
 
6686
6886
  result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -7097,11 +7297,13 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
7097
7297
  };
7098
7298
 
7099
7299
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7300
+
7301
+ ggml_set_op_params_i32(result, 0, stride);
7302
+
7100
7303
  result->op = GGML_OP_CONV_TRANSPOSE_2D;
7101
7304
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7102
7305
  result->src[0] = a;
7103
7306
  result->src[1] = b;
7104
- result->src[2] = ggml_new_i32(ctx, stride);
7105
7307
 
7106
7308
  return result;
7107
7309
  }
@@ -9446,6 +9648,8 @@ static void ggml_compute_forward_div_f32(
9446
9648
 
9447
9649
 
9448
9650
  #ifdef GGML_USE_ACCELERATE
9651
+ UNUSED(ggml_vec_div_f32);
9652
+
9449
9653
  vDSP_vdiv(
9450
9654
  (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
9451
9655
  (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -10752,7 +10956,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
10752
10956
 
10753
10957
  GGML_TENSOR_BINARY_OP_LOCALS;
10754
10958
 
10755
- const float eps = 1e-6f; // TODO: make this a parameter
10959
+ float eps;
10960
+ memcpy(&eps, dst->op_params, sizeof(float));
10756
10961
 
10757
10962
  // TODO: optimize
10758
10963
  for (int64_t i03 = 0; i03 < ne03; i03++) {
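
Note: the epsilon for GGML_OP_RMS_NORM_BACK is now carried in the destination tensor's op_params blob (written with ggml_set_op_params at graph-build time, see the ggml_rms_norm_back hunk above) instead of being hard-coded to 1e-6f. op_params is an int32 array, so the float travels in and out via memcpy. A minimal self-contained sketch of the pattern, independent of ggml's structs (MAX_OP_PARAMS_BYTES is a hypothetical stand-in for GGML_MAX_OP_PARAMS):

    // Sketch of packing a float into an int32-typed op_params array and reading it back.
    #include <stdint.h>
    #include <string.h>
    #include <assert.h>

    #define MAX_OP_PARAMS_BYTES 32   // hypothetical size, stands in for GGML_MAX_OP_PARAMS

    int main(void) {
        int32_t op_params[MAX_OP_PARAMS_BYTES / sizeof(int32_t)] = {0};

        const float eps_in = 1e-5f;
        memcpy(op_params, &eps_in, sizeof(eps_in));   // like ggml_set_op_params(result, &eps, sizeof(eps))

        float eps_out;
        memcpy(&eps_out, op_params, sizeof(float));   // like the read in ggml_compute_forward_rms_norm_back_f32
        assert(eps_out == eps_in);                    // bit-exact round trip
        return 0;
    }
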
@@ -11930,8 +12135,8 @@ static void ggml_compute_forward_diag_mask_f32(
11930
12135
  const int ith = params->ith;
11931
12136
  const int nth = params->nth;
11932
12137
 
11933
- const int n_past = ((int32_t *) dst->op_params)[0];
11934
- const bool inplace = (bool)((int32_t *) dst->op_params)[1];
12138
+ const int n_past = ((int32_t *) dst->op_params)[0];
12139
+ const bool inplace = src0->data == dst->data;
11935
12140
 
11936
12141
  GGML_ASSERT(n_past >= 0);
11937
12142
 
@@ -12142,6 +12347,7 @@ static void ggml_compute_forward_soft_max_back_f32(
12142
12347
  // dx = J * dy
12143
12348
  // dxk = sum_i(Jki * dyi)
12144
12349
  // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
12350
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
12145
12351
  // dxk = sum_i(-yk*yi * dyi) + yk*dyk
12146
12352
  // dxk = -yk * sum_i(yi * dyi) + yk*dyk
12147
12353
  // dxk = -yk * dot(y, dy) + yk*dyk
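
Note: the inserted comment line fills in the intermediate algebra of the softmax backward pass. In LaTeX, with y = softmax(x) and Jacobian J_{ki} = \partial y_k / \partial x_i = y_k(\delta_{ki} - y_i):

    dx_k = \sum_i J_{ki}\, dy_i
         = \sum_i y_k(\delta_{ki} - y_i)\, dy_i
         = y_k\, dy_k - y_k \sum_i y_i\, dy_i
         = y_k\,\bigl(dy_k - y \cdot dy\bigr)

which matches the final line of the existing comment: dx = y * (dy - dot(y, dy)).
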
@@ -13497,7 +13703,6 @@ static void ggml_compute_forward_conv_transpose_2d(
13497
13703
  const struct ggml_compute_params * params,
13498
13704
  const struct ggml_tensor * src0,
13499
13705
  const struct ggml_tensor * src1,
13500
- const struct ggml_tensor * opt0,
13501
13706
  struct ggml_tensor * dst) {
13502
13707
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13503
13708
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13557,7 +13762,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13557
13762
  return;
13558
13763
  }
13559
13764
 
13560
- const int32_t stride = ((const int32_t*)(opt0->data))[0];
13765
+ const int32_t stride = ggml_get_op_params_i32(dst, 0);
13561
13766
 
13562
13767
  // total patches in dst
13563
13768
  const int np = ne2;
@@ -13570,7 +13775,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13570
13775
  const int ip1 = MIN(ip0 + dp, np);
13571
13776
 
13572
13777
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13573
- ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
13778
+ ggml_fp16_t * const wdata_src = wdata + nk;
13574
13779
 
13575
13780
  for (int i2 = ip0; i2 < ip1; i2++) { // Cout
13576
13781
  float * dst_data = (float *)((char *) dst->data + i2*nb2);
@@ -13582,9 +13787,8 @@ static void ggml_compute_forward_conv_transpose_2d(
13582
13787
  for (int i00 = 0; i00 < ne00; i00++) {
13583
13788
  float v = 0;
13584
13789
  ggml_vec_dot_f16(ne03, &v,
13585
- (ggml_fp16_t *) wdata_src + i1n,
13586
- (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
13587
-
13790
+ wdata_src + i1n,
13791
+ wdata_kernel + i01*ne00*ne03 + i00*ne03);
13588
13792
  dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
13589
13793
  }
13590
13794
  }
@@ -13934,7 +14138,7 @@ static void ggml_compute_forward_flash_attn_f32(
13934
14138
  vvexpf(S, S, &Mup);
13935
14139
  ggml_vec_sum_f32(Mup, &sum, S);
13936
14140
  #else
13937
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14141
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
13938
14142
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
13939
14143
 
13940
14144
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13944,9 +14148,13 @@ static void ggml_compute_forward_flash_attn_f32(
13944
14148
  if (SS[j] == -INFINITY) {
13945
14149
  SS[j] = 0.0f;
13946
14150
  } else {
14151
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14152
+ const float val = expf(SS[j] - max);
14153
+ #else
13947
14154
  ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
13948
14155
  memcpy(&scvt[j], &s, sizeof(uint16_t));
13949
14156
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14157
+ #endif
13950
14158
  sump[j] += (ggml_float)val;
13951
14159
  SS[j] = val;
13952
14160
  }
@@ -14524,7 +14732,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
14524
14732
  vvexpf(SM, SM, &Mup);
14525
14733
  ggml_vec_sum_f32(Mup, &sum, SM);
14526
14734
  #else
14527
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
14735
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
14528
14736
  ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
14529
14737
 
14530
14738
  for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -14535,9 +14743,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
14535
14743
  if (SR[j] == -INFINITY) {
14536
14744
  SW[j] = 0.0f;
14537
14745
  } else {
14746
+ #ifndef GGML_FLASH_ATTN_EXP_FP16
14747
+ const float val = expf(SR[j] - max);
14748
+ #else
14538
14749
  ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
14539
14750
  memcpy(&scvt[j], &s, sizeof(uint16_t));
14540
14751
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
14752
+ #endif
14541
14753
  sump[j] += (ggml_float)val;
14542
14754
  SW[j] = val;
14543
14755
  }
@@ -15275,6 +15487,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15275
15487
  const int nc = src0->ne[0];
15276
15488
  const int nr = ggml_nrows(src0);
15277
15489
 
15490
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15491
+
15278
15492
  if (params->type == GGML_TASK_INIT) {
15279
15493
  if (ith == 0) {
15280
15494
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -15286,7 +15500,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15286
15500
  if (ith == 0) {
15287
15501
  float * dp = (float *) dst->data;
15288
15502
  ggml_vec_sum_f32(nth, dp, sums);
15289
- dp[0] *= -1.0f;
15503
+ dp[0] *= -1.0f / (float) nr;
15290
15504
  }
15291
15505
  return;
15292
15506
  }
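
Note: with the added 1/nr factor, the cross-entropy op now returns the mean over the nr rows rather than the sum. Roughly, with p = softmax(s0) per row and s1 the target distribution (ggml also applies a small eps smoothing inside the log, as the backward code further below reflects):

    L = -\frac{1}{nr} \sum_{r=1}^{nr} \sum_{c=1}^{nc} s1_{r,c} \,\log p_{r,c}

Averaging keeps the reported loss, and the matching gradient below, independent of the number of rows in the batch.
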
@@ -15303,7 +15517,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15303
15517
  for (int i1 = ir0; i1 < ir1; i1++) {
15304
15518
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15305
15519
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15306
- float * st = (float *) params->wdata + nth + ith*nc;
15520
+ float * st = ((float *) params->wdata) + nth + ith*nc;
15307
15521
 
15308
15522
  #ifndef NDEBUG
15309
15523
  for (int i = 0; i < nc; ++i) {
@@ -15318,15 +15532,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15318
15532
  float max = -INFINITY;
15319
15533
  ggml_vec_max_f32(nc, &max, s0);
15320
15534
 
15321
- uint16_t scvt;
15535
+ uint16_t scvt; UNUSED(scvt);
15322
15536
  for (int i = 0; i < nc; i++) {
15323
15537
  if (s0[i] == -INFINITY) {
15324
15538
  st[i] = 0.0f;
15325
15539
  } else {
15326
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15540
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15541
+ const float s = s0[i] - max;
15542
+ const float val = expf(s);
15543
+ #else
15327
15544
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15328
15545
  memcpy(&scvt, &s, sizeof(scvt));
15329
15546
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15547
+ #endif
15330
15548
  sum += (ggml_float)val;
15331
15549
  st[i] = val;
15332
15550
  }
@@ -15342,7 +15560,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15342
15560
  ggml_vec_log_f32(nc, st, st);
15343
15561
  ggml_vec_mul_f32(nc, st, st, s1);
15344
15562
 
15345
- ggml_vec_sum_f32(nc, sums + ith, st);
15563
+ float st_sum = 0;
15564
+ ggml_vec_sum_f32(nc, &st_sum, st);
15565
+ sums[ith] += st_sum;
15346
15566
 
15347
15567
  #ifndef NDEBUG
15348
15568
  for (int i = 0; i < nc; ++i) {
@@ -15392,7 +15612,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15392
15612
  return;
15393
15613
  }
15394
15614
 
15395
- const float eps = 1e-9f;
15615
+ const double eps = 1e-9;
15396
15616
 
15397
15617
  // TODO: handle transposed/permuted matrices
15398
15618
  const int64_t nc = src0->ne[0];
@@ -15411,7 +15631,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15411
15631
  float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
15412
15632
  float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
15413
15633
  float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
15414
- float * sm = (float *) params->wdata + ith*nc;
15415
15634
 
15416
15635
  #ifndef NDEBUG
15417
15636
  for (int i = 0; i < nc; ++i) {
@@ -15420,54 +15639,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15420
15639
  assert(!isnan(s1[i]));
15421
15640
  }
15422
15641
  #endif
15423
- // step by step explanation:
15424
- {
15425
- //float * sums = (float *) params->wdata;
15426
-
15427
- // forward pass with annotated gradients from backward pass
15428
- // (built by going in reverse operation order, adding to gradients of current operation args)
15429
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
15430
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15431
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
15432
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
15433
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
15434
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
15435
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
15436
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
15437
-
15438
- // substitute into grad[st1], because we can reuse softmax_back from this point on
15439
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
15440
- // postorder:
15441
- // grad[st1] := softmax(s0)
15442
- // grad[st1] := grad[st1]*(1.0 - eps)
15443
- // grad[st1] := grad[st1] + eps
15444
- // grad[st1] := s1 / grad[st1]
15445
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
15446
-
15447
- // src0 gradients by going through softmax_back
15448
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
15449
- // from softmax_back:
15450
- // dxk = yk * (dyk - dot(y, dy))
15451
- // dot_y_dy := dot(y, dy)
15452
- // dx := dy
15453
- // dx := dx - dot_y_dy
15454
- // dx := dx * y
15455
- // postorder:
15456
- // dot_st1_dst1 := dot(st1, grad[st1])
15457
- // grad[s0] := grad[st1]
15458
- // grad[s0] := grad[s0] - dot_st1_dst1
15459
- // grad[s0] := grad[s0] * st1
15460
-
15461
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
15462
- // sm := softmax(s0)
15463
- // grad[s0] := sm*(1.0 - eps)
15464
- // grad[s0] := grad[s0] + eps
15465
- // grad[s0] := s1 / grad[s0]
15466
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
15467
- // dot_st1_dst1 := dot(sm, grad[s0])
15468
- // grad[s0] := grad[s0] - dot_st1_dst1
15469
- // grad[s0] := grad[s0] * sm
15470
- }
15471
15642
 
15472
15643
  // soft_max
15473
15644
  ggml_float sum = 0.0;
@@ -15475,39 +15646,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15475
15646
  float max = -INFINITY;
15476
15647
  ggml_vec_max_f32(nc, &max, s0);
15477
15648
 
15478
- uint16_t scvt;
15649
+ uint16_t scvt; UNUSED(scvt);
15479
15650
  for (int i = 0; i < nc; i++) {
15480
15651
  if (s0[i] == -INFINITY) {
15481
- sm[i] = 0.0f;
15652
+ ds0[i] = 0.0f;
15482
15653
  } else {
15483
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
15654
+ #ifndef GGML_CROSS_ENTROPY_EXP_FP16
15655
+ const float s = s0[i] - max;
15656
+ const float val = expf(s);
15657
+ #else
15484
15658
  ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
15485
15659
  memcpy(&scvt, &s, sizeof(scvt));
15486
15660
  const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
15661
+ #endif
15487
15662
  sum += (ggml_float)val;
15488
- sm[i] = val;
15663
+ ds0[i] = val;
15489
15664
  }
15490
15665
  }
15491
15666
 
15492
15667
  assert(sum > 0.0);
15493
- sum = 1.0/sum;
15668
+ sum = (1.0 - eps)/sum;
15494
15669
  }
15495
15670
 
15496
- float dot_st1_dst1 = 0;
15497
- ggml_vec_scale_f32(nc, sm, sum);
15498
- ggml_vec_cpy_f32 (nc, ds0, sm);
15499
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
15500
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
15501
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
15502
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
15503
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
15504
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
15505
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
15671
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
15672
+ ggml_vec_scale_f32(nc, ds0, sum);
15673
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
15674
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
15675
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
15676
+
15506
15677
 
15507
15678
  #ifndef NDEBUG
15508
15679
  for (int i = 0; i < nc; ++i) {
15509
- assert(!isnan(sm[i]));
15510
- assert(!isinf(sm[i]));
15511
15680
  assert(!isnan(ds0[i]));
15512
15681
  assert(!isinf(ds0[i]));
15513
15682
  }
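
Note: the backward pass no longer re-derives the gradient through the chain of vector ops spelled out in the deleted comment block; it uses the closed form directly. Writing p = softmax(s0), d for the incoming loss gradient, and keeping the eps smoothing, the code above computes per row:

    \frac{\partial L}{\partial s0_c} = \Bigl(\epsilon + (1-\epsilon)\,p_c - s1_c\Bigr)\,\frac{d}{nr} \;\approx\; (p_c - s1_c)\,\frac{d}{nr}

which is the standard softmax-cross-entropy gradient, scaled by 1/nr to match the averaged forward loss. The per-row scratch buffer sm is gone because ds0 itself can hold the intermediate softmax, which is also why the work-size reservation for GGML_OP_CROSS_ENTROPY_LOSS_BACK is removed in ggml_graph_plan further below.
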
@@ -15731,7 +15900,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15731
15900
  } break;
15732
15901
  case GGML_OP_CONV_TRANSPOSE_2D:
15733
15902
  {
15734
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15903
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
15735
15904
  } break;
15736
15905
  case GGML_OP_POOL_1D:
15737
15906
  {
@@ -16062,9 +16231,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16062
16231
  {
16063
16232
  // necessary for llama
16064
16233
  if (src0->grad) {
16234
+ float eps;
16235
+ memcpy(&eps, tensor->op_params, sizeof(float));
16236
+
16065
16237
  src0->grad = ggml_add_impl(ctx,
16066
16238
  src0->grad,
16067
- ggml_rms_norm_back(ctx, src0, tensor->grad),
16239
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
16068
16240
  inplace);
16069
16241
  }
16070
16242
  } break;
@@ -16832,9 +17004,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16832
17004
  return result;
16833
17005
  }
16834
17006
 
16835
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
16836
- struct ggml_cgraph result = *gf;
16837
-
17007
+ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
16838
17008
  GGML_ASSERT(gf->n_nodes > 0);
16839
17009
 
16840
17010
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16858,15 +17028,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16858
17028
  }
16859
17029
  }
16860
17030
 
16861
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
17031
+ for (int i = 0; i < gf->n_nodes; i++) {
16862
17032
  struct ggml_tensor * node = gf->nodes[i];
16863
17033
 
16864
17034
  if (node->is_param) {
16865
17035
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16866
- ggml_build_forward_expand(&result, node->grad);
17036
+ ggml_build_forward_expand(gb, node->grad);
16867
17037
  }
16868
17038
  }
17039
+ }
16869
17040
 
17041
+ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
17042
+ struct ggml_cgraph result = *gf;
17043
+ ggml_build_backward_expand(ctx, gf, &result, keep);
16870
17044
  return result;
16871
17045
  }
16872
17046
 
@@ -17542,10 +17716,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
17542
17716
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
17543
17717
  {
17544
17718
  n_tasks = n_threads;
17545
-
17546
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
17547
-
17548
- work_size = MAX(work_size, cur);
17549
17719
  } break;
17550
17720
  case GGML_OP_NONE:
17551
17721
  {
@@ -18423,14 +18593,16 @@ static enum ggml_opt_result ggml_opt_adam(
18423
18593
  struct ggml_opt_params params,
18424
18594
  struct ggml_tensor * f,
18425
18595
  struct ggml_cgraph * gf,
18426
- struct ggml_cgraph * gb) {
18596
+ struct ggml_cgraph * gb,
18597
+ ggml_opt_callback callback,
18598
+ void * callback_data) {
18427
18599
  GGML_ASSERT(ggml_is_scalar(f));
18428
18600
 
18429
18601
  // these will store the parameters we want to optimize
18430
18602
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
18431
18603
 
18432
18604
  int np = 0;
18433
- int nx = 0;
18605
+ int64_t nx = 0;
18434
18606
  for (int i = 0; i < gf->n_nodes; ++i) {
18435
18607
  if (gf->nodes[i]->is_param) {
18436
18608
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -18449,31 +18621,32 @@ static enum ggml_opt_result ggml_opt_adam(
18449
18621
  }
18450
18622
 
18451
18623
  // constants
18452
- const float sched = params.adam.sched;
18453
- const float decay = params.adam.decay * sched;
18454
- const float alpha = params.adam.alpha * sched;
18624
+ float sched = params.adam.sched;
18625
+ const float alpha = params.adam.alpha;
18626
+ const float decay = params.adam.decay * alpha;
18455
18627
  const float beta1 = params.adam.beta1;
18456
18628
  const float beta2 = params.adam.beta2;
18457
18629
  const float eps = params.adam.eps;
18630
+ const float gclip = params.adam.gclip;
18631
+ const int decay_min_ndim = params.adam.decay_min_ndim;
18458
18632
 
18459
- float * x = opt->adam.x->data; // view of the parameters
18460
- float * g1 = opt->adam.g1->data; // gradient
18461
- float * g2 = opt->adam.g2->data; // gradient squared
18462
18633
  float * m = opt->adam.m->data; // first moment
18463
18634
  float * v = opt->adam.v->data; // second moment
18464
- float * mh = opt->adam.mh->data; // first moment hat
18465
- float * vh = opt->adam.vh->data; // second moment hat
18466
18635
 
18467
18636
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18468
18637
 
18469
- // update view
18470
- ggml_opt_get_params(np, ps, x);
18638
+ if (callback) {
18639
+ callback(callback_data, &sched);
18640
+ }
18471
18641
 
18472
18642
  // compute the function value
18473
18643
  ggml_graph_reset (gf);
18474
18644
  ggml_set_f32 (f->grad, 1.0f);
18475
18645
 
18476
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18646
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18647
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18648
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18649
+ ggml_graph_compute(gb, &cplan);
18477
18650
 
18478
18651
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
18479
18652
  opt->adam.fx_best = opt->adam.fx_prev;
@@ -18481,6 +18654,9 @@ static enum ggml_opt_result ggml_opt_adam(
18481
18654
  pf[opt->iter % params.past] = opt->adam.fx_prev;
18482
18655
  }
18483
18656
 
18657
+ opt->loss_before = opt->adam.fx_prev;
18658
+ opt->loss_after = opt->adam.fx_prev;
18659
+
18484
18660
  // initialize
18485
18661
  if (opt->just_initialized) {
18486
18662
  opt->adam.n_no_improvement = 0;
@@ -18513,50 +18689,55 @@ static enum ggml_opt_result ggml_opt_adam(
18513
18689
  UNUSED(t_start_cpu);
18514
18690
 
18515
18691
  {
18516
- // update the gradient
18517
- ggml_opt_get_grad(np, ps, g1);
18518
-
18519
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
18520
- ggml_vec_scale_f32(nx, m, beta1);
18521
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
18522
-
18523
- // g2 = g1^2
18524
- ggml_vec_sqr_f32 (nx, g2, g1);
18525
-
18526
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
18527
- ggml_vec_scale_f32(nx, v, beta2);
18528
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
18529
-
18530
- // m^hat = m_t / (1 - beta1^t)
18531
- // v^hat = v_t / (1 - beta2^t)
18532
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
18533
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
18534
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
18535
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
18536
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
18537
- ggml_vec_cpy_f32 (nx, mh, m);
18538
- ggml_vec_cpy_f32 (nx, vh, v);
18539
-
18540
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
18541
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
18542
-
18543
- ggml_vec_sqrt_f32 (nx, vh, vh);
18544
- ggml_vec_acc1_f32 (nx, vh, eps);
18545
-
18546
- ggml_vec_div_f32 (nx, mh, mh, vh);
18547
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
18548
- ggml_vec_sub_f32 (nx, x, x, mh);
18692
+ float gnorm = 1.0f;
18693
+ if (gclip > 0.0f) {
18694
+ // gradient clipping
18695
+ ggml_float sum = 0.0;
18696
+ for (int p = 0; p < np; ++p) {
18697
+ const int64_t ne = ggml_nelements(ps[p]);
18698
+ for (int64_t j = 0; j < ne; ++j) {
18699
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
18700
+ sum += (ggml_float)(g*g);
18701
+ }
18702
+ }
18703
+ ggml_float norm = sqrt(sum);
18704
+ if (norm > (ggml_float) gclip) {
18705
+ gnorm = (float) ((ggml_float) gclip / norm);
18706
+ }
18707
+ }
18708
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
18709
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
18710
+ int64_t i = 0;
18711
+ for (int p = 0; p < np; ++p) {
18712
+ const int64_t ne = ggml_nelements(ps[p]);
18713
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
18714
+ for (int64_t j = 0; j < ne; ++j) {
18715
+ float x = ggml_get_f32_1d(ps[p], j);
18716
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
18717
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
18718
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
18719
+ float mh = m[i]*beta1h;
18720
+ float vh = v[i]*beta2h;
18721
+ vh = sqrtf(vh) + eps;
18722
+ x = x*(1.0f - p_decay) - mh/vh;
18723
+ ggml_set_f32_1d(ps[p], j, x);
18724
+ ++i;
18725
+ }
18726
+ }
18727
+ }
18549
18728
 
18550
- // update the parameters
18551
- ggml_opt_set_params(np, ps, x);
18729
+ if (callback) {
18730
+ callback(callback_data, &sched);
18552
18731
  }
18553
18732
 
18554
18733
  ggml_graph_reset (gf);
18555
18734
  ggml_set_f32 (f->grad, 1.0f);
18556
18735
 
18557
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
18736
+ ggml_graph_compute(gb, &cplan);
18558
18737
 
18559
18738
  const float fx = ggml_get_f32_1d(f, 0);
18739
+ opt->loss_after = fx;
18740
+
18560
18741
 
18561
18742
  // check convergence
18562
18743
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
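
Note: the Adam step above is now a single per-element loop over the parameter tensors, with optional global-norm gradient clipping and decoupled weight decay, replacing the previous sequence of whole-vector ops on the x/g1/g2/mh/vh work buffers (which are dropped entirely). Per parameter element, with t = opt->iter:

    g \leftarrow g \cdot \min\!\bigl(1,\; \text{gclip} / \lVert g \rVert_2\bigr)  \quad (\lVert g \rVert_2 \text{ taken over all parameters})
    m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g
    v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g^2
    \hat m = m_t \cdot \frac{\alpha \cdot \text{sched}}{1-\beta_1^{\,t}}, \qquad \hat v = \frac{v_t}{1-\beta_2^{\,t}}
    x \leftarrow x\,(1 - p_{\text{decay}}) - \frac{\hat m}{\sqrt{\hat v} + \epsilon}

where p_decay = decay \cdot \alpha \cdot sched is applied only to tensors with at least decay_min_ndim dimensions, an AdamW-style decoupled decay that can be used to exempt 1-D tensors such as biases.
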
@@ -18625,7 +18806,6 @@ struct ggml_lbfgs_iteration_data {
18625
18806
  };
18626
18807
 
18627
18808
  static enum ggml_opt_result linesearch_backtracking(
18628
- struct ggml_context * ctx,
18629
18809
  const struct ggml_opt_params * params,
18630
18810
  int nx,
18631
18811
  float * x,
@@ -18637,8 +18817,11 @@ static enum ggml_opt_result linesearch_backtracking(
18637
18817
  struct ggml_tensor * f,
18638
18818
  struct ggml_cgraph * gf,
18639
18819
  struct ggml_cgraph * gb,
18820
+ struct ggml_cplan * cplan,
18640
18821
  const int np,
18641
- struct ggml_tensor * ps[]) {
18822
+ struct ggml_tensor * ps[],
18823
+ ggml_opt_callback callback,
18824
+ void * callback_data) {
18642
18825
  int count = 0;
18643
18826
 
18644
18827
  float width = 0.0f;
@@ -18667,6 +18850,12 @@ static enum ggml_opt_result linesearch_backtracking(
18667
18850
  dgtest = params->lbfgs.ftol*dginit;
18668
18851
 
18669
18852
  while (true) {
18853
+ if (callback) {
18854
+ // L-BFGS does not support learning rate -> ignore learning schedule
18855
+ float sched = 0;
18856
+ callback(callback_data, &sched);
18857
+ }
18858
+
18670
18859
  ggml_vec_cpy_f32(nx, x, xp);
18671
18860
  ggml_vec_mad_f32(nx, x, d, *step);
18672
18861
 
@@ -18677,7 +18866,7 @@ static enum ggml_opt_result linesearch_backtracking(
18677
18866
  ggml_graph_reset (gf);
18678
18867
  ggml_set_f32 (f->grad, 1.0f);
18679
18868
 
18680
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
18869
+ ggml_graph_compute(gb, cplan);
18681
18870
 
18682
18871
  ggml_opt_get_grad(np, ps, g);
18683
18872
 
@@ -18737,7 +18926,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18737
18926
  struct ggml_opt_params params,
18738
18927
  struct ggml_tensor * f,
18739
18928
  struct ggml_cgraph * gf,
18740
- struct ggml_cgraph * gb) {
18929
+ struct ggml_cgraph * gb,
18930
+ ggml_opt_callback callback,
18931
+ void * callback_data) {
18741
18932
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18742
18933
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18743
18934
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -18769,6 +18960,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18769
18960
  opt->iter = iter;
18770
18961
  }
18771
18962
 
18963
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18964
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18965
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18966
+
18772
18967
  float * x = opt->lbfgs.x->data; // current parameters
18773
18968
  float * xp = opt->lbfgs.xp->data; // previous parameters
18774
18969
  float * g = opt->lbfgs.g->data; // current gradient
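Both optimizers now build a ggml_cplan for the backward graph once and reuse it for every ggml_graph_compute call, with the work buffer carved out of the ggml context as a GGML_OBJECT_WORK_BUFFER. A minimal sketch of the same plan/compute pattern, assuming only the APIs visible above and using malloc for the work buffer instead of a context object (the helper name is illustrative):

    #include "ggml.h"
    #include <stdint.h>
    #include <stdlib.h>

    // Plan once, attach a work buffer of the size the plan asks for,
    // then compute.
    static void compute_graph(struct ggml_cgraph * gb, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gb, n_threads);

        uint8_t * work = NULL;
        if (cplan.work_size > 0) {
            work = malloc(cplan.work_size);
            cplan.work_data = work;
        }

        ggml_graph_compute(gb, &cplan);

        free(work);
    }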
@@ -18790,6 +18985,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18790
18985
  float * lm_s = opt->lbfgs.lms->data;
18791
18986
  float * lm_y = opt->lbfgs.lmy->data;
18792
18987
 
18988
+ if (callback) {
18989
+ // L-BFGS does not support learning rate -> ignore learning schedule
18990
+ float sched = 0;
18991
+ callback(callback_data, &sched);
18992
+ }
18993
+
18793
18994
  // evaluate the function value and its gradient
18794
18995
  {
18795
18996
  ggml_opt_set_params(np, ps, x);
@@ -18797,11 +18998,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18797
18998
  ggml_graph_reset (gf);
18798
18999
  ggml_set_f32 (f->grad, 1.0f);
18799
19000
 
18800
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
19001
+ ggml_graph_compute(gb, &cplan);
18801
19002
 
18802
19003
  ggml_opt_get_grad(np, ps, g);
18803
19004
 
18804
19005
  fx = ggml_get_f32_1d(f, 0);
19006
+
19007
+ opt->loss_before = fx;
19008
+ opt->loss_after = fx;
18805
19009
  }
18806
19010
 
18807
19011
  // search direction = -gradient
@@ -18856,7 +19060,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18856
19060
  ggml_vec_cpy_f32(nx, xp, x);
18857
19061
  ggml_vec_cpy_f32(nx, gp, g);
18858
19062
 
18859
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
19063
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
18860
19064
 
18861
19065
  if (ls < 0) {
18862
19066
  // linesearch failed - go back to the previous point and return
@@ -18866,6 +19070,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18866
19070
  return ls;
18867
19071
  }
18868
19072
 
19073
+ opt->loss_after = fx;
19074
+
18869
19075
  ggml_vec_norm_f32(nx, &xnorm, x);
18870
19076
  ggml_vec_norm_f32(nx, &gnorm, g);
18871
19077
 
@@ -18923,7 +19129,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18923
19129
  // ys = y^t \cdot s -> 1 / \rho.
18924
19130
  // yy = y^t \cdot y.
18925
19131
  //
18926
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
19132
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
18927
19133
  ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
18928
19134
 
18929
19135
  lm_ys[end[0]] = ys;
@@ -18986,13 +19192,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18986
19192
  .adam = {
18987
19193
  .n_iter = 10000,
18988
19194
  .sched = 1.000f,
18989
- .decay = 0.001f,
19195
+ .decay = 0.0f,
19196
+ .decay_min_ndim = 2,
18990
19197
  .alpha = 0.001f,
18991
19198
  .beta1 = 0.9f,
18992
19199
  .beta2 = 0.999f,
18993
19200
  .eps = 1e-8f,
18994
19201
  .eps_f = 1e-5f,
18995
19202
  .eps_g = 1e-3f,
19203
+ .gclip = 0.0f,
18996
19204
  },
18997
19205
  };
18998
19206
  } break;
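With the new defaults, weight decay and gradient clipping are both switched off (decay = 0.0f, gclip = 0.0f), and decay, when enabled, only applies to tensors with at least decay_min_ndim dimensions. A hedged sketch of how a caller might re-enable them (the helper name and chosen values are illustrative):

    #include "ggml.h"

    // Start from the new defaults (decay = 0, gclip = 0, i.e. both
    // disabled) and switch the regularization back on.
    static struct ggml_opt_params my_adam_params(void) {
        struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);

        params.adam.decay          = 0.001f; // decoupled weight decay
        params.adam.decay_min_ndim = 2;      // leave 1-D tensors (biases, norms) undecayed
        params.adam.gclip          = 1.0f;   // clip gradients to unit global norm

        return params;
    }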
@@ -19042,23 +19250,13 @@ GGML_API void ggml_opt_init(
19042
19250
  switch (opt->params.type) {
19043
19251
  case GGML_OPT_ADAM:
19044
19252
  {
19045
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19046
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19047
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19048
19253
  opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19049
19254
  opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19050
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19051
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
19052
19255
  opt->adam.pf = params.past > 0
19053
19256
  ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
19054
19257
  : NULL;
19055
- ggml_set_zero(opt->adam.x);
19056
- ggml_set_zero(opt->adam.g1);
19057
- ggml_set_zero(opt->adam.g2);
19058
19258
  ggml_set_zero(opt->adam.m);
19059
19259
  ggml_set_zero(opt->adam.v);
19060
- ggml_set_zero(opt->adam.mh);
19061
- ggml_set_zero(opt->adam.vh);
19062
19260
  if (opt->adam.pf) {
19063
19261
  ggml_set_zero(opt->adam.pf);
19064
19262
  }
@@ -19142,7 +19340,7 @@ enum ggml_opt_result ggml_opt_resume(
19142
19340
  *gf = ggml_build_forward (f);
19143
19341
  *gb = ggml_build_backward(ctx, gf, true);
19144
19342
 
19145
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
19343
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
19146
19344
  }
19147
19345
 
19148
19346
  enum ggml_opt_result ggml_opt_resume_g(
@@ -19150,7 +19348,9 @@ enum ggml_opt_result ggml_opt_resume_g(
19150
19348
  struct ggml_opt_context * opt,
19151
19349
  struct ggml_tensor * f,
19152
19350
  struct ggml_cgraph * gf,
19153
- struct ggml_cgraph * gb) {
19351
+ struct ggml_cgraph * gb,
19352
+ ggml_opt_callback callback,
19353
+ void * callback_data) {
19154
19354
 
19155
19355
  // build forward + backward compute graphs
19156
19356
  enum ggml_opt_result result = GGML_OPT_OK;
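ggml_opt_resume_g now threads a callback and callback_data pointer down to both optimizers. Judging from the call sites above, the callback receives the user pointer and a mutable sched value; Adam scales its learning rate and weight decay by *sched each iteration, while L-BFGS passes sched = 0 and ignores the result. A sketch of a simple warmup schedule under that assumed signature (struct and function names are illustrative):

    #include "ggml.h"

    struct schedule_state {
        int step;    // optimizer iterations seen so far
        int warmup;  // linear warmup length in iterations
    };

    // Called once per iteration with the user pointer and a mutable sched
    // value: Adam scales its learning rate and weight decay by *sched,
    // L-BFGS calls it with sched = 0 and ignores whatever is written back.
    static void lr_schedule(void * data, float * sched) {
        struct schedule_state * st = data;
        st->step++;
        *sched = (st->warmup > 0 && st->step < st->warmup)
            ? (float) st->step / (float) st->warmup
            : 1.0f;
    }

It would go in as the trailing callback/callback_data pair of ggml_opt_resume_g; after the run, the new opt->loss_before and opt->loss_after fields record the loss around the optimization.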
@@ -19158,11 +19358,11 @@ enum ggml_opt_result ggml_opt_resume_g(
19158
19358
  switch (opt->params.type) {
19159
19359
  case GGML_OPT_ADAM:
19160
19360
  {
19161
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
19361
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19162
19362
  } break;
19163
19363
  case GGML_OPT_LBFGS:
19164
19364
  {
19165
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
19365
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19166
19366
  } break;
19167
19367
  }
19168
19368
 
@@ -19394,7 +19594,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19394
19594
  ////////////////////////////////////////////////////////////////////////////////
19395
19595
 
19396
19596
  struct gguf_str {
19397
- uint32_t n;
19597
+ uint64_t n; // GGUFv2
19398
19598
  char * data;
19399
19599
  };
19400
19600
 
@@ -19408,9 +19608,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
19408
19608
  [GGUF_TYPE_FLOAT32] = sizeof(float),
19409
19609
  [GGUF_TYPE_BOOL] = sizeof(bool),
19410
19610
  [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
19611
+ [GGUF_TYPE_UINT64] = sizeof(uint64_t),
19612
+ [GGUF_TYPE_INT64] = sizeof(int64_t),
19613
+ [GGUF_TYPE_FLOAT64] = sizeof(double),
19411
19614
  [GGUF_TYPE_ARRAY] = 0, // undefined
19412
19615
  };
19413
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19616
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19414
19617
 
19415
19618
  static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19416
19619
  [GGUF_TYPE_UINT8] = "u8",
@@ -19423,8 +19626,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19423
19626
  [GGUF_TYPE_BOOL] = "bool",
19424
19627
  [GGUF_TYPE_STRING] = "str",
19425
19628
  [GGUF_TYPE_ARRAY] = "arr",
19629
+ [GGUF_TYPE_UINT64] = "u64",
19630
+ [GGUF_TYPE_INT64] = "i64",
19631
+ [GGUF_TYPE_FLOAT64] = "f64",
19426
19632
  };
19427
- static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19633
+ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
19428
19634
 
19429
19635
  union gguf_value {
19430
19636
  uint8_t uint8;
@@ -19434,6 +19640,9 @@ union gguf_value {
19434
19640
  uint32_t uint32;
19435
19641
  int32_t int32;
19436
19642
  float float32;
19643
+ uint64_t uint64;
19644
+ int64_t int64;
19645
+ double float64;
19437
19646
  bool bool_;
19438
19647
 
19439
19648
  struct gguf_str str;
@@ -19441,7 +19650,7 @@ union gguf_value {
19441
19650
  struct {
19442
19651
  enum gguf_type type;
19443
19652
 
19444
- uint32_t n;
19653
+ uint64_t n; // GGUFv2
19445
19654
  void * data;
19446
19655
  } arr;
19447
19656
  };
@@ -19449,8 +19658,6 @@ union gguf_value {
19449
19658
  struct gguf_kv {
19450
19659
  struct gguf_str key;
19451
19660
 
19452
- uint32_t n_bytes; // TODO: is this actually needed?
19453
-
19454
19661
  enum gguf_type type;
19455
19662
  union gguf_value value;
19456
19663
  };
@@ -19458,15 +19665,15 @@ struct gguf_kv {
19458
19665
  struct gguf_header {
19459
19666
  uint32_t magic;
19460
19667
  uint32_t version;
19461
- uint32_t n_tensors;
19462
- uint32_t n_kv;
19668
+ uint64_t n_tensors; // GGUFv2
19669
+ uint64_t n_kv; // GGUFv2
19463
19670
  };
19464
19671
 
19465
19672
  struct gguf_tensor_info {
19466
19673
  struct gguf_str name;
19467
19674
 
19468
19675
  uint32_t n_dims;
19469
- uint32_t ne[GGML_MAX_DIMS];
19676
+ uint64_t ne[GGML_MAX_DIMS];
19470
19677
 
19471
19678
  enum ggml_type type;
19472
19679
 
@@ -19497,19 +19704,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
19497
19704
  return n == size;
19498
19705
  }
19499
19706
 
19500
- static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
19707
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19708
+ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
19501
19709
  p->n = 0;
19502
19710
  p->data = NULL;
19503
19711
 
19504
19712
  bool ok = true;
19505
19713
 
19506
- // TODO: how to avoid mallocs for strings?
19507
19714
  ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19508
19715
  ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19509
19716
 
19510
19717
  return ok;
19511
19718
  }
19512
19719
 
19720
+ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
19721
+ p->n = 0;
19722
+ p->data = NULL;
19723
+
19724
+ bool ok = true;
19725
+
19726
+ uint32_t n = 0;
19727
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
19728
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19729
+
19730
+ return ok;
19731
+ }
19732
+
19513
19733
  struct gguf_context * gguf_init_empty(void) {
19514
19734
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19515
19735
 
@@ -19565,8 +19785,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19565
19785
  ctx->data = NULL;
19566
19786
 
19567
19787
  ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
19568
- ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19569
- ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19788
+
19789
+ if (ctx->header.version == 1) {
19790
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19791
+ uint32_t n_tensors = 0;
19792
+ uint32_t n_kv = 0;
19793
+
19794
+ ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
19795
+ ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
19796
+
19797
+ ctx->header.n_tensors = n_tensors;
19798
+ ctx->header.n_kv = n_kv;
19799
+ } else {
19800
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19801
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19802
+ }
19570
19803
 
19571
19804
  if (!ok) {
19572
19805
  fprintf(stderr, "%s: failed to read header\n", __func__);
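Because v1 and v2 files differ only after the magic and version fields, a reader can decide which count width to expect by peeking at the first eight bytes. A self-contained sketch (the helper is illustrative, not part of the library; 0x46554747 is "GGUF" read as a little-endian uint32_t):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    // Return the GGUF version of a file (1 or 2), or -1 if it cannot be
    // read or the magic does not match.
    static int gguf_peek_version(const char * fname) {
        FILE * f = fopen(fname, "rb");
        if (!f) {
            return -1;
        }

        uint32_t magic   = 0;
        uint32_t version = 0;

        const bool ok =
            fread(&magic,   sizeof(magic),   1, f) == 1 &&
            fread(&version, sizeof(version), 1, f) == 1 &&
            magic == 0x46554747; // "GGUF" as a little-endian uint32_t

        fclose(f);
        return ok ? (int) version : -1;
    }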
@@ -19576,18 +19809,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19576
19809
  }
19577
19810
  }
19578
19811
 
19812
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19813
+ bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
19814
+ if (ctx->header.version == 1) {
19815
+ gguf_fread_str = gguf_fread_str_v1;
19816
+ }
19817
+
19579
19818
  // read the kv pairs
19580
19819
  {
19581
- ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19820
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19582
19821
 
19583
19822
  for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19584
19823
  struct gguf_kv * kv = &ctx->kv[i];
19585
19824
 
19586
19825
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
19587
19826
 
19588
- ok = ok && gguf_fread_str(file, &kv->key, &offset);
19589
- //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
19590
- ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19827
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
19828
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19591
19829
 
19592
19830
  //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
19593
19831
 
@@ -19599,12 +19837,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19599
19837
  case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
19600
19838
  case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
19601
19839
  case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
19840
+ case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
19841
+ case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
19842
+ case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
19602
19843
  case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
19603
19844
  case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
19604
19845
  case GGUF_TYPE_ARRAY:
19605
19846
  {
19606
19847
  ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19607
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19848
+
19849
+ if (ctx->header.version == 1) {
19850
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19851
+ uint32_t n = 0;
19852
+ ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
19853
+ kv->value.arr.n = n;
19854
+ } else {
19855
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19856
+ }
19608
19857
 
19609
19858
  switch (kv->value.arr.type) {
19610
19859
  case GGUF_TYPE_UINT8:
@@ -19614,6 +19863,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19614
19863
  case GGUF_TYPE_UINT32:
19615
19864
  case GGUF_TYPE_INT32:
19616
19865
  case GGUF_TYPE_FLOAT32:
19866
+ case GGUF_TYPE_UINT64:
19867
+ case GGUF_TYPE_INT64:
19868
+ case GGUF_TYPE_FLOAT64:
19617
19869
  case GGUF_TYPE_BOOL:
19618
19870
  {
19619
19871
  kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -19648,7 +19900,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19648
19900
 
19649
19901
  // read the tensor infos
19650
19902
  {
19651
- ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19903
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19652
19904
 
19653
19905
  for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19654
19906
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19660,7 +19912,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19660
19912
  ok = ok && gguf_fread_str(file, &info->name, &offset);
19661
19913
  ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19662
19914
  for (uint32_t j = 0; j < info->n_dims; ++j) {
19663
- ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19915
+ if (ctx->header.version == 1) {
19916
+ // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
19917
+ uint32_t t = 0;
19918
+ ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
19919
+ info->ne[j] = t;
19920
+ } else {
19921
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19922
+ }
19664
19923
  }
19665
19924
  ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19666
19925
  ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
@@ -19842,7 +20101,7 @@ void gguf_free(struct gguf_context * ctx) {
19842
20101
  }
19843
20102
  }
19844
20103
 
19845
- GGML_ALIGNED_FREE(ctx->kv);
20104
+ free(ctx->kv);
19846
20105
  }
19847
20106
 
19848
20107
  if (ctx->infos) {
@@ -19854,7 +20113,7 @@ void gguf_free(struct gguf_context * ctx) {
19854
20113
  }
19855
20114
  }
19856
20115
 
19857
- GGML_ALIGNED_FREE(ctx->infos);
20116
+ free(ctx->infos);
19858
20117
  }
19859
20118
 
19860
20119
  GGML_ALIGNED_FREE(ctx);
@@ -19954,6 +20213,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
19954
20213
  return ctx->kv[i].value.float32;
19955
20214
  }
19956
20215
 
20216
+ uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
20217
+ return ctx->kv[i].value.uint64;
20218
+ }
20219
+
20220
+ int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
20221
+ return ctx->kv[i].value.int64;
20222
+ }
20223
+
20224
+ double gguf_get_val_f64(struct gguf_context * ctx, int i) {
20225
+ return ctx->kv[i].value.float64;
20226
+ }
20227
+
19957
20228
  bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
19958
20229
  return ctx->kv[i].value.bool_;
19959
20230
  }
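The new gguf_get_val_u64/i64/f64 accessors read the 64-bit value types added above. A hedged usage sketch: gguf_find_key and gguf_get_kv_type are assumed to be part of the same gguf interface, and the helper name, key, and file path are illustrative:

    #include "ggml.h"
    #include <stdint.h>

    // Open a GGUF file without allocating tensor data and pull out one
    // uint64 key, or 0 if the key is absent or has a different type.
    static uint64_t read_u64_key(const char * fname, const char * key) {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (!ctx) {
            return 0;
        }

        uint64_t val = 0;
        const int idx = gguf_find_key(ctx, key);
        if (idx >= 0 && gguf_get_kv_type(ctx, idx) == GGUF_TYPE_UINT64) {
            val = gguf_get_val_u64(ctx, idx);
        }

        gguf_free(ctx);
        return val;
    }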
@@ -20000,7 +20271,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
20000
20271
  const int n_kv = gguf_get_n_kv(ctx);
20001
20272
 
20002
20273
  ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
20003
- ctx->kv[n_kv].key.n = strlen(key) + 1;
20274
+ ctx->kv[n_kv].key.n = strlen(key);
20004
20275
  ctx->kv[n_kv].key.data = strdup(key);
20005
20276
  ctx->header.n_kv++;
20006
20277
 
@@ -20056,6 +20327,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
20056
20327
  ctx->kv[idx].value.float32 = val;
20057
20328
  }
20058
20329
 
20330
+ void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
20331
+ const int idx = gguf_get_or_add_key(ctx, key);
20332
+
20333
+ ctx->kv[idx].type = GGUF_TYPE_UINT64;
20334
+ ctx->kv[idx].value.uint64 = val;
20335
+ }
20336
+
20337
+ void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
20338
+ const int idx = gguf_get_or_add_key(ctx, key);
20339
+
20340
+ ctx->kv[idx].type = GGUF_TYPE_INT64;
20341
+ ctx->kv[idx].value.int64 = val;
20342
+ }
20343
+
20344
+ void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
20345
+ const int idx = gguf_get_or_add_key(ctx, key);
20346
+
20347
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
20348
+ ctx->kv[idx].value.float64 = val;
20349
+ }
20350
+
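The gguf_set_val_u64/i64/f64 setters mirror those getters. A hedged sketch that writes a metadata-only file with the new 64-bit types, assuming gguf_init_empty and a gguf_write_to_file(ctx, fname, only_meta) writer from the same interface; the helper name, keys, and values are illustrative:

    #include "ggml.h"
    #include <stdint.h>

    // Build an in-memory GGUF context, set one value of each new 64-bit
    // type, and write the metadata out.
    static void write_meta(const char * fname) {
        struct gguf_context * ctx = gguf_init_empty();

        gguf_set_val_u64(ctx, "example.token_count", 1234567890123ULL);
        gguf_set_val_i64(ctx, "example.offset",      -42);
        gguf_set_val_f64(ctx, "example.scale",       0.5);

        gguf_write_to_file(ctx, fname, /*only_meta =*/ true);

        gguf_free(ctx);
    }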
20059
20351
  void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
20060
20352
  const int idx = gguf_get_or_add_key(ctx, key);
20061
20353
 
@@ -20067,7 +20359,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
20067
20359
  const int idx = gguf_get_or_add_key(ctx, key);
20068
20360
 
20069
20361
  ctx->kv[idx].type = GGUF_TYPE_STRING;
20070
- ctx->kv[idx].value.str.n = strlen(val) + 1;
20362
+ ctx->kv[idx].value.str.n = strlen(val);
20071
20363
  ctx->kv[idx].value.str.data = strdup(val);
20072
20364
  }
20073
20365
 
@@ -20090,7 +20382,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
20090
20382
  ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20091
20383
  for (int i = 0; i < n; i++) {
20092
20384
  struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
20093
- str->n = strlen(data[i]) + 1;
20385
+ str->n = strlen(data[i]);
20094
20386
  str->data = strdup(data[i]);
20095
20387
  }
20096
20388
  }
@@ -20106,6 +20398,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
20106
20398
  case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
20107
20399
  case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
20108
20400
  case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
20401
+ case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
20402
+ case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
20403
+ case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
20109
20404
  case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
20110
20405
  case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
20111
20406
  case GGUF_TYPE_ARRAY:
@@ -20134,7 +20429,7 @@ void gguf_add_tensor(
20134
20429
  const int idx = ctx->header.n_tensors;
20135
20430
  ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
20136
20431
 
20137
- ctx->infos[idx].name.n = strlen(tensor->name) + 1;
20432
+ ctx->infos[idx].name.n = strlen(tensor->name);
20138
20433
  ctx->infos[idx].name.data = strdup(tensor->name);
20139
20434
 
20140
20435
  for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -20267,6 +20562,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
20267
20562
  case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
20268
20563
  case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
20269
20564
  case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
20565
+ case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
20566
+ case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
20567
+ case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
20270
20568
  case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
20271
20569
  case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
20272
20570
  case GGUF_TYPE_ARRAY:
@@ -20282,6 +20580,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
20282
20580
  case GGUF_TYPE_UINT32:
20283
20581
  case GGUF_TYPE_INT32:
20284
20582
  case GGUF_TYPE_FLOAT32:
20583
+ case GGUF_TYPE_UINT64:
20584
+ case GGUF_TYPE_INT64:
20585
+ case GGUF_TYPE_FLOAT64:
20285
20586
  case GGUF_TYPE_BOOL:
20286
20587
  {
20287
20588
  gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -20516,6 +20817,14 @@ int ggml_cpu_has_sse3(void) {
20516
20817
  #endif
20517
20818
  }
20518
20819
 
20820
+ int ggml_cpu_has_ssse3(void) {
20821
+ #if defined(__SSSE3__)
20822
+ return 1;
20823
+ #else
20824
+ return 0;
20825
+ #endif
20826
+ }
20827
+
20519
20828
  int ggml_cpu_has_vsx(void) {
20520
20829
  #if defined(__POWER9_VECTOR__)
20521
20830
  return 1;