llama_cpp 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
|
|
103
103
|
#include <sys/stat.h>
|
104
104
|
#include <unistd.h>
|
105
105
|
|
106
|
+
#endif
|
107
|
+
#ifdef GGML_USE_CPU_HBM
|
108
|
+
#include <hbwmalloc.h>
|
106
109
|
#endif
|
107
110
|
|
108
111
|
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
@@ -123,6 +126,8 @@ typedef void * thread_ret_t;
|
|
123
126
|
#define GGML_GELU_FP16
|
124
127
|
#define GGML_GELU_QUICK_FP16
|
125
128
|
#define GGML_SILU_FP16
|
129
|
+
// #define GGML_CROSS_ENTROPY_EXP_FP16
|
130
|
+
// #define GGML_FLASH_ATTN_EXP_FP16
|
126
131
|
|
127
132
|
#define GGML_SOFT_MAX_UNROLL 4
|
128
133
|
#define GGML_VEC_DOT_UNROLL 2
|
@@ -157,12 +162,6 @@ typedef void * thread_ret_t;
|
|
157
162
|
//#define GGML_SOFT_MAX_ACCELERATE
|
158
163
|
#endif
|
159
164
|
|
160
|
-
#if UINTPTR_MAX == 0xFFFFFFFF
|
161
|
-
#define GGML_MEM_ALIGN 4
|
162
|
-
#else
|
163
|
-
#define GGML_MEM_ALIGN 16
|
164
|
-
#endif
|
165
|
-
|
166
165
|
//
|
167
166
|
// logging
|
168
167
|
//
|
@@ -192,13 +191,19 @@ typedef void * thread_ret_t;
|
|
192
191
|
//
|
193
192
|
|
194
193
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
195
|
-
#define GGML_ALIGNED_MALLOC(size)
|
196
|
-
#define GGML_ALIGNED_FREE(ptr)
|
194
|
+
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
195
|
+
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
197
196
|
#else
|
198
197
|
inline static void * ggml_aligned_malloc(size_t size) {
|
198
|
+
if (size == 0) {
|
199
|
+
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
200
|
+
return NULL;
|
201
|
+
}
|
199
202
|
void * aligned_memory = NULL;
|
200
|
-
#ifdef
|
201
|
-
int result =
|
203
|
+
#ifdef GGML_USE_CPU_HBM
|
204
|
+
int result = hbw_posix_memalign(&aligned_memory, 16, size);
|
205
|
+
#elif GGML_USE_METAL
|
206
|
+
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
|
202
207
|
#else
|
203
208
|
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
204
209
|
#endif
|
@@ -218,8 +223,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
218
223
|
}
|
219
224
|
return aligned_memory;
|
220
225
|
}
|
221
|
-
#define GGML_ALIGNED_MALLOC(size)
|
222
|
-
#
|
226
|
+
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
227
|
+
#ifdef GGML_USE_CPU_HBM
|
228
|
+
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
|
229
|
+
#else
|
230
|
+
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
231
|
+
#endif
|
223
232
|
#endif
|
224
233
|
|
225
234
|
#define UNUSED GGML_UNUSED
|
@@ -305,6 +314,10 @@ typedef double ggml_float;
|
|
305
314
|
#endif
|
306
315
|
#endif
|
307
316
|
|
317
|
+
#ifdef __riscv_v_intrinsic
|
318
|
+
#include <riscv_vector.h>
|
319
|
+
#endif
|
320
|
+
|
308
321
|
#ifdef __F16C__
|
309
322
|
|
310
323
|
#ifdef _MSC_VER
|
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|
817
830
|
|
818
831
|
#if !defined(__aarch64__)
|
819
832
|
|
820
|
-
inline static uint16_t vaddvq_u8(uint8x16_t v) {
|
821
|
-
return
|
822
|
-
(uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
|
823
|
-
(uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
|
824
|
-
(uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
|
825
|
-
(uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
|
826
|
-
(uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
|
827
|
-
(uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
|
828
|
-
(uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
|
829
|
-
(uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
|
830
|
-
}
|
831
|
-
|
832
|
-
inline static int16_t vaddvq_s8(int8x16_t v) {
|
833
|
-
return
|
834
|
-
(int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
|
835
|
-
(int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
|
836
|
-
(int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
|
837
|
-
(int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
|
838
|
-
(int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
|
839
|
-
(int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
|
840
|
-
(int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
|
841
|
-
(int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
|
842
|
-
}
|
843
|
-
|
844
|
-
inline static int32_t vaddvq_s16(int16x8_t v) {
|
845
|
-
return
|
846
|
-
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
847
|
-
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
848
|
-
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
849
|
-
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
850
|
-
}
|
851
|
-
|
852
|
-
inline static uint32_t vaddvq_u16(uint16x8_t v) {
|
853
|
-
return
|
854
|
-
(uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
|
855
|
-
(uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
|
856
|
-
(uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
|
857
|
-
(uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
|
858
|
-
}
|
859
|
-
|
860
833
|
inline static int32_t vaddvq_s32(int32x4_t v) {
|
861
834
|
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
862
835
|
}
|
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|
865
838
|
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
866
839
|
}
|
867
840
|
|
868
|
-
inline static float vminvq_f32(float32x4_t v) {
|
869
|
-
return
|
870
|
-
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
871
|
-
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
872
|
-
}
|
873
|
-
|
874
841
|
inline static float vmaxvq_f32(float32x4_t v) {
|
875
842
|
return
|
876
843
|
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
@@ -2436,7 +2403,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2436
2403
|
const int nb = n / qk;
|
2437
2404
|
|
2438
2405
|
assert(n % qk == 0);
|
2439
|
-
assert(nb % 2 == 0);
|
2440
2406
|
|
2441
2407
|
const block_q4_0 * restrict x = vx;
|
2442
2408
|
const block_q8_0 * restrict y = vy;
|
@@ -2445,6 +2411,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2445
2411
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
2446
2412
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
2447
2413
|
|
2414
|
+
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
2448
2415
|
for (int i = 0; i < nb; i += 2) {
|
2449
2416
|
const block_q4_0 * restrict x0 = &x[i + 0];
|
2450
2417
|
const block_q4_0 * restrict x1 = &x[i + 1];
|
@@ -2623,6 +2590,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2623
2590
|
}
|
2624
2591
|
|
2625
2592
|
// Main loop
|
2593
|
+
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
2626
2594
|
for (int i = 2; i < nb; i+=2) {
|
2627
2595
|
_mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
|
2628
2596
|
_mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
|
@@ -2680,6 +2648,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2680
2648
|
}
|
2681
2649
|
|
2682
2650
|
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
|
2651
|
+
#elif defined(__riscv_v_intrinsic)
|
2652
|
+
float sumf = 0.0;
|
2653
|
+
|
2654
|
+
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
2655
|
+
|
2656
|
+
for (int i = 0; i < nb; i++) {
|
2657
|
+
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
|
2658
|
+
|
2659
|
+
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
2660
|
+
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
|
2661
|
+
|
2662
|
+
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
|
2663
|
+
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
|
2664
|
+
|
2665
|
+
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
|
2666
|
+
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
|
2667
|
+
|
2668
|
+
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
|
2669
|
+
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
|
2670
|
+
|
2671
|
+
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
|
2672
|
+
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
|
2673
|
+
|
2674
|
+
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
2675
|
+
|
2676
|
+
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
|
2677
|
+
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
|
2678
|
+
|
2679
|
+
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
|
2680
|
+
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
|
2681
|
+
|
2682
|
+
sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
|
2683
|
+
}
|
2684
|
+
|
2685
|
+
*s = sumf;
|
2683
2686
|
#else
|
2684
2687
|
// scalar
|
2685
2688
|
float sumf = 0.0;
|
@@ -2706,7 +2709,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2706
2709
|
const int nb = n / qk;
|
2707
2710
|
|
2708
2711
|
assert(n % qk == 0);
|
2709
|
-
assert(nb % 2 == 0);
|
2710
2712
|
|
2711
2713
|
const block_q4_1 * restrict x = vx;
|
2712
2714
|
const block_q8_1 * restrict y = vy;
|
@@ -2718,6 +2720,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2718
2720
|
|
2719
2721
|
float summs = 0;
|
2720
2722
|
|
2723
|
+
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
2721
2724
|
for (int i = 0; i < nb; i += 2) {
|
2722
2725
|
const block_q4_1 * restrict x0 = &x[i + 0];
|
2723
2726
|
const block_q4_1 * restrict x1 = &x[i + 1];
|
@@ -2806,6 +2809,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2806
2809
|
}
|
2807
2810
|
|
2808
2811
|
*s = hsum_float_8(acc) + summs;
|
2812
|
+
#elif defined(__riscv_v_intrinsic)
|
2813
|
+
float sumf = 0.0;
|
2814
|
+
|
2815
|
+
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
2816
|
+
|
2817
|
+
for (int i = 0; i < nb; i++) {
|
2818
|
+
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
|
2819
|
+
|
2820
|
+
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
2821
|
+
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
|
2822
|
+
|
2823
|
+
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
|
2824
|
+
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
|
2825
|
+
|
2826
|
+
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
|
2827
|
+
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
|
2828
|
+
|
2829
|
+
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
|
2830
|
+
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
|
2831
|
+
|
2832
|
+
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
2833
|
+
|
2834
|
+
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
|
2835
|
+
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
|
2836
|
+
|
2837
|
+
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
|
2838
|
+
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
|
2839
|
+
|
2840
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
2841
|
+
}
|
2842
|
+
|
2843
|
+
*s = sumf;
|
2809
2844
|
#else
|
2810
2845
|
// scalar
|
2811
2846
|
float sumf = 0.0;
|
@@ -2832,7 +2867,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2832
2867
|
const int nb = n / qk;
|
2833
2868
|
|
2834
2869
|
assert(n % qk == 0);
|
2835
|
-
assert(nb % 2 == 0);
|
2836
2870
|
assert(qk == QK5_0);
|
2837
2871
|
|
2838
2872
|
const block_q5_0 * restrict x = vx;
|
@@ -2848,6 +2882,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2848
2882
|
uint64_t tmp0[4];
|
2849
2883
|
uint64_t tmp1[4];
|
2850
2884
|
|
2885
|
+
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
2851
2886
|
for (int i = 0; i < nb; i += 2) {
|
2852
2887
|
const block_q5_0 * restrict x0 = &x[i];
|
2853
2888
|
const block_q5_0 * restrict x1 = &x[i + 1];
|
@@ -3040,6 +3075,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
3040
3075
|
}
|
3041
3076
|
|
3042
3077
|
*s = hsum_float_8(acc);
|
3078
|
+
#elif defined(__riscv_v_intrinsic)
|
3079
|
+
float sumf = 0.0;
|
3080
|
+
|
3081
|
+
uint32_t qh;
|
3082
|
+
|
3083
|
+
// These temp values are for masking and shift operations
|
3084
|
+
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
3085
|
+
uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
3086
|
+
0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
|
3087
|
+
|
3088
|
+
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
3089
|
+
|
3090
|
+
for (int i = 0; i < nb; i++) {
|
3091
|
+
memcpy(&qh, x[i].qh, sizeof(uint32_t));
|
3092
|
+
|
3093
|
+
// temporary registers
|
3094
|
+
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
|
3095
|
+
vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
|
3096
|
+
vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
|
3097
|
+
vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
|
3098
|
+
|
3099
|
+
// ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
3100
|
+
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
|
3101
|
+
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
|
3102
|
+
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
|
3103
|
+
|
3104
|
+
// ((qh & (1u << (j + 16))) >> (j + 12));
|
3105
|
+
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
|
3106
|
+
vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
|
3107
|
+
|
3108
|
+
// narrowing
|
3109
|
+
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
|
3110
|
+
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
|
3111
|
+
|
3112
|
+
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
|
3113
|
+
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
|
3114
|
+
|
3115
|
+
// load
|
3116
|
+
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
|
3117
|
+
|
3118
|
+
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
3119
|
+
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
|
3120
|
+
|
3121
|
+
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
|
3122
|
+
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
|
3123
|
+
|
3124
|
+
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
|
3125
|
+
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
|
3126
|
+
|
3127
|
+
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
|
3128
|
+
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
|
3129
|
+
|
3130
|
+
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
|
3131
|
+
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
|
3132
|
+
|
3133
|
+
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
|
3134
|
+
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
|
3135
|
+
|
3136
|
+
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
3137
|
+
|
3138
|
+
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
|
3139
|
+
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
|
3140
|
+
|
3141
|
+
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
|
3142
|
+
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
|
3143
|
+
|
3144
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
|
3145
|
+
}
|
3146
|
+
|
3147
|
+
*s = sumf;
|
3043
3148
|
#else
|
3044
3149
|
// scalar
|
3045
3150
|
float sumf = 0.0;
|
@@ -3072,7 +3177,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
3072
3177
|
const int nb = n / qk;
|
3073
3178
|
|
3074
3179
|
assert(n % qk == 0);
|
3075
|
-
assert(nb % 2 == 0);
|
3076
3180
|
assert(qk == QK5_1);
|
3077
3181
|
|
3078
3182
|
const block_q5_1 * restrict x = vx;
|
@@ -3091,6 +3195,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
3091
3195
|
uint64_t tmp0[4];
|
3092
3196
|
uint64_t tmp1[4];
|
3093
3197
|
|
3198
|
+
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
3094
3199
|
for (int i = 0; i < nb; i += 2) {
|
3095
3200
|
const block_q5_1 * restrict x0 = &x[i];
|
3096
3201
|
const block_q5_1 * restrict x1 = &x[i + 1];
|
@@ -3296,6 +3401,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
3296
3401
|
}
|
3297
3402
|
|
3298
3403
|
*s = hsum_float_8(acc) + summs;
|
3404
|
+
#elif defined(__riscv_v_intrinsic)
|
3405
|
+
float sumf = 0.0;
|
3406
|
+
|
3407
|
+
uint32_t qh;
|
3408
|
+
|
3409
|
+
// These temp values are for shift operations
|
3410
|
+
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
3411
|
+
|
3412
|
+
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
3413
|
+
|
3414
|
+
for (int i = 0; i < nb; i++) {
|
3415
|
+
memcpy(&qh, x[i].qh, sizeof(uint32_t));
|
3416
|
+
|
3417
|
+
// temporary registers
|
3418
|
+
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
|
3419
|
+
vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
|
3420
|
+
|
3421
|
+
// load qh
|
3422
|
+
vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
|
3423
|
+
|
3424
|
+
// ((qh >> (j + 0)) << 4) & 0x10;
|
3425
|
+
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
|
3426
|
+
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
|
3427
|
+
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
|
3428
|
+
|
3429
|
+
// ((qh >> (j + 12)) ) & 0x10;
|
3430
|
+
vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
|
3431
|
+
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
|
3432
|
+
|
3433
|
+
// narrowing
|
3434
|
+
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
|
3435
|
+
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
|
3436
|
+
|
3437
|
+
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
|
3438
|
+
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
|
3439
|
+
|
3440
|
+
// load
|
3441
|
+
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
|
3442
|
+
|
3443
|
+
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
3444
|
+
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
|
3445
|
+
|
3446
|
+
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
|
3447
|
+
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
|
3448
|
+
|
3449
|
+
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
|
3450
|
+
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
|
3451
|
+
|
3452
|
+
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
|
3453
|
+
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
|
3454
|
+
|
3455
|
+
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
|
3456
|
+
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
|
3457
|
+
|
3458
|
+
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
3459
|
+
|
3460
|
+
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
|
3461
|
+
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
|
3462
|
+
|
3463
|
+
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
|
3464
|
+
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
|
3465
|
+
|
3466
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
3467
|
+
}
|
3468
|
+
|
3469
|
+
*s = sumf;
|
3299
3470
|
#else
|
3300
3471
|
// scalar
|
3301
3472
|
float sumf = 0.0;
|
@@ -3328,7 +3499,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
3328
3499
|
const int nb = n / qk;
|
3329
3500
|
|
3330
3501
|
assert(n % qk == 0);
|
3331
|
-
assert(nb % 2 == 0);
|
3332
3502
|
|
3333
3503
|
const block_q8_0 * restrict x = vx;
|
3334
3504
|
const block_q8_0 * restrict y = vy;
|
@@ -3337,6 +3507,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
3337
3507
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
3338
3508
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
3339
3509
|
|
3510
|
+
GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
|
3340
3511
|
for (int i = 0; i < nb; i += 2) {
|
3341
3512
|
const block_q8_0 * restrict x0 = &x[i + 0];
|
3342
3513
|
const block_q8_0 * restrict x1 = &x[i + 1];
|
@@ -3407,6 +3578,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
3407
3578
|
}
|
3408
3579
|
|
3409
3580
|
*s = hsum_float_8(acc);
|
3581
|
+
#elif defined(__riscv_v_intrinsic)
|
3582
|
+
float sumf = 0.0;
|
3583
|
+
size_t vl = __riscv_vsetvl_e8m1(qk);
|
3584
|
+
|
3585
|
+
for (int i = 0; i < nb; i++) {
|
3586
|
+
// load elements
|
3587
|
+
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
|
3588
|
+
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
3589
|
+
|
3590
|
+
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
|
3591
|
+
|
3592
|
+
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
3593
|
+
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
|
3594
|
+
|
3595
|
+
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
|
3596
|
+
|
3597
|
+
sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
|
3598
|
+
}
|
3599
|
+
|
3600
|
+
*s = sumf;
|
3410
3601
|
#else
|
3411
3602
|
// scalar
|
3412
3603
|
float sumf = 0.0;
|
@@ -4107,16 +4298,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
|
4107
4298
|
}
|
4108
4299
|
|
4109
4300
|
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
4110
|
-
|
4111
|
-
|
4112
|
-
|
4113
|
-
|
4114
|
-
|
4115
|
-
// return tensor->ne[3]*tensor->nb[3]
|
4116
|
-
//
|
4117
|
-
// is enough, but just in case, adding the second part
|
4118
|
-
|
4119
|
-
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
|
4301
|
+
size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
|
4302
|
+
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
4303
|
+
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
|
4304
|
+
}
|
4305
|
+
return nbytes;
|
4120
4306
|
}
|
4121
4307
|
|
4122
4308
|
size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
@@ -4393,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
4393
4579
|
return NULL;
|
4394
4580
|
}
|
4395
4581
|
|
4582
|
+
// allow to call ggml_init with 0 size
|
4583
|
+
if (params.mem_size == 0) {
|
4584
|
+
params.mem_size = GGML_MEM_ALIGN;
|
4585
|
+
}
|
4586
|
+
|
4396
4587
|
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
|
4397
4588
|
|
4398
4589
|
*ctx = (struct ggml_context) {
|
@@ -4570,36 +4761,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
4570
4761
|
enum ggml_type type,
|
4571
4762
|
int n_dims,
|
4572
4763
|
const int64_t * ne,
|
4573
|
-
|
4764
|
+
struct ggml_tensor * view_src,
|
4765
|
+
size_t view_offs) {
|
4574
4766
|
|
4575
4767
|
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
4576
4768
|
|
4577
|
-
|
4769
|
+
// find the base tensor and absolute offset
|
4770
|
+
if (view_src != NULL && view_src->view_src != NULL) {
|
4771
|
+
view_offs += view_src->view_offs;
|
4772
|
+
view_src = view_src->view_src;
|
4773
|
+
}
|
4578
4774
|
|
4579
|
-
|
4580
|
-
|
4581
|
-
|
4582
|
-
data_size *= ne[i];
|
4583
|
-
}
|
4775
|
+
size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
|
4776
|
+
for (int i = 1; i < n_dims; i++) {
|
4777
|
+
data_size *= ne[i];
|
4584
4778
|
}
|
4585
4779
|
|
4586
|
-
|
4587
|
-
|
4588
|
-
|
4589
|
-
|
4590
|
-
|
4591
|
-
|
4592
|
-
return NULL;
|
4593
|
-
}
|
4780
|
+
GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
|
4781
|
+
|
4782
|
+
void * data = view_src != NULL ? view_src->data : NULL;
|
4783
|
+
if (data != NULL) {
|
4784
|
+
data = (char *) data + view_offs;
|
4785
|
+
}
|
4594
4786
|
|
4595
|
-
|
4787
|
+
size_t obj_alloc_size = 0;
|
4788
|
+
|
4789
|
+
if (view_src == NULL && !ctx->no_alloc) {
|
4790
|
+
if (ctx->scratch.data != NULL) {
|
4791
|
+
// allocate tensor data in the scratch buffer
|
4792
|
+
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
|
4793
|
+
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
|
4794
|
+
__func__, ctx->scratch.offs + data_size, ctx->scratch.size);
|
4795
|
+
assert(false);
|
4796
|
+
return NULL;
|
4797
|
+
}
|
4596
4798
|
|
4597
|
-
|
4799
|
+
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
|
4598
4800
|
|
4599
|
-
|
4801
|
+
ctx->scratch.offs += data_size;
|
4802
|
+
} else {
|
4803
|
+
// allocate tensor data in the context's memory pool
|
4804
|
+
obj_alloc_size = data_size;
|
4805
|
+
}
|
4600
4806
|
}
|
4601
4807
|
|
4602
|
-
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE +
|
4808
|
+
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
|
4603
4809
|
|
4604
4810
|
// TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
|
4605
4811
|
|
@@ -4619,7 +4825,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
4619
4825
|
/*.perf_runs =*/ 0,
|
4620
4826
|
/*.perf_cycles =*/ 0,
|
4621
4827
|
/*.perf_time_us =*/ 0,
|
4622
|
-
/*.
|
4828
|
+
/*.view_src =*/ view_src,
|
4829
|
+
/*.view_offs =*/ view_offs,
|
4830
|
+
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
|
4623
4831
|
/*.name =*/ { 0 },
|
4624
4832
|
/*.extra =*/ NULL,
|
4625
4833
|
/*.padding =*/ { 0 },
|
@@ -4643,28 +4851,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
4643
4851
|
return result;
|
4644
4852
|
}
|
4645
4853
|
|
4646
|
-
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
4647
|
-
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
4648
|
-
assert(params_size <= GGML_MAX_OP_PARAMS);
|
4649
|
-
memcpy(tensor->op_params, params, params_size);
|
4650
|
-
}
|
4651
|
-
|
4652
|
-
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
|
4653
|
-
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
4654
|
-
return ((const int32_t *)(tensor->op_params))[i];
|
4655
|
-
}
|
4656
|
-
|
4657
|
-
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
4658
|
-
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
4659
|
-
((int32_t *)(tensor->op_params))[i] = value;
|
4660
|
-
}
|
4661
|
-
|
4662
4854
|
struct ggml_tensor * ggml_new_tensor(
|
4663
4855
|
struct ggml_context * ctx,
|
4664
4856
|
enum ggml_type type,
|
4665
4857
|
int n_dims,
|
4666
4858
|
const int64_t * ne) {
|
4667
|
-
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
|
4859
|
+
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
|
4668
4860
|
}
|
4669
4861
|
|
4670
4862
|
struct ggml_tensor * ggml_new_tensor_1d(
|
@@ -4729,7 +4921,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
|
|
4729
4921
|
}
|
4730
4922
|
|
4731
4923
|
struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
|
4732
|
-
return
|
4924
|
+
return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
|
4925
|
+
}
|
4926
|
+
|
4927
|
+
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
4928
|
+
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
4929
|
+
assert(params_size <= GGML_MAX_OP_PARAMS);
|
4930
|
+
memcpy(tensor->op_params, params, params_size);
|
4931
|
+
}
|
4932
|
+
|
4933
|
+
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
|
4934
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
4935
|
+
return ((const int32_t *)(tensor->op_params))[i];
|
4936
|
+
}
|
4937
|
+
|
4938
|
+
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
4939
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
4940
|
+
((int32_t *)(tensor->op_params))[i] = value;
|
4733
4941
|
}
|
4734
4942
|
|
4735
4943
|
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
|
@@ -5015,14 +5223,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
|
|
5015
5223
|
|
5016
5224
|
struct ggml_tensor * ggml_view_tensor(
|
5017
5225
|
struct ggml_context * ctx,
|
5018
|
-
|
5019
|
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src
|
5226
|
+
struct ggml_tensor * src) {
|
5227
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
|
5020
5228
|
ggml_format_name(result, "%s (view)", src->name);
|
5021
5229
|
|
5022
|
-
|
5023
|
-
|
5024
|
-
|
5025
|
-
result->nb[3] = src->nb[3];
|
5230
|
+
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
5231
|
+
result->nb[i] = src->nb[i];
|
5232
|
+
}
|
5026
5233
|
|
5027
5234
|
return result;
|
5028
5235
|
}
|
@@ -5280,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
|
|
5280
5487
|
}
|
5281
5488
|
|
5282
5489
|
if (inplace) {
|
5283
|
-
GGML_ASSERT(is_node
|
5490
|
+
GGML_ASSERT(!is_node);
|
5284
5491
|
}
|
5285
5492
|
|
5286
5493
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
@@ -5323,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
|
|
5323
5530
|
}
|
5324
5531
|
|
5325
5532
|
if (inplace) {
|
5326
|
-
GGML_ASSERT(is_node
|
5533
|
+
GGML_ASSERT(!is_node);
|
5327
5534
|
}
|
5328
5535
|
|
5329
5536
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
@@ -5595,7 +5802,7 @@ struct ggml_tensor * ggml_repeat_back(
|
|
5595
5802
|
|
5596
5803
|
// ggml_concat
|
5597
5804
|
|
5598
|
-
struct ggml_tensor* ggml_concat(
|
5805
|
+
struct ggml_tensor * ggml_concat(
|
5599
5806
|
struct ggml_context* ctx,
|
5600
5807
|
struct ggml_tensor* a,
|
5601
5808
|
struct ggml_tensor* b) {
|
@@ -5862,7 +6069,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
|
|
5862
6069
|
struct ggml_tensor * ggml_rms_norm_back(
|
5863
6070
|
struct ggml_context * ctx,
|
5864
6071
|
struct ggml_tensor * a,
|
5865
|
-
struct ggml_tensor * b
|
6072
|
+
struct ggml_tensor * b,
|
6073
|
+
float eps) {
|
5866
6074
|
bool is_node = false;
|
5867
6075
|
|
5868
6076
|
if (a->grad) {
|
@@ -5872,6 +6080,8 @@ struct ggml_tensor * ggml_rms_norm_back(
|
|
5872
6080
|
|
5873
6081
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
5874
6082
|
|
6083
|
+
ggml_set_op_params(result, &eps, sizeof(eps));
|
6084
|
+
|
5875
6085
|
result->op = GGML_OP_RMS_NORM_BACK;
|
5876
6086
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5877
6087
|
result->src[0] = a;
|
@@ -6201,7 +6411,7 @@ struct ggml_tensor * ggml_reshape(
|
|
6201
6411
|
//GGML_ASSERT(false);
|
6202
6412
|
}
|
6203
6413
|
|
6204
|
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a
|
6414
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
|
6205
6415
|
ggml_format_name(result, "%s (reshaped)", a->name);
|
6206
6416
|
|
6207
6417
|
result->op = GGML_OP_RESHAPE;
|
@@ -6225,7 +6435,7 @@ struct ggml_tensor * ggml_reshape_1d(
|
|
6225
6435
|
}
|
6226
6436
|
|
6227
6437
|
const int64_t ne[1] = { ne0 };
|
6228
|
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a
|
6438
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
|
6229
6439
|
ggml_format_name(result, "%s (reshaped)", a->name);
|
6230
6440
|
|
6231
6441
|
result->op = GGML_OP_RESHAPE;
|
@@ -6250,7 +6460,7 @@ struct ggml_tensor * ggml_reshape_2d(
|
|
6250
6460
|
}
|
6251
6461
|
|
6252
6462
|
const int64_t ne[2] = { ne0, ne1 };
|
6253
|
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a
|
6463
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
|
6254
6464
|
ggml_format_name(result, "%s (reshaped)", a->name);
|
6255
6465
|
|
6256
6466
|
result->op = GGML_OP_RESHAPE;
|
@@ -6276,7 +6486,7 @@ struct ggml_tensor * ggml_reshape_3d(
|
|
6276
6486
|
}
|
6277
6487
|
|
6278
6488
|
const int64_t ne[3] = { ne0, ne1, ne2 };
|
6279
|
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a
|
6489
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
|
6280
6490
|
ggml_format_name(result, "%s (reshaped)", a->name);
|
6281
6491
|
|
6282
6492
|
result->op = GGML_OP_RESHAPE;
|
@@ -6286,7 +6496,6 @@ struct ggml_tensor * ggml_reshape_3d(
|
|
6286
6496
|
return result;
|
6287
6497
|
}
|
6288
6498
|
|
6289
|
-
|
6290
6499
|
struct ggml_tensor * ggml_reshape_4d(
|
6291
6500
|
struct ggml_context * ctx,
|
6292
6501
|
struct ggml_tensor * a,
|
@@ -6304,7 +6513,7 @@ struct ggml_tensor * ggml_reshape_4d(
|
|
6304
6513
|
}
|
6305
6514
|
|
6306
6515
|
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
|
6307
|
-
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a
|
6516
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
|
6308
6517
|
ggml_format_name(result, "%s (reshaped)", a->name);
|
6309
6518
|
|
6310
6519
|
result->op = GGML_OP_RESHAPE;
|
@@ -6314,46 +6523,40 @@ struct ggml_tensor * ggml_reshape_4d(
|
|
6314
6523
|
return result;
|
6315
6524
|
}
|
6316
6525
|
|
6317
|
-
|
6318
|
-
|
6319
|
-
static struct ggml_tensor * ggml_view_tensor_offset(
|
6526
|
+
static struct ggml_tensor * ggml_view_impl(
|
6320
6527
|
struct ggml_context * ctx,
|
6321
6528
|
struct ggml_tensor * a,
|
6322
6529
|
int n_dims,
|
6323
6530
|
const int64_t * ne,
|
6324
6531
|
size_t offset) {
|
6325
|
-
// don't calculate an offset from an unallocated tensor
|
6326
|
-
void * data = NULL;
|
6327
|
-
if (a->data != NULL) {
|
6328
|
-
data = (char *) a->data + offset;
|
6329
|
-
}
|
6330
6532
|
|
6331
|
-
|
6533
|
+
bool is_node = false;
|
6534
|
+
|
6535
|
+
if (a->grad) {
|
6536
|
+
is_node = true;
|
6537
|
+
}
|
6332
6538
|
|
6539
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
|
6333
6540
|
ggml_format_name(result, "%s (view)", a->name);
|
6334
6541
|
|
6335
6542
|
ggml_set_op_params(result, &offset, sizeof(offset));
|
6336
6543
|
|
6544
|
+
result->op = GGML_OP_VIEW;
|
6545
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6546
|
+
result->src[0] = a;
|
6547
|
+
|
6337
6548
|
return result;
|
6338
6549
|
}
|
6339
6550
|
|
6551
|
+
// ggml_view_1d
|
6552
|
+
|
6340
6553
|
struct ggml_tensor * ggml_view_1d(
|
6341
6554
|
struct ggml_context * ctx,
|
6342
6555
|
struct ggml_tensor * a,
|
6343
6556
|
int64_t ne0,
|
6344
6557
|
size_t offset) {
|
6345
6558
|
|
6346
|
-
|
6347
|
-
|
6348
|
-
if (a->grad) {
|
6349
|
-
is_node = true;
|
6350
|
-
}
|
6351
|
-
|
6352
|
-
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
|
6353
|
-
|
6354
|
-
result->op = GGML_OP_VIEW;
|
6355
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6356
|
-
result->src[0] = a;
|
6559
|
+
struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
|
6357
6560
|
|
6358
6561
|
return result;
|
6359
6562
|
}
|
@@ -6368,24 +6571,14 @@ struct ggml_tensor * ggml_view_2d(
|
|
6368
6571
|
size_t nb1,
|
6369
6572
|
size_t offset) {
|
6370
6573
|
|
6371
|
-
|
6372
|
-
|
6373
|
-
if (a->grad) {
|
6374
|
-
is_node = true;
|
6375
|
-
}
|
6376
|
-
|
6377
|
-
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
6574
|
+
const int64_t ne[2] = { ne0, ne1 };
|
6378
6575
|
|
6379
|
-
struct ggml_tensor * result =
|
6576
|
+
struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
|
6380
6577
|
|
6381
6578
|
result->nb[1] = nb1;
|
6382
6579
|
result->nb[2] = result->nb[1]*ne1;
|
6383
6580
|
result->nb[3] = result->nb[2];
|
6384
6581
|
|
6385
|
-
result->op = GGML_OP_VIEW;
|
6386
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6387
|
-
result->src[0] = a;
|
6388
|
-
|
6389
6582
|
return result;
|
6390
6583
|
}
|
6391
6584
|
|
@@ -6401,24 +6594,14 @@ struct ggml_tensor * ggml_view_3d(
|
|
6401
6594
|
size_t nb2,
|
6402
6595
|
size_t offset) {
|
6403
6596
|
|
6404
|
-
|
6405
|
-
|
6406
|
-
if (a->grad) {
|
6407
|
-
is_node = true;
|
6408
|
-
}
|
6409
|
-
|
6410
|
-
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
6597
|
+
const int64_t ne[3] = { ne0, ne1, ne2 };
|
6411
6598
|
|
6412
|
-
struct ggml_tensor * result =
|
6599
|
+
struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
|
6413
6600
|
|
6414
6601
|
result->nb[1] = nb1;
|
6415
6602
|
result->nb[2] = nb2;
|
6416
6603
|
result->nb[3] = result->nb[2]*ne2;
|
6417
6604
|
|
6418
|
-
result->op = GGML_OP_VIEW;
|
6419
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6420
|
-
result->src[0] = a;
|
6421
|
-
|
6422
6605
|
return result;
|
6423
6606
|
}
|
6424
6607
|
|
@@ -6436,24 +6619,14 @@ struct ggml_tensor * ggml_view_4d(
|
|
6436
6619
|
size_t nb3,
|
6437
6620
|
size_t offset) {
|
6438
6621
|
|
6439
|
-
|
6440
|
-
|
6441
|
-
if (a->grad) {
|
6442
|
-
is_node = true;
|
6443
|
-
}
|
6444
|
-
|
6445
|
-
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
6622
|
+
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
|
6446
6623
|
|
6447
|
-
struct ggml_tensor * result =
|
6624
|
+
struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
|
6448
6625
|
|
6449
6626
|
result->nb[1] = nb1;
|
6450
6627
|
result->nb[2] = nb2;
|
6451
6628
|
result->nb[3] = nb3;
|
6452
6629
|
|
6453
|
-
result->op = GGML_OP_VIEW;
|
6454
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6455
|
-
result->src[0] = a;
|
6456
|
-
|
6457
6630
|
return result;
|
6458
6631
|
}
|
6459
6632
|
|
@@ -6640,7 +6813,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
|
|
6640
6813
|
|
6641
6814
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6642
6815
|
|
6643
|
-
int32_t params[] = { n_past
|
6816
|
+
int32_t params[] = { n_past };
|
6644
6817
|
ggml_set_op_params(result, params, sizeof(params));
|
6645
6818
|
|
6646
6819
|
result->op = GGML_OP_DIAG_MASK_INF;
|
@@ -6657,7 +6830,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
|
|
6657
6830
|
return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
|
6658
6831
|
}
|
6659
6832
|
|
6660
|
-
|
6661
6833
|
struct ggml_tensor * ggml_diag_mask_inf_inplace(
|
6662
6834
|
struct ggml_context * ctx,
|
6663
6835
|
struct ggml_tensor * a,
|
@@ -6680,7 +6852,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
|
|
6680
6852
|
|
6681
6853
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6682
6854
|
|
6683
|
-
int32_t params[] = { n_past
|
6855
|
+
int32_t params[] = { n_past };
|
6684
6856
|
ggml_set_op_params(result, params, sizeof(params));
|
6685
6857
|
|
6686
6858
|
result->op = GGML_OP_DIAG_MASK_ZERO;
|
@@ -7097,11 +7269,13 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
|
|
7097
7269
|
};
|
7098
7270
|
|
7099
7271
|
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
7272
|
+
|
7273
|
+
ggml_set_op_params_i32(result, 0, stride);
|
7274
|
+
|
7100
7275
|
result->op = GGML_OP_CONV_TRANSPOSE_2D;
|
7101
7276
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7102
7277
|
result->src[0] = a;
|
7103
7278
|
result->src[1] = b;
|
7104
|
-
result->src[2] = ggml_new_i32(ctx, stride);
|
7105
7279
|
|
7106
7280
|
return result;
|
7107
7281
|
}
|
@@ -9446,6 +9620,8 @@ static void ggml_compute_forward_div_f32(
|
|
9446
9620
|
|
9447
9621
|
|
9448
9622
|
#ifdef GGML_USE_ACCELERATE
|
9623
|
+
UNUSED(ggml_vec_div_f32);
|
9624
|
+
|
9449
9625
|
vDSP_vdiv(
|
9450
9626
|
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
9451
9627
|
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
|
@@ -10752,7 +10928,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
|
10752
10928
|
|
10753
10929
|
GGML_TENSOR_BINARY_OP_LOCALS;
|
10754
10930
|
|
10755
|
-
|
10931
|
+
float eps;
|
10932
|
+
memcpy(&eps, dst->op_params, sizeof(float));
|
10756
10933
|
|
10757
10934
|
// TODO: optimize
|
10758
10935
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
@@ -11930,8 +12107,8 @@ static void ggml_compute_forward_diag_mask_f32(
|
|
11930
12107
|
const int ith = params->ith;
|
11931
12108
|
const int nth = params->nth;
|
11932
12109
|
|
11933
|
-
const int n_past =
|
11934
|
-
const bool inplace =
|
12110
|
+
const int n_past = ((int32_t *) dst->op_params)[0];
|
12111
|
+
const bool inplace = src0->data == dst->data;
|
11935
12112
|
|
11936
12113
|
GGML_ASSERT(n_past >= 0);
|
11937
12114
|
|
@@ -12142,6 +12319,7 @@ static void ggml_compute_forward_soft_max_back_f32(
|
|
12142
12319
|
// dx = J * dy
|
12143
12320
|
// dxk = sum_i(Jki * dyi)
|
12144
12321
|
// dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
|
12322
|
+
// dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
|
12145
12323
|
// dxk = sum_i(-yk*yi * dyi) + yk*dyk
|
12146
12324
|
// dxk = -yk * sum_i(yi * dyi) + yk*dyk
|
12147
12325
|
// dxk = -yk * dot(y, dy) + yk*dyk
|
@@ -13497,7 +13675,6 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|
13497
13675
|
const struct ggml_compute_params * params,
|
13498
13676
|
const struct ggml_tensor * src0,
|
13499
13677
|
const struct ggml_tensor * src1,
|
13500
|
-
const struct ggml_tensor * opt0,
|
13501
13678
|
struct ggml_tensor * dst) {
|
13502
13679
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
13503
13680
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
@@ -13557,7 +13734,7 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|
13557
13734
|
return;
|
13558
13735
|
}
|
13559
13736
|
|
13560
|
-
const int32_t stride = (
|
13737
|
+
const int32_t stride = ggml_get_op_params_i32(dst, 0);
|
13561
13738
|
|
13562
13739
|
// total patches in dst
|
13563
13740
|
const int np = ne2;
|
@@ -13570,7 +13747,7 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|
13570
13747
|
const int ip1 = MIN(ip0 + dp, np);
|
13571
13748
|
|
13572
13749
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
13573
|
-
ggml_fp16_t * const wdata_src =
|
13750
|
+
ggml_fp16_t * const wdata_src = wdata + nk;
|
13574
13751
|
|
13575
13752
|
for (int i2 = ip0; i2 < ip1; i2++) { // Cout
|
13576
13753
|
float * dst_data = (float *)((char *) dst->data + i2*nb2);
|
@@ -13582,9 +13759,8 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|
13582
13759
|
for (int i00 = 0; i00 < ne00; i00++) {
|
13583
13760
|
float v = 0;
|
13584
13761
|
ggml_vec_dot_f16(ne03, &v,
|
13585
|
-
|
13586
|
-
|
13587
|
-
|
13762
|
+
wdata_src + i1n,
|
13763
|
+
wdata_kernel + i01*ne00*ne03 + i00*ne03);
|
13588
13764
|
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
13589
13765
|
}
|
13590
13766
|
}
|
@@ -13934,7 +14110,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
13934
14110
|
vvexpf(S, S, &Mup);
|
13935
14111
|
ggml_vec_sum_f32(Mup, &sum, S);
|
13936
14112
|
#else
|
13937
|
-
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
14113
|
+
uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
|
13938
14114
|
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
13939
14115
|
|
13940
14116
|
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
@@ -13944,9 +14120,13 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
13944
14120
|
if (SS[j] == -INFINITY) {
|
13945
14121
|
SS[j] = 0.0f;
|
13946
14122
|
} else {
|
14123
|
+
#ifndef GGML_FLASH_ATTN_EXP_FP16
|
14124
|
+
const float val = expf(SS[j] - max);
|
14125
|
+
#else
|
13947
14126
|
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
13948
14127
|
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
13949
14128
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
14129
|
+
#endif
|
13950
14130
|
sump[j] += (ggml_float)val;
|
13951
14131
|
SS[j] = val;
|
13952
14132
|
}
|
@@ -14524,7 +14704,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|
14524
14704
|
vvexpf(SM, SM, &Mup);
|
14525
14705
|
ggml_vec_sum_f32(Mup, &sum, SM);
|
14526
14706
|
#else
|
14527
|
-
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
14707
|
+
uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
|
14528
14708
|
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
14529
14709
|
|
14530
14710
|
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
@@ -14535,9 +14715,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|
14535
14715
|
if (SR[j] == -INFINITY) {
|
14536
14716
|
SW[j] = 0.0f;
|
14537
14717
|
} else {
|
14718
|
+
#ifndef GGML_FLASH_ATTN_EXP_FP16
|
14719
|
+
const float val = expf(SR[j] - max);
|
14720
|
+
#else
|
14538
14721
|
ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
|
14539
14722
|
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
14540
14723
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
14724
|
+
#endif
|
14541
14725
|
sump[j] += (ggml_float)val;
|
14542
14726
|
SW[j] = val;
|
14543
14727
|
}
|
@@ -15275,6 +15459,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
15275
15459
|
const int nc = src0->ne[0];
|
15276
15460
|
const int nr = ggml_nrows(src0);
|
15277
15461
|
|
15462
|
+
GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
|
15463
|
+
|
15278
15464
|
if (params->type == GGML_TASK_INIT) {
|
15279
15465
|
if (ith == 0) {
|
15280
15466
|
memset(sums, 0, sizeof(float) * (nth + nth * nc));
|
@@ -15286,7 +15472,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
15286
15472
|
if (ith == 0) {
|
15287
15473
|
float * dp = (float *) dst->data;
|
15288
15474
|
ggml_vec_sum_f32(nth, dp, sums);
|
15289
|
-
dp[0] *= -1.0f;
|
15475
|
+
dp[0] *= -1.0f / (float) nr;
|
15290
15476
|
}
|
15291
15477
|
return;
|
15292
15478
|
}
|
@@ -15303,7 +15489,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
15303
15489
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
15304
15490
|
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
15305
15491
|
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
15306
|
-
float * st = (float *) params->wdata + nth + ith*nc;
|
15492
|
+
float * st = ((float *) params->wdata) + nth + ith*nc;
|
15307
15493
|
|
15308
15494
|
#ifndef NDEBUG
|
15309
15495
|
for (int i = 0; i < nc; ++i) {
|
@@ -15318,15 +15504,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
15318
15504
|
float max = -INFINITY;
|
15319
15505
|
ggml_vec_max_f32(nc, &max, s0);
|
15320
15506
|
|
15321
|
-
uint16_t scvt;
|
15507
|
+
uint16_t scvt; UNUSED(scvt);
|
15322
15508
|
for (int i = 0; i < nc; i++) {
|
15323
15509
|
if (s0[i] == -INFINITY) {
|
15324
15510
|
st[i] = 0.0f;
|
15325
15511
|
} else {
|
15326
|
-
|
15512
|
+
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
15513
|
+
const float s = s0[i] - max;
|
15514
|
+
const float val = expf(s);
|
15515
|
+
#else
|
15327
15516
|
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
15328
15517
|
memcpy(&scvt, &s, sizeof(scvt));
|
15329
15518
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
15519
|
+
#endif
|
15330
15520
|
sum += (ggml_float)val;
|
15331
15521
|
st[i] = val;
|
15332
15522
|
}
|
@@ -15342,7 +15532,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
15342
15532
|
ggml_vec_log_f32(nc, st, st);
|
15343
15533
|
ggml_vec_mul_f32(nc, st, st, s1);
|
15344
15534
|
|
15345
|
-
|
15535
|
+
float st_sum = 0;
|
15536
|
+
ggml_vec_sum_f32(nc, &st_sum, st);
|
15537
|
+
sums[ith] += st_sum;
|
15346
15538
|
|
15347
15539
|
#ifndef NDEBUG
|
15348
15540
|
for (int i = 0; i < nc; ++i) {
|
@@ -15392,7 +15584,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
15392
15584
|
return;
|
15393
15585
|
}
|
15394
15586
|
|
15395
|
-
const
|
15587
|
+
const double eps = 1e-9;
|
15396
15588
|
|
15397
15589
|
// TODO: handle transposed/permuted matrices
|
15398
15590
|
const int64_t nc = src0->ne[0];
|
@@ -15411,7 +15603,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
15411
15603
|
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
|
15412
15604
|
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
15413
15605
|
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
15414
|
-
float * sm = (float *) params->wdata + ith*nc;
|
15415
15606
|
|
15416
15607
|
#ifndef NDEBUG
|
15417
15608
|
for (int i = 0; i < nc; ++i) {
|
@@ -15420,54 +15611,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
15420
15611
|
assert(!isnan(s1[i]));
|
15421
15612
|
}
|
15422
15613
|
#endif
|
15423
|
-
// step by step explanation:
|
15424
|
-
{
|
15425
|
-
//float * sums = (float *) params->wdata;
|
15426
|
-
|
15427
|
-
// forward pass with annotated gradients from backward pass
|
15428
|
-
// (built by going in reverse operation order, adding to gradients of current operation args)
|
15429
|
-
// st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
|
15430
|
-
// from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
15431
|
-
// ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
|
15432
|
-
// ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
|
15433
|
-
// ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
|
15434
|
-
// ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
|
15435
|
-
// ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
|
15436
|
-
// ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
|
15437
|
-
|
15438
|
-
// substitute into grad[st1], because we can reuse softmax_back from this point on
|
15439
|
-
// grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
|
15440
|
-
// postorder:
|
15441
|
-
// grad[st1] := softmax(s0)
|
15442
|
-
// grad[st1] := grad[st1]*(1.0 - eps)
|
15443
|
-
// grad[st1] := grad[st1] + eps
|
15444
|
-
// grad[st1] := s1 / grad[st1]
|
15445
|
-
// grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
|
15446
|
-
|
15447
|
-
// src0 gradients by going through softmax_back
|
15448
|
-
// grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
15449
|
-
// from softmax_back:
|
15450
|
-
// dxk = yk * (dyk - dot(y, dy))
|
15451
|
-
// dot_y_dy := dot(y, dy)
|
15452
|
-
// dx := dy
|
15453
|
-
// dx := dx - dot_y_dy
|
15454
|
-
// dx := dx * y
|
15455
|
-
// postorder:
|
15456
|
-
// dot_st1_dst1 := dot(st1, grad[st1])
|
15457
|
-
// grad[s0] := grad[st1]
|
15458
|
-
// grad[s0] := grad[s0] - dot_st1_dst1
|
15459
|
-
// grad[s0] := grad[s0] * st1
|
15460
|
-
|
15461
|
-
// prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
|
15462
|
-
// sm := softmax(s0)
|
15463
|
-
// grad[s0] := sm*(1.0 - eps)
|
15464
|
-
// grad[s0] := grad[s0] + eps
|
15465
|
-
// grad[s0] := s1 / grad[s0]
|
15466
|
-
// grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
|
15467
|
-
// dot_st1_dst1 := dot(sm, grad[s0])
|
15468
|
-
// grad[s0] := grad[s0] - dot_st1_dst1
|
15469
|
-
// grad[s0] := grad[s0] * sm
|
15470
|
-
}
|
15471
15614
|
|
15472
15615
|
// soft_max
|
15473
15616
|
ggml_float sum = 0.0;
|
@@ -15475,39 +15618,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
15475
15618
|
float max = -INFINITY;
|
15476
15619
|
ggml_vec_max_f32(nc, &max, s0);
|
15477
15620
|
|
15478
|
-
uint16_t scvt;
|
15621
|
+
uint16_t scvt; UNUSED(scvt);
|
15479
15622
|
for (int i = 0; i < nc; i++) {
|
15480
15623
|
if (s0[i] == -INFINITY) {
|
15481
|
-
|
15624
|
+
ds0[i] = 0.0f;
|
15482
15625
|
} else {
|
15483
|
-
|
15626
|
+
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
15627
|
+
const float s = s0[i] - max;
|
15628
|
+
const float val = expf(s);
|
15629
|
+
#else
|
15484
15630
|
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
15485
15631
|
memcpy(&scvt, &s, sizeof(scvt));
|
15486
15632
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
15633
|
+
#endif
|
15487
15634
|
sum += (ggml_float)val;
|
15488
|
-
|
15635
|
+
ds0[i] = val;
|
15489
15636
|
}
|
15490
15637
|
}
|
15491
15638
|
|
15492
15639
|
assert(sum > 0.0);
|
15493
|
-
sum = 1.0/sum;
|
15640
|
+
sum = (1.0 - eps)/sum;
|
15494
15641
|
}
|
15495
15642
|
|
15496
|
-
|
15497
|
-
ggml_vec_scale_f32(nc,
|
15498
|
-
|
15499
|
-
|
15500
|
-
|
15501
|
-
|
15502
|
-
ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
|
15503
|
-
ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
|
15504
|
-
ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
|
15505
|
-
ggml_vec_mul_f32 (nc, ds0, ds0, sm);
|
15643
|
+
// grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
|
15644
|
+
ggml_vec_scale_f32(nc, ds0, sum);
|
15645
|
+
ggml_vec_add1_f32(nc, ds0, ds0, eps);
|
15646
|
+
ggml_vec_sub_f32(nc, ds0, ds0, s1);
|
15647
|
+
ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
|
15648
|
+
|
15506
15649
|
|
15507
15650
|
#ifndef NDEBUG
|
15508
15651
|
for (int i = 0; i < nc; ++i) {
|
15509
|
-
assert(!isnan(sm[i]));
|
15510
|
-
assert(!isinf(sm[i]));
|
15511
15652
|
assert(!isnan(ds0[i]));
|
15512
15653
|
assert(!isinf(ds0[i]));
|
15513
15654
|
}
|
@@ -15731,7 +15872,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
15731
15872
|
} break;
|
15732
15873
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
15733
15874
|
{
|
15734
|
-
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor
|
15875
|
+
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
|
15735
15876
|
} break;
|
15736
15877
|
case GGML_OP_POOL_1D:
|
15737
15878
|
{
|
@@ -16062,9 +16203,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
16062
16203
|
{
|
16063
16204
|
// necessary for llama
|
16064
16205
|
if (src0->grad) {
|
16206
|
+
float eps;
|
16207
|
+
memcpy(&eps, tensor->op_params, sizeof(float));
|
16208
|
+
|
16065
16209
|
src0->grad = ggml_add_impl(ctx,
|
16066
16210
|
src0->grad,
|
16067
|
-
ggml_rms_norm_back(ctx, src0, tensor->grad),
|
16211
|
+
ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
|
16068
16212
|
inplace);
|
16069
16213
|
}
|
16070
16214
|
} break;
|
@@ -16832,9 +16976,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|
16832
16976
|
return result;
|
16833
16977
|
}
|
16834
16978
|
|
16835
|
-
|
16836
|
-
struct ggml_cgraph result = *gf;
|
16837
|
-
|
16979
|
+
void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
|
16838
16980
|
GGML_ASSERT(gf->n_nodes > 0);
|
16839
16981
|
|
16840
16982
|
// if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
|
@@ -16858,15 +17000,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
|
|
16858
17000
|
}
|
16859
17001
|
}
|
16860
17002
|
|
16861
|
-
for (int i =
|
17003
|
+
for (int i = 0; i < gf->n_nodes; i++) {
|
16862
17004
|
struct ggml_tensor * node = gf->nodes[i];
|
16863
17005
|
|
16864
17006
|
if (node->is_param) {
|
16865
17007
|
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
16866
|
-
ggml_build_forward_expand(
|
17008
|
+
ggml_build_forward_expand(gb, node->grad);
|
16867
17009
|
}
|
16868
17010
|
}
|
17011
|
+
}
|
16869
17012
|
|
17013
|
+
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
|
17014
|
+
struct ggml_cgraph result = *gf;
|
17015
|
+
ggml_build_backward_expand(ctx, gf, &result, keep);
|
16870
17016
|
return result;
|
16871
17017
|
}
|
16872
17018
|
|
@@ -17542,10 +17688,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
17542
17688
|
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
17543
17689
|
{
|
17544
17690
|
n_tasks = n_threads;
|
17545
|
-
|
17546
|
-
size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
|
17547
|
-
|
17548
|
-
work_size = MAX(work_size, cur);
|
17549
17691
|
} break;
|
17550
17692
|
case GGML_OP_NONE:
|
17551
17693
|
{
|
@@ -18423,14 +18565,16 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
18423
18565
|
struct ggml_opt_params params,
|
18424
18566
|
struct ggml_tensor * f,
|
18425
18567
|
struct ggml_cgraph * gf,
|
18426
|
-
struct ggml_cgraph * gb
|
18568
|
+
struct ggml_cgraph * gb,
|
18569
|
+
ggml_opt_callback callback,
|
18570
|
+
void * callback_data) {
|
18427
18571
|
GGML_ASSERT(ggml_is_scalar(f));
|
18428
18572
|
|
18429
18573
|
// these will store the parameters we want to optimize
|
18430
18574
|
struct ggml_tensor * ps[GGML_MAX_PARAMS];
|
18431
18575
|
|
18432
18576
|
int np = 0;
|
18433
|
-
|
18577
|
+
int64_t nx = 0;
|
18434
18578
|
for (int i = 0; i < gf->n_nodes; ++i) {
|
18435
18579
|
if (gf->nodes[i]->is_param) {
|
18436
18580
|
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
@@ -18449,31 +18593,32 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
18449
18593
|
}
|
18450
18594
|
|
18451
18595
|
// constants
|
18452
|
-
|
18453
|
-
const float
|
18454
|
-
const float
|
18596
|
+
float sched = params.adam.sched;
|
18597
|
+
const float alpha = params.adam.alpha;
|
18598
|
+
const float decay = params.adam.decay * alpha;
|
18455
18599
|
const float beta1 = params.adam.beta1;
|
18456
18600
|
const float beta2 = params.adam.beta2;
|
18457
18601
|
const float eps = params.adam.eps;
|
18602
|
+
const float gclip = params.adam.gclip;
|
18603
|
+
const int decay_min_ndim = params.adam.decay_min_ndim;
|
18458
18604
|
|
18459
|
-
float * x = opt->adam.x->data; // view of the parameters
|
18460
|
-
float * g1 = opt->adam.g1->data; // gradient
|
18461
|
-
float * g2 = opt->adam.g2->data; // gradient squared
|
18462
18605
|
float * m = opt->adam.m->data; // first moment
|
18463
18606
|
float * v = opt->adam.v->data; // second moment
|
18464
|
-
float * mh = opt->adam.mh->data; // first moment hat
|
18465
|
-
float * vh = opt->adam.vh->data; // second moment hat
|
18466
18607
|
|
18467
18608
|
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
|
18468
18609
|
|
18469
|
-
|
18470
|
-
|
18610
|
+
if (callback) {
|
18611
|
+
callback(callback_data, &sched);
|
18612
|
+
}
|
18471
18613
|
|
18472
18614
|
// compute the function value
|
18473
18615
|
ggml_graph_reset (gf);
|
18474
18616
|
ggml_set_f32 (f->grad, 1.0f);
|
18475
18617
|
|
18476
|
-
|
18618
|
+
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
|
18619
|
+
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
|
18620
|
+
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
18621
|
+
ggml_graph_compute(gb, &cplan);
|
18477
18622
|
|
18478
18623
|
opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
|
18479
18624
|
opt->adam.fx_best = opt->adam.fx_prev;
|
@@ -18481,6 +18626,9 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
18481
18626
|
pf[opt->iter % params.past] = opt->adam.fx_prev;
|
18482
18627
|
}
|
18483
18628
|
|
18629
|
+
opt->loss_before = opt->adam.fx_prev;
|
18630
|
+
opt->loss_after = opt->adam.fx_prev;
|
18631
|
+
|
18484
18632
|
// initialize
|
18485
18633
|
if (opt->just_initialized) {
|
18486
18634
|
opt->adam.n_no_improvement = 0;
|
@@ -18513,50 +18661,55 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
18513
18661
|
UNUSED(t_start_cpu);
|
18514
18662
|
|
18515
18663
|
{
|
18516
|
-
|
18517
|
-
|
18518
|
-
|
18519
|
-
|
18520
|
-
|
18521
|
-
|
18522
|
-
|
18523
|
-
|
18524
|
-
|
18525
|
-
|
18526
|
-
|
18527
|
-
|
18528
|
-
|
18529
|
-
|
18530
|
-
|
18531
|
-
|
18532
|
-
|
18533
|
-
|
18534
|
-
|
18535
|
-
|
18536
|
-
|
18537
|
-
|
18538
|
-
|
18539
|
-
|
18540
|
-
|
18541
|
-
|
18542
|
-
|
18543
|
-
|
18544
|
-
|
18545
|
-
|
18546
|
-
|
18547
|
-
|
18548
|
-
|
18664
|
+
float gnorm = 1.0f;
|
18665
|
+
if (gclip > 0.0f) {
|
18666
|
+
// gradient clipping
|
18667
|
+
ggml_float sum = 0.0;
|
18668
|
+
for (int p = 0; p < np; ++p) {
|
18669
|
+
const int64_t ne = ggml_nelements(ps[p]);
|
18670
|
+
for (int64_t j = 0; j < ne; ++j) {
|
18671
|
+
float g = ggml_get_f32_1d(ps[p]->grad, j);
|
18672
|
+
sum += (ggml_float)(g*g);
|
18673
|
+
}
|
18674
|
+
}
|
18675
|
+
ggml_float norm = sqrt(sum);
|
18676
|
+
if (norm > (ggml_float) gclip) {
|
18677
|
+
gnorm = (float) ((ggml_float) gclip / norm);
|
18678
|
+
}
|
18679
|
+
}
|
18680
|
+
const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
|
18681
|
+
const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
|
18682
|
+
int64_t i = 0;
|
18683
|
+
for (int p = 0; p < np; ++p) {
|
18684
|
+
const int64_t ne = ggml_nelements(ps[p]);
|
18685
|
+
const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
|
18686
|
+
for (int64_t j = 0; j < ne; ++j) {
|
18687
|
+
float x = ggml_get_f32_1d(ps[p], j);
|
18688
|
+
float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
|
18689
|
+
m[i] = m[i]*beta1 + g*(1.0f - beta1);
|
18690
|
+
v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
|
18691
|
+
float mh = m[i]*beta1h;
|
18692
|
+
float vh = v[i]*beta2h;
|
18693
|
+
vh = sqrtf(vh) + eps;
|
18694
|
+
x = x*(1.0f - p_decay) - mh/vh;
|
18695
|
+
ggml_set_f32_1d(ps[p], j, x);
|
18696
|
+
++i;
|
18697
|
+
}
|
18698
|
+
}
|
18699
|
+
}
|
18549
18700
|
|
18550
|
-
|
18551
|
-
|
18701
|
+
if (callback) {
|
18702
|
+
callback(callback_data, &sched);
|
18552
18703
|
}
|
18553
18704
|
|
18554
18705
|
ggml_graph_reset (gf);
|
18555
18706
|
ggml_set_f32 (f->grad, 1.0f);
|
18556
18707
|
|
18557
|
-
|
18708
|
+
ggml_graph_compute(gb, &cplan);
|
18558
18709
|
|
18559
18710
|
const float fx = ggml_get_f32_1d(f, 0);
|
18711
|
+
opt->loss_after = fx;
|
18712
|
+
|
18560
18713
|
|
18561
18714
|
// check convergence
|
18562
18715
|
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
|
@@ -18625,7 +18778,6 @@ struct ggml_lbfgs_iteration_data {
|
|
18625
18778
|
};
|
18626
18779
|
|
18627
18780
|
static enum ggml_opt_result linesearch_backtracking(
|
18628
|
-
struct ggml_context * ctx,
|
18629
18781
|
const struct ggml_opt_params * params,
|
18630
18782
|
int nx,
|
18631
18783
|
float * x,
|
@@ -18637,8 +18789,11 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
18637
18789
|
struct ggml_tensor * f,
|
18638
18790
|
struct ggml_cgraph * gf,
|
18639
18791
|
struct ggml_cgraph * gb,
|
18792
|
+
struct ggml_cplan * cplan,
|
18640
18793
|
const int np,
|
18641
|
-
struct ggml_tensor * ps[]
|
18794
|
+
struct ggml_tensor * ps[],
|
18795
|
+
ggml_opt_callback callback,
|
18796
|
+
void * callback_data) {
|
18642
18797
|
int count = 0;
|
18643
18798
|
|
18644
18799
|
float width = 0.0f;
|
@@ -18667,6 +18822,12 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
18667
18822
|
dgtest = params->lbfgs.ftol*dginit;
|
18668
18823
|
|
18669
18824
|
while (true) {
|
18825
|
+
if (callback) {
|
18826
|
+
// LBFG-S does not support learning rate -> ignore learning schedule
|
18827
|
+
float sched = 0;
|
18828
|
+
callback(callback_data, &sched);
|
18829
|
+
}
|
18830
|
+
|
18670
18831
|
ggml_vec_cpy_f32(nx, x, xp);
|
18671
18832
|
ggml_vec_mad_f32(nx, x, d, *step);
|
18672
18833
|
|
@@ -18677,7 +18838,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
18677
18838
|
ggml_graph_reset (gf);
|
18678
18839
|
ggml_set_f32 (f->grad, 1.0f);
|
18679
18840
|
|
18680
|
-
|
18841
|
+
ggml_graph_compute(gb, cplan);
|
18681
18842
|
|
18682
18843
|
ggml_opt_get_grad(np, ps, g);
|
18683
18844
|
|
@@ -18737,7 +18898,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18737
18898
|
struct ggml_opt_params params,
|
18738
18899
|
struct ggml_tensor * f,
|
18739
18900
|
struct ggml_cgraph * gf,
|
18740
|
-
struct ggml_cgraph * gb
|
18901
|
+
struct ggml_cgraph * gb,
|
18902
|
+
ggml_opt_callback callback,
|
18903
|
+
void * callback_data) {
|
18741
18904
|
if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
|
18742
18905
|
params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
|
18743
18906
|
if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
|
@@ -18769,6 +18932,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18769
18932
|
opt->iter = iter;
|
18770
18933
|
}
|
18771
18934
|
|
18935
|
+
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
|
18936
|
+
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
|
18937
|
+
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
18938
|
+
|
18772
18939
|
float * x = opt->lbfgs.x->data; // current parameters
|
18773
18940
|
float * xp = opt->lbfgs.xp->data; // previous parameters
|
18774
18941
|
float * g = opt->lbfgs.g->data; // current gradient
|
@@ -18790,6 +18957,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18790
18957
|
float * lm_s = opt->lbfgs.lms->data;
|
18791
18958
|
float * lm_y = opt->lbfgs.lmy->data;
|
18792
18959
|
|
18960
|
+
if (callback) {
|
18961
|
+
// LBFG-S does not support learning rate -> ignore learning schedule
|
18962
|
+
float sched = 0;
|
18963
|
+
callback(callback_data, &sched);
|
18964
|
+
}
|
18965
|
+
|
18793
18966
|
// evaluate the function value and its gradient
|
18794
18967
|
{
|
18795
18968
|
ggml_opt_set_params(np, ps, x);
|
@@ -18797,11 +18970,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18797
18970
|
ggml_graph_reset (gf);
|
18798
18971
|
ggml_set_f32 (f->grad, 1.0f);
|
18799
18972
|
|
18800
|
-
|
18973
|
+
ggml_graph_compute(gb, &cplan);
|
18801
18974
|
|
18802
18975
|
ggml_opt_get_grad(np, ps, g);
|
18803
18976
|
|
18804
18977
|
fx = ggml_get_f32_1d(f, 0);
|
18978
|
+
|
18979
|
+
opt->loss_before = fx;
|
18980
|
+
opt->loss_after = fx;
|
18805
18981
|
}
|
18806
18982
|
|
18807
18983
|
// search direction = -gradient
|
@@ -18856,7 +19032,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18856
19032
|
ggml_vec_cpy_f32(nx, xp, x);
|
18857
19033
|
ggml_vec_cpy_f32(nx, gp, g);
|
18858
19034
|
|
18859
|
-
ls = linesearch_backtracking(
|
19035
|
+
ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
|
18860
19036
|
|
18861
19037
|
if (ls < 0) {
|
18862
19038
|
// linesearch failed - go back to the previous point and return
|
@@ -18866,6 +19042,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18866
19042
|
return ls;
|
18867
19043
|
}
|
18868
19044
|
|
19045
|
+
opt->loss_after = fx;
|
19046
|
+
|
18869
19047
|
ggml_vec_norm_f32(nx, &xnorm, x);
|
18870
19048
|
ggml_vec_norm_f32(nx, &gnorm, g);
|
18871
19049
|
|
@@ -18923,7 +19101,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18923
19101
|
// ys = y^t \cdot s -> 1 / \rho.
|
18924
19102
|
// yy = y^t \cdot y.
|
18925
19103
|
//
|
18926
|
-
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]
|
19104
|
+
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
18927
19105
|
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
18928
19106
|
|
18929
19107
|
lm_ys[end[0]] = ys;
|
@@ -18986,13 +19164,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
|
|
18986
19164
|
.adam = {
|
18987
19165
|
.n_iter = 10000,
|
18988
19166
|
.sched = 1.000f,
|
18989
|
-
.decay = 0.
|
19167
|
+
.decay = 0.0f,
|
19168
|
+
.decay_min_ndim = 2,
|
18990
19169
|
.alpha = 0.001f,
|
18991
19170
|
.beta1 = 0.9f,
|
18992
19171
|
.beta2 = 0.999f,
|
18993
19172
|
.eps = 1e-8f,
|
18994
19173
|
.eps_f = 1e-5f,
|
18995
19174
|
.eps_g = 1e-3f,
|
19175
|
+
.gclip = 0.0f,
|
18996
19176
|
},
|
18997
19177
|
};
|
18998
19178
|
} break;
|
@@ -19042,23 +19222,13 @@ GGML_API void ggml_opt_init(
|
|
19042
19222
|
switch (opt->params.type) {
|
19043
19223
|
case GGML_OPT_ADAM:
|
19044
19224
|
{
|
19045
|
-
opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
19046
|
-
opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
19047
|
-
opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
19048
19225
|
opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
19049
19226
|
opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
19050
|
-
opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
19051
|
-
opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
19052
19227
|
opt->adam.pf = params.past > 0
|
19053
19228
|
? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
|
19054
19229
|
: NULL;
|
19055
|
-
ggml_set_zero(opt->adam.x);
|
19056
|
-
ggml_set_zero(opt->adam.g1);
|
19057
|
-
ggml_set_zero(opt->adam.g2);
|
19058
19230
|
ggml_set_zero(opt->adam.m);
|
19059
19231
|
ggml_set_zero(opt->adam.v);
|
19060
|
-
ggml_set_zero(opt->adam.mh);
|
19061
|
-
ggml_set_zero(opt->adam.vh);
|
19062
19232
|
if (opt->adam.pf) {
|
19063
19233
|
ggml_set_zero(opt->adam.pf);
|
19064
19234
|
}
|
@@ -19142,7 +19312,7 @@ enum ggml_opt_result ggml_opt_resume(
|
|
19142
19312
|
*gf = ggml_build_forward (f);
|
19143
19313
|
*gb = ggml_build_backward(ctx, gf, true);
|
19144
19314
|
|
19145
|
-
return ggml_opt_resume_g(ctx, opt, f, gf, gb);
|
19315
|
+
return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
|
19146
19316
|
}
|
19147
19317
|
|
19148
19318
|
enum ggml_opt_result ggml_opt_resume_g(
|
@@ -19150,7 +19320,9 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
19150
19320
|
struct ggml_opt_context * opt,
|
19151
19321
|
struct ggml_tensor * f,
|
19152
19322
|
struct ggml_cgraph * gf,
|
19153
|
-
struct ggml_cgraph * gb
|
19323
|
+
struct ggml_cgraph * gb,
|
19324
|
+
ggml_opt_callback callback,
|
19325
|
+
void * callback_data) {
|
19154
19326
|
|
19155
19327
|
// build forward + backward compute graphs
|
19156
19328
|
enum ggml_opt_result result = GGML_OPT_OK;
|
@@ -19158,11 +19330,11 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
19158
19330
|
switch (opt->params.type) {
|
19159
19331
|
case GGML_OPT_ADAM:
|
19160
19332
|
{
|
19161
|
-
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
|
19333
|
+
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
19162
19334
|
} break;
|
19163
19335
|
case GGML_OPT_LBFGS:
|
19164
19336
|
{
|
19165
|
-
result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
|
19337
|
+
result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
19166
19338
|
} break;
|
19167
19339
|
}
|
19168
19340
|
|
@@ -19394,7 +19566,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
19394
19566
|
////////////////////////////////////////////////////////////////////////////////
|
19395
19567
|
|
19396
19568
|
struct gguf_str {
|
19397
|
-
|
19569
|
+
uint64_t n; // GGUFv2
|
19398
19570
|
char * data;
|
19399
19571
|
};
|
19400
19572
|
|
@@ -19408,9 +19580,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
|
|
19408
19580
|
[GGUF_TYPE_FLOAT32] = sizeof(float),
|
19409
19581
|
[GGUF_TYPE_BOOL] = sizeof(bool),
|
19410
19582
|
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
|
19583
|
+
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
|
19584
|
+
[GGUF_TYPE_INT64] = sizeof(int64_t),
|
19585
|
+
[GGUF_TYPE_FLOAT64] = sizeof(double),
|
19411
19586
|
[GGUF_TYPE_ARRAY] = 0, // undefined
|
19412
19587
|
};
|
19413
|
-
static_assert(GGUF_TYPE_COUNT ==
|
19588
|
+
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
19414
19589
|
|
19415
19590
|
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
19416
19591
|
[GGUF_TYPE_UINT8] = "u8",
|
@@ -19423,8 +19598,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
|
19423
19598
|
[GGUF_TYPE_BOOL] = "bool",
|
19424
19599
|
[GGUF_TYPE_STRING] = "str",
|
19425
19600
|
[GGUF_TYPE_ARRAY] = "arr",
|
19601
|
+
[GGUF_TYPE_UINT64] = "u64",
|
19602
|
+
[GGUF_TYPE_INT64] = "i64",
|
19603
|
+
[GGUF_TYPE_FLOAT64] = "f64",
|
19426
19604
|
};
|
19427
|
-
static_assert(GGUF_TYPE_COUNT ==
|
19605
|
+
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
19428
19606
|
|
19429
19607
|
union gguf_value {
|
19430
19608
|
uint8_t uint8;
|
@@ -19434,6 +19612,9 @@ union gguf_value {
|
|
19434
19612
|
uint32_t uint32;
|
19435
19613
|
int32_t int32;
|
19436
19614
|
float float32;
|
19615
|
+
uint64_t uint64;
|
19616
|
+
int64_t int64;
|
19617
|
+
double float64;
|
19437
19618
|
bool bool_;
|
19438
19619
|
|
19439
19620
|
struct gguf_str str;
|
@@ -19441,7 +19622,7 @@ union gguf_value {
|
|
19441
19622
|
struct {
|
19442
19623
|
enum gguf_type type;
|
19443
19624
|
|
19444
|
-
|
19625
|
+
uint64_t n; // GGUFv2
|
19445
19626
|
void * data;
|
19446
19627
|
} arr;
|
19447
19628
|
};
|
@@ -19449,8 +19630,6 @@ union gguf_value {
|
|
19449
19630
|
struct gguf_kv {
|
19450
19631
|
struct gguf_str key;
|
19451
19632
|
|
19452
|
-
uint32_t n_bytes; // TODO: is this actually needed?
|
19453
|
-
|
19454
19633
|
enum gguf_type type;
|
19455
19634
|
union gguf_value value;
|
19456
19635
|
};
|
@@ -19458,15 +19637,15 @@ struct gguf_kv {
|
|
19458
19637
|
struct gguf_header {
|
19459
19638
|
uint32_t magic;
|
19460
19639
|
uint32_t version;
|
19461
|
-
|
19462
|
-
|
19640
|
+
uint64_t n_tensors; // GGUFv2
|
19641
|
+
uint64_t n_kv; // GGUFv2
|
19463
19642
|
};
|
19464
19643
|
|
19465
19644
|
struct gguf_tensor_info {
|
19466
19645
|
struct gguf_str name;
|
19467
19646
|
|
19468
19647
|
uint32_t n_dims;
|
19469
|
-
|
19648
|
+
uint64_t ne[GGML_MAX_DIMS];
|
19470
19649
|
|
19471
19650
|
enum ggml_type type;
|
19472
19651
|
|
@@ -19497,19 +19676,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
|
|
19497
19676
|
return n == size;
|
19498
19677
|
}
|
19499
19678
|
|
19500
|
-
|
19679
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19680
|
+
static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
|
19501
19681
|
p->n = 0;
|
19502
19682
|
p->data = NULL;
|
19503
19683
|
|
19504
19684
|
bool ok = true;
|
19505
19685
|
|
19506
|
-
// TODO: how to avoid mallocs for strings?
|
19507
19686
|
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
19508
19687
|
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
19509
19688
|
|
19510
19689
|
return ok;
|
19511
19690
|
}
|
19512
19691
|
|
19692
|
+
static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
|
19693
|
+
p->n = 0;
|
19694
|
+
p->data = NULL;
|
19695
|
+
|
19696
|
+
bool ok = true;
|
19697
|
+
|
19698
|
+
uint32_t n = 0;
|
19699
|
+
ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
|
19700
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
19701
|
+
|
19702
|
+
return ok;
|
19703
|
+
}
|
19704
|
+
|
19513
19705
|
struct gguf_context * gguf_init_empty(void) {
|
19514
19706
|
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
19515
19707
|
|
@@ -19565,8 +19757,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19565
19757
|
ctx->data = NULL;
|
19566
19758
|
|
19567
19759
|
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
19568
|
-
|
19569
|
-
|
19760
|
+
|
19761
|
+
if (ctx->header.version == 1) {
|
19762
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19763
|
+
uint32_t n_tensors = 0;
|
19764
|
+
uint32_t n_kv = 0;
|
19765
|
+
|
19766
|
+
ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
|
19767
|
+
ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
|
19768
|
+
|
19769
|
+
ctx->header.n_tensors = n_tensors;
|
19770
|
+
ctx->header.n_kv = n_kv;
|
19771
|
+
} else {
|
19772
|
+
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
19773
|
+
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
19774
|
+
}
|
19570
19775
|
|
19571
19776
|
if (!ok) {
|
19572
19777
|
fprintf(stderr, "%s: failed to read header\n", __func__);
|
@@ -19576,18 +19781,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19576
19781
|
}
|
19577
19782
|
}
|
19578
19783
|
|
19784
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19785
|
+
bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
|
19786
|
+
if (ctx->header.version == 1) {
|
19787
|
+
gguf_fread_str = gguf_fread_str_v1;
|
19788
|
+
}
|
19789
|
+
|
19579
19790
|
// read the kv pairs
|
19580
19791
|
{
|
19581
|
-
ctx->kv =
|
19792
|
+
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
19582
19793
|
|
19583
19794
|
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
19584
19795
|
struct gguf_kv * kv = &ctx->kv[i];
|
19585
19796
|
|
19586
19797
|
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
19587
19798
|
|
19588
|
-
ok = ok && gguf_fread_str(file, &kv->key,
|
19589
|
-
|
19590
|
-
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
19799
|
+
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
19800
|
+
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
19591
19801
|
|
19592
19802
|
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
19593
19803
|
|
@@ -19599,12 +19809,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19599
19809
|
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
19600
19810
|
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
19601
19811
|
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
19812
|
+
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
|
19813
|
+
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
|
19814
|
+
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
|
19602
19815
|
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
19603
19816
|
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
19604
19817
|
case GGUF_TYPE_ARRAY:
|
19605
19818
|
{
|
19606
19819
|
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
19607
|
-
|
19820
|
+
|
19821
|
+
if (ctx->header.version == 1) {
|
19822
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19823
|
+
uint32_t n = 0;
|
19824
|
+
ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
|
19825
|
+
kv->value.arr.n = n;
|
19826
|
+
} else {
|
19827
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
19828
|
+
}
|
19608
19829
|
|
19609
19830
|
switch (kv->value.arr.type) {
|
19610
19831
|
case GGUF_TYPE_UINT8:
|
@@ -19614,6 +19835,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19614
19835
|
case GGUF_TYPE_UINT32:
|
19615
19836
|
case GGUF_TYPE_INT32:
|
19616
19837
|
case GGUF_TYPE_FLOAT32:
|
19838
|
+
case GGUF_TYPE_UINT64:
|
19839
|
+
case GGUF_TYPE_INT64:
|
19840
|
+
case GGUF_TYPE_FLOAT64:
|
19617
19841
|
case GGUF_TYPE_BOOL:
|
19618
19842
|
{
|
19619
19843
|
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
@@ -19648,7 +19872,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19648
19872
|
|
19649
19873
|
// read the tensor infos
|
19650
19874
|
{
|
19651
|
-
ctx->infos =
|
19875
|
+
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
19652
19876
|
|
19653
19877
|
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19654
19878
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
@@ -19660,7 +19884,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19660
19884
|
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
19661
19885
|
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
19662
19886
|
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
19663
|
-
|
19887
|
+
if (ctx->header.version == 1) {
|
19888
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19889
|
+
uint32_t t = 0;
|
19890
|
+
ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
|
19891
|
+
info->ne[j] = t;
|
19892
|
+
} else {
|
19893
|
+
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
19894
|
+
}
|
19664
19895
|
}
|
19665
19896
|
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
19666
19897
|
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
@@ -19744,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19744
19975
|
|
19745
19976
|
struct ggml_tensor * data = NULL;
|
19746
19977
|
|
19747
|
-
if (params.no_alloc
|
19978
|
+
if (!params.no_alloc) {
|
19748
19979
|
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
|
19749
19980
|
|
19750
19981
|
ok = ok && data != NULL;
|
@@ -19785,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19785
20016
|
}
|
19786
20017
|
|
19787
20018
|
// point the data member to the appropriate location in the binary blob using the tensor infos
|
19788
|
-
if (params.no_alloc
|
20019
|
+
if (!params.no_alloc) {
|
19789
20020
|
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
|
19790
20021
|
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
|
19791
20022
|
}
|
@@ -19842,7 +20073,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|
19842
20073
|
}
|
19843
20074
|
}
|
19844
20075
|
|
19845
|
-
|
20076
|
+
free(ctx->kv);
|
19846
20077
|
}
|
19847
20078
|
|
19848
20079
|
if (ctx->infos) {
|
@@ -19854,7 +20085,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|
19854
20085
|
}
|
19855
20086
|
}
|
19856
20087
|
|
19857
|
-
|
20088
|
+
free(ctx->infos);
|
19858
20089
|
}
|
19859
20090
|
|
19860
20091
|
GGML_ALIGNED_FREE(ctx);
|
@@ -19954,6 +20185,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
|
19954
20185
|
return ctx->kv[i].value.float32;
|
19955
20186
|
}
|
19956
20187
|
|
20188
|
+
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
|
20189
|
+
return ctx->kv[i].value.uint64;
|
20190
|
+
}
|
20191
|
+
|
20192
|
+
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
|
20193
|
+
return ctx->kv[i].value.int64;
|
20194
|
+
}
|
20195
|
+
|
20196
|
+
double gguf_get_val_f64(struct gguf_context * ctx, int i) {
|
20197
|
+
return ctx->kv[i].value.float64;
|
20198
|
+
}
|
20199
|
+
|
19957
20200
|
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
19958
20201
|
return ctx->kv[i].value.bool_;
|
19959
20202
|
}
|
@@ -20000,7 +20243,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
|
20000
20243
|
const int n_kv = gguf_get_n_kv(ctx);
|
20001
20244
|
|
20002
20245
|
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
|
20003
|
-
ctx->kv[n_kv].key.n = strlen(key)
|
20246
|
+
ctx->kv[n_kv].key.n = strlen(key);
|
20004
20247
|
ctx->kv[n_kv].key.data = strdup(key);
|
20005
20248
|
ctx->header.n_kv++;
|
20006
20249
|
|
@@ -20056,6 +20299,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
|
|
20056
20299
|
ctx->kv[idx].value.float32 = val;
|
20057
20300
|
}
|
20058
20301
|
|
20302
|
+
void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
|
20303
|
+
const int idx = gguf_get_or_add_key(ctx, key);
|
20304
|
+
|
20305
|
+
ctx->kv[idx].type = GGUF_TYPE_UINT64;
|
20306
|
+
ctx->kv[idx].value.uint64 = val;
|
20307
|
+
}
|
20308
|
+
|
20309
|
+
void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
|
20310
|
+
const int idx = gguf_get_or_add_key(ctx, key);
|
20311
|
+
|
20312
|
+
ctx->kv[idx].type = GGUF_TYPE_INT64;
|
20313
|
+
ctx->kv[idx].value.int64 = val;
|
20314
|
+
}
|
20315
|
+
|
20316
|
+
void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
|
20317
|
+
const int idx = gguf_get_or_add_key(ctx, key);
|
20318
|
+
|
20319
|
+
ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
|
20320
|
+
ctx->kv[idx].value.float64 = val;
|
20321
|
+
}
|
20322
|
+
|
20059
20323
|
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
|
20060
20324
|
const int idx = gguf_get_or_add_key(ctx, key);
|
20061
20325
|
|
@@ -20067,7 +20331,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
|
|
20067
20331
|
const int idx = gguf_get_or_add_key(ctx, key);
|
20068
20332
|
|
20069
20333
|
ctx->kv[idx].type = GGUF_TYPE_STRING;
|
20070
|
-
ctx->kv[idx].value.str.n = strlen(val)
|
20334
|
+
ctx->kv[idx].value.str.n = strlen(val);
|
20071
20335
|
ctx->kv[idx].value.str.data = strdup(val);
|
20072
20336
|
}
|
20073
20337
|
|
@@ -20090,7 +20354,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
|
|
20090
20354
|
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
|
20091
20355
|
for (int i = 0; i < n; i++) {
|
20092
20356
|
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
20093
|
-
str->n = strlen(data[i])
|
20357
|
+
str->n = strlen(data[i]);
|
20094
20358
|
str->data = strdup(data[i]);
|
20095
20359
|
}
|
20096
20360
|
}
|
@@ -20106,6 +20370,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
|
20106
20370
|
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
|
20107
20371
|
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
|
20108
20372
|
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
|
20373
|
+
case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
|
20374
|
+
case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
|
20375
|
+
case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
|
20109
20376
|
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
|
20110
20377
|
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
|
20111
20378
|
case GGUF_TYPE_ARRAY:
|
@@ -20134,7 +20401,7 @@ void gguf_add_tensor(
|
|
20134
20401
|
const int idx = ctx->header.n_tensors;
|
20135
20402
|
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
|
20136
20403
|
|
20137
|
-
ctx->infos[idx].name.n = strlen(tensor->name)
|
20404
|
+
ctx->infos[idx].name.n = strlen(tensor->name);
|
20138
20405
|
ctx->infos[idx].name.data = strdup(tensor->name);
|
20139
20406
|
|
20140
20407
|
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
@@ -20267,6 +20534,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
|
20267
20534
|
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
|
20268
20535
|
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
|
20269
20536
|
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
|
20537
|
+
case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
|
20538
|
+
case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
|
20539
|
+
case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
|
20270
20540
|
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
|
20271
20541
|
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
|
20272
20542
|
case GGUF_TYPE_ARRAY:
|
@@ -20282,6 +20552,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
|
20282
20552
|
case GGUF_TYPE_UINT32:
|
20283
20553
|
case GGUF_TYPE_INT32:
|
20284
20554
|
case GGUF_TYPE_FLOAT32:
|
20555
|
+
case GGUF_TYPE_UINT64:
|
20556
|
+
case GGUF_TYPE_INT64:
|
20557
|
+
case GGUF_TYPE_FLOAT64:
|
20285
20558
|
case GGUF_TYPE_BOOL:
|
20286
20559
|
{
|
20287
20560
|
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
@@ -20516,6 +20789,14 @@ int ggml_cpu_has_sse3(void) {
|
|
20516
20789
|
#endif
|
20517
20790
|
}
|
20518
20791
|
|
20792
|
+
int ggml_cpu_has_ssse3(void) {
|
20793
|
+
#if defined(__SSSE3__)
|
20794
|
+
return 1;
|
20795
|
+
#else
|
20796
|
+
return 0;
|
20797
|
+
#endif
|
20798
|
+
}
|
20799
|
+
|
20519
20800
|
int ggml_cpu_has_vsx(void) {
|
20520
20801
|
#if defined(__POWER9_VECTOR__)
|
20521
20802
|
return 1;
|