llama_cpp 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
```diff
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -123,6 +126,8 @@ typedef void * thread_ret_t;
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
 #define GGML_SILU_FP16
+// #define GGML_CROSS_ENTROPY_EXP_FP16
+// #define GGML_FLASH_ATTN_EXP_FP16
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +162,6 @@ typedef void * thread_ret_t;
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
-#if UINTPTR_MAX == 0xFFFFFFFF
-    #define GGML_MEM_ALIGN 4
-#else
-    #define GGML_MEM_ALIGN 16
-#endif
-
 //
 // logging
 //
@@ -192,13 +191,19 @@ typedef void * thread_ret_t;
 //
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size)
-#define GGML_ALIGNED_FREE(ptr)
+#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef
-    int result =
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -218,8 +223,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     }
     return aligned_memory;
 }
-#define GGML_ALIGNED_MALLOC(size)
-#
+#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
+#define GGML_ALIGNED_FREE(ptr) free(ptr)
+#endif
 #endif
 
 #define UNUSED GGML_UNUSED
@@ -305,6 +314,10 @@ typedef double ggml_float;
 #endif
 #endif
 
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
-        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
-        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
-        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
-        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
-        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
-        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
-        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
-        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -2436,7 +2403,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q4_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -2445,6 +2411,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_0 * restrict x0 = &x[i + 0];
         const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2623,6 +2590,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     }
 
     // Main loop
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 2; i < nb; i+=2) {
         _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
         _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2680,6 +2648,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
+        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
```
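
The RISC-V vector paths added in this release (q4_0 above, plus q4_1, q5_0, q5_1, and q8_0 below) all share one reduction idiom: widen the 8-bit products to 16 bits with `vwmul`, then collapse them into a single 32-bit scalar with the widening reduction `vwredsum`. The sketch below is not from the package; it restates that idiom as a self-contained, strip-mined dot product. Note that the ggml kernels instead set `vl` once from `qk/2`, which assumes one quantization half-block fits in a single register group.

```c
#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Standalone restatement of the reduction idiom used by the new RVV paths:
// widen 8x8 -> 16-bit products with vwmul, then fold them into one 32-bit
// scalar with the widening reduction vwredsum. Unlike the ggml kernels,
// this version strip-mines, so it handles any length n.
int32_t dot_i8(const int8_t * a, const int8_t * b, size_t n) {
    int32_t sum = 0;
    while (n > 0) {
        size_t vl = __riscv_vsetvl_e8m1(n);            // elements this pass
        vint8m1_t  va   = __riscv_vle8_v_i8m1(a, vl);  // load int8 lanes
        vint8m1_t  vb   = __riscv_vle8_v_i8m1(b, vl);
        vint16m2_t prod = __riscv_vwmul_vv_i16m2(va, vb, vl); // widen products
        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, 1);        // accumulator seed
        vint32m1_t red  = __riscv_vwredsum_vs_i16m2_i32m1(prod, zero, vl);
        sum += __riscv_vmv_x_s_i32m1_i32(red);         // extract lane 0
        a += vl; b += vl; n -= vl;
    }
    return sum;
}
```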
```diff
@@ -2706,7 +2709,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q4_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
@@ -2718,6 +2720,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 
     float summs = 0;
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_1 * restrict x0 = &x[i + 0];
         const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2806,6 +2809,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -2832,7 +2867,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_0);
 
     const block_q5_0 * restrict x = vx;
@@ -2848,6 +2882,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_0 * restrict x0 = &x[i];
         const block_q5_0 * restrict x1 = &x[i + 1];
@@ -3040,6 +3075,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    uint32_t qh;
+
+    // These temp values are for masking and shift operations
+    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    uint32_t temp_2[16] = {0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80,
+                           0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+        // temporary registers
+        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
+        vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
+        vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
+        vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
+
+        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
+        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
+        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+
+        // ((qh & (1u << (j + 16))) >> (j + 12));
+        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
+        vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+
+        // narrowing
+        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
+        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
+        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+        // load
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
+        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -3072,7 +3177,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_1);
 
     const block_q5_1 * restrict x = vx;
@@ -3091,6 +3195,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_1 * restrict x0 = &x[i];
         const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3296,6 +3401,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    uint32_t qh;
+
+    // These temp values are for shift operations
+    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+        // temporary registers
+        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
+        vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
+
+        // load qh
+        vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+
+        // ((qh >> (j +  0)) << 4) & 0x10;
+        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
+        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+
+        // ((qh >> (j + 12))     ) & 0x10;
+        vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
+        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+
+        // narrowing
+        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
+        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
+        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+        // load
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -3328,7 +3499,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q8_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -3337,6 +3507,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q8_0 * restrict x0 = &x[i + 0];
         const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3407,6 +3578,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+    size_t vl = __riscv_vsetvl_e8m1(qk);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+
+        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -4107,16 +4298,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-
-
-
-
-
-    //     return tensor->ne[3]*tensor->nb[3]
-    //
-    //     is enough, but just in case, adding the second part
-
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+    size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+        nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+    }
+    return nbytes;
 }
 
 size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
```
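
The rewritten `ggml_nbytes` no longer assumes the last dimension carries the largest stride: it takes the byte offset of the final element and adds one innermost row. The helper below is an illustrative restatement, not the library function; `ne`/`nb` mirror `ggml_tensor`'s shape and stride fields.

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Illustrative re-statement of the new ggml_nbytes logic: the size of a
// (possibly padded) tensor is one full innermost row plus the byte offset
// contributed by the remaining dimensions. blck_size is 1 for f32
// (quantized types group ne[0] elements into blocks).
static size_t nbytes_from_strides(const int64_t ne[4], const size_t nb[4], int64_t blck_size) {
    size_t nbytes = (size_t)ne[0]*nb[0]/(size_t)blck_size;
    for (int i = 1; i < 4; ++i) {
        nbytes += (size_t)(ne[i] - 1)*nb[i];
    }
    return nbytes;
}

int main(void) {
    // a 4x3 f32 matrix whose rows are padded to a 32-byte stride:
    const int64_t ne[4] = {4, 3, 1, 1};
    const size_t  nb[4] = {4, 32, 96, 96};
    // 4*4 bytes for one row + 2 full row strides = 16 + 64 = 80
    printf("%zu\n", nbytes_from_strides(ne, nb, 1)); // prints 80
    return 0;
}
```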
```diff
@@ -4393,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
```
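
With this guard a zero-byte context is legal: `ggml_init` quietly rounds `mem_size` up to `GGML_MEM_ALIGN` instead of letting the aligned allocator hit its new zero-size warning path. A minimal usage sketch, assuming the `ggml_init_params` layout from this release's ggml.h:

```c
#include <stdbool.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,     // previously required a non-zero size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,  // tensor data managed elsewhere
    };
    struct ggml_context * ctx = ggml_init(params);
    // ... create tensors whose data is allocated outside the context ...
    ggml_free(ctx);
    return 0;
}
```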
```diff
@@ -4570,36 +4761,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         enum ggml_type type,
         int n_dims,
         const int64_t * ne,
-
+        struct ggml_tensor * view_src,
+        size_t view_offs) {
 
     assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
 
-
+    // find the base tensor and absolute offset
+    if (view_src != NULL && view_src->view_src != NULL) {
+        view_offs += view_src->view_offs;
+        view_src   = view_src->view_src;
+    }
 
-
-
-
-        data_size *= ne[i];
-    }
+    size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= ne[i];
     }
 
-
-
-
-
-
-
-        return NULL;
-    }
+    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+
+    void * data = view_src != NULL ? view_src->data : NULL;
+    if (data != NULL) {
+        data = (char *) data + view_offs;
+    }
 
-
+    size_t obj_alloc_size = 0;
+
+    if (view_src == NULL && !ctx->no_alloc) {
+        if (ctx->scratch.data != NULL) {
+            // allocate tensor data in the scratch buffer
+            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+                assert(false);
+                return NULL;
+            }
 
-
+            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
 
-
+            ctx->scratch.offs += data_size;
+        } else {
+            // allocate tensor data in the context's memory pool
+            obj_alloc_size = data_size;
+        }
     }
 
-    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE +
+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
 
     // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
 
@@ -4619,7 +4825,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.
+        /*.view_src     =*/ view_src,
+        /*.view_offs    =*/ view_offs,
+        /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
         /*.padding      =*/ { 0 },
@@ -4643,28 +4851,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     return result;
 }
 
-static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
-    assert(params_size <= GGML_MAX_OP_PARAMS);
-    memcpy(tensor->op_params, params, params_size);
-}
-
-static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    return ((const int32_t *)(tensor->op_params))[i];
-}
-
-static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    ((int32_t *)(tensor->op_params))[i] = value;
-}
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
         const int64_t * ne) {
-    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
+    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
@@ -4729,7 +4921,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return
+    return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+}
+
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
 }
 
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5015,14 +5223,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
 
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src
+        struct ggml_tensor * src) {
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
     ggml_format_name(result, "%s (view)", src->name);
 
-
-
-
-    result->nb[3] = src->nb[3];
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = src->nb[i];
+    }
 
     return result;
 }
@@ -5280,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5323,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node
+        GGML_ASSERT(!is_node);
    }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5595,7 +5802,7 @@ struct ggml_tensor * ggml_repeat_back(
 
 // ggml_concat
 
-struct ggml_tensor* ggml_concat(
+struct ggml_tensor * ggml_concat(
     struct ggml_context* ctx,
     struct ggml_tensor* a,
     struct ggml_tensor* b) {
@@ -5862,7 +6069,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
 struct ggml_tensor * ggml_rms_norm_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b
+        struct ggml_tensor  * b,
+        float eps) {
     bool is_node = false;
 
     if (a->grad) {
@@ -5872,6 +6080,8 @@ struct ggml_tensor * ggml_rms_norm_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
+    ggml_set_op_params(result, &eps, sizeof(eps));
+
     result->op   = GGML_OP_RMS_NORM_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
```
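
`ggml_rms_norm_back` now takes `eps` explicitly and stows it in the result tensor's `op_params` bytes; the compute kernel later copies it back out (see the `rms_norm_back_f32` hunk further down). The round-trip is plain byte copying, as this standalone sketch shows; `fake_tensor` and `MAX_OP_PARAMS` are stand-ins for the real `ggml_tensor` and `GGML_MAX_OP_PARAMS`:

```c
#include <stdio.h>
#include <string.h>

#define MAX_OP_PARAMS 32   // stand-in for GGML_MAX_OP_PARAMS

struct fake_tensor { char op_params[MAX_OP_PARAMS]; };

int main(void) {
    struct fake_tensor t = {0};
    float eps = 1e-6f;
    memcpy(t.op_params, &eps, sizeof(eps));   // ggml_set_op_params(result, &eps, sizeof(eps))
    float out;
    memcpy(&out, t.op_params, sizeof(float)); // read-back in the compute kernel
    printf("%g\n", out); // 1e-06
    return 0;
}
```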
```diff
@@ -6201,7 +6411,7 @@ struct ggml_tensor * ggml_reshape(
         //GGML_ASSERT(false);
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
@@ -6225,7 +6435,7 @@ struct ggml_tensor * ggml_reshape_1d(
     }
 
     const int64_t ne[1] = { ne0 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
@@ -6250,7 +6460,7 @@ struct ggml_tensor * ggml_reshape_2d(
     }
 
     const int64_t ne[2] = { ne0, ne1 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
@@ -6276,7 +6486,7 @@ struct ggml_tensor * ggml_reshape_3d(
     }
 
     const int64_t ne[3] = { ne0, ne1, ne2 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
@@ -6286,7 +6496,6 @@ struct ggml_tensor * ggml_reshape_3d(
     return result;
 }
 
-
 struct ggml_tensor * ggml_reshape_4d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -6304,7 +6513,7 @@ struct ggml_tensor * ggml_reshape_4d(
     }
 
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
@@ -6314,46 +6523,40 @@ struct ggml_tensor * ggml_reshape_4d(
     return result;
 }
 
-
-
-static struct ggml_tensor * ggml_view_tensor_offset(
+static struct ggml_tensor * ggml_view_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_dims,
         const int64_t       * ne,
         size_t                offset) {
-    // don't calculate an offset from an unallocated tensor
-    void * data = NULL;
-    if (a->data != NULL) {
-        data = (char *) a->data + offset;
-    }
 
-
+    bool is_node = false;
+
+    if (a->grad) {
+        is_node = true;
+    }
 
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
     ggml_format_name(result, "%s (view)", a->name);
 
     ggml_set_op_params(result, &offset, sizeof(offset));
 
+    result->op   = GGML_OP_VIEW;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
     return result;
 }
 
+// ggml_view_1d
+
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int64_t               ne0,
         size_t                offset) {
 
-
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
-
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
 
     return result;
 }
```
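
All the view constructors now funnel into `ggml_view_impl`, and each tensor records its `view_src`/`view_offs`. One consequence worth noting: a view of a view resolves to the base tensor with a single absolute byte offset. A hedged sketch (offsets in ggml views are bytes, not elements):

```c
#include "ggml.h"

// Sketch: nested views now collapse onto the base tensor. Assumes a live
// context such as the one from the earlier ggml_init example.
void view_of_view_demo(struct ggml_context * ctx) {
    struct ggml_tensor * base = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * v1   = ggml_view_1d(ctx, base, 32, 16*sizeof(float));
    struct ggml_tensor * v2   = ggml_view_1d(ctx, v1,    8,  4*sizeof(float));
    // v2->view_src is base (not v1) and v2->view_offs is 20*sizeof(float),
    // because ggml_new_tensor_impl walks view_src->view_src and accumulates
    // view_offs before recording them.
    (void) v2;
}
```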
```diff
@@ -6368,24 +6571,14 @@ struct ggml_tensor * ggml_view_2d(
         size_t                nb1,
         size_t                offset) {
 
-
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
+    const int64_t ne[2] = { ne0, ne1 };
 
-    struct ggml_tensor * result =
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
     result->nb[3] = result->nb[2];
 
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
@@ -6401,24 +6594,14 @@ struct ggml_tensor * ggml_view_3d(
         size_t                nb2,
         size_t                offset) {
 
-
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+    const int64_t ne[3] = { ne0, ne1, ne2 };
 
-    struct ggml_tensor * result =
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = result->nb[2]*ne2;
 
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
@@ -6436,24 +6619,14 @@ struct ggml_tensor * ggml_view_4d(
         size_t                nb3,
         size_t                offset) {
 
-
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
 
-    struct ggml_tensor * result =
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = nb3;
 
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
@@ -6640,7 +6813,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past
+    int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_INF;
@@ -6657,7 +6830,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
     return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
 }
 
-
 struct ggml_tensor * ggml_diag_mask_inf_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -6680,7 +6852,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past
+    int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_ZERO;
@@ -7097,11 +7269,13 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
     };
 
     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, stride);
+
     result->op = GGML_OP_CONV_TRANSPOSE_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
-    result->src[2] = ggml_new_i32(ctx, stride);
 
     return result;
 }
```
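
The transpose-convolution stride used to occupy a whole scalar tensor in `result->src[2]`; it now rides in the `op_params` words via the `ggml_set_op_params_i32`/`ggml_get_op_params_i32` pair relocated earlier in this diff. The caller-facing API is unchanged. A sketch with made-up shapes, assuming the `ggml_conv_transpose_2d_p0(ctx, a, b, stride)` signature from this release's ggml.h:

```c
#include "ggml.h"

// a (the kernel) must be f16 and b (the input) f32, per the asserts in the
// compute kernel. Shapes here are hypothetical, chosen so that the kernel's
// input-channel count (4) matches the input tensor's channel dimension.
void conv_transpose_demo(struct ggml_context * ctx) {
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, 8, 4);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 4, 1);
    struct ggml_tensor * out    = ggml_conv_transpose_2d_p0(ctx, kernel, input, /*stride=*/2);
    (void) out; // out->src[2] is now NULL; the stride travels in op_params
}
```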
```diff
@@ -9446,6 +9620,8 @@ static void ggml_compute_forward_div_f32(
 
 
 #ifdef GGML_USE_ACCELERATE
+            UNUSED(ggml_vec_div_f32);
+
             vDSP_vdiv(
                     (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
                     (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -10752,7 +10928,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
 
     GGML_TENSOR_BINARY_OP_LOCALS;
 
-
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -11930,8 +12107,8 @@ static void ggml_compute_forward_diag_mask_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int n_past =
-    const bool inplace =
+    const int  n_past  = ((int32_t *) dst->op_params)[0];
+    const bool inplace = src0->data == dst->data;
 
     GGML_ASSERT(n_past >= 0);
 
@@ -12142,6 +12319,7 @@ static void ggml_compute_forward_soft_max_back_f32(
     // dx = J * dy
     // dxk = sum_i(Jki * dyi)
     // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+    // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
     // dxk = sum_i(-yk*yi * dyi) + yk*dyk
     // dxk = -yk * sum_i(yi * dyi) + yk*dyk
     // dxk = -yk * dot(y, dy) + yk*dyk
```
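
The inserted comment line supplies the missing algebra step in the softmax-backward derivation. In standard notation the same derivation reads:

```latex
% y = softmax(x); Jacobian and backward pass, matching the comment block:
J_{ki} = \frac{\partial y_k}{\partial x_i} = y_k\,(\delta_{ki} - y_i),
\qquad
dx_k = \sum_i J_{ki}\,dy_i
     = y_k\,dy_k - y_k \sum_i y_i\,dy_i
     = y_k\,\bigl(dy_k - y\cdot dy\bigr).
```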
```diff
@@ -13497,7 +13675,6 @@ static void ggml_compute_forward_conv_transpose_2d(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        const struct ggml_tensor * opt0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13557,7 +13734,7 @@ static void ggml_compute_forward_conv_transpose_2d(
         return;
     }
 
-    const int32_t stride = (
+    const int32_t stride = ggml_get_op_params_i32(dst, 0);
 
     // total patches in dst
     const int np = ne2;
@@ -13570,7 +13747,7 @@ static void ggml_compute_forward_conv_transpose_2d(
     const int ip1 = MIN(ip0 + dp, np);
 
     ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src =
+    ggml_fp16_t * const wdata_src = wdata + nk;
 
     for (int i2 = ip0; i2 < ip1; i2++) { // Cout
         float * dst_data = (float *)((char *) dst->data + i2*nb2);
@@ -13582,9 +13759,8 @@ static void ggml_compute_forward_conv_transpose_2d(
                 for (int i00 = 0; i00 < ne00; i00++) {
                     float v = 0;
                     ggml_vec_dot_f16(ne03, &v,
-
-
-
+                            wdata_src + i1n,
+                            wdata_kernel + i01*ne00*ne03 + i00*ne03);
                     dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
                 }
             }
@@ -13934,7 +14110,7 @@ static void ggml_compute_forward_flash_attn_f32(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
                 ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
                 for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13944,9 +14120,13 @@ static void ggml_compute_forward_flash_attn_f32(
                         if (SS[j] == -INFINITY) {
                             SS[j] = 0.0f;
                         } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                            const float val = expf(SS[j] - max);
+#else
                             ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                             memcpy(&scvt[j], &s, sizeof(uint16_t));
                             const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                             sump[j] += (ggml_float)val;
                             SS[j] = val;
                         }
@@ -14524,7 +14704,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                 vvexpf(SM, SM, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
                 ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
                 for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -14535,9 +14715,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
                         if (SR[j] == -INFINITY) {
                             SW[j] = 0.0f;
                         } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                            const float val = expf(SR[j] - max);
+#else
                             ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
                             memcpy(&scvt[j], &s, sizeof(uint16_t));
                             const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                             sump[j] += (ggml_float)val;
                             SW[j] = val;
                         }
@@ -15275,6 +15459,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
 
+    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
+
     if (params->type == GGML_TASK_INIT) {
         if (ith == 0) {
             memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -15286,7 +15472,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         if (ith == 0) {
             float * dp = (float *) dst->data;
             ggml_vec_sum_f32(nth, dp, sums);
-            dp[0] *= -1.0f;
+            dp[0] *= -1.0f / (float) nr;
         }
         return;
     }
```
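
Scaling `dp[0]` by `-1/nr` turns the summed negative log-likelihood into a mean over the `nr` rows; the backward kernel below divides its gradient by `nr` to match. In formulas, with `t` the target rows (src1), `s` the logit rows (src0), and each target row assumed to sum to 1:

```latex
\mathcal{L} = -\frac{1}{n_r}\sum_{r=1}^{n_r}\sum_{i} t_{r,i}\,\log\,\mathrm{softmax}(s_r)_i,
\qquad
\frac{\partial \mathcal{L}}{\partial s_{r,i}}
  = \frac{\mathrm{softmax}(s_r)_i - t_{r,i}}{n_r}.
```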
```diff
@@ -15303,7 +15489,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float * st = (float *) params->wdata + nth + ith*nc;
+        float * st = ((float *) params->wdata) + nth + ith*nc;
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -15318,15 +15504,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             float max = -INFINITY;
             ggml_vec_max_f32(nc, &max, s0);
 
-            uint16_t scvt;
+            uint16_t scvt; UNUSED(scvt);
             for (int i = 0; i < nc; i++) {
                 if (s0[i] == -INFINITY) {
                     st[i] = 0.0f;
                 } else {
-
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+                    const float s = s0[i] - max;
+                    const float val = expf(s);
+#else
                     ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                     memcpy(&scvt, &s, sizeof(scvt));
                     const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
                     sum += (ggml_float)val;
                     st[i] = val;
                 }
@@ -15342,7 +15532,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         ggml_vec_log_f32(nc, st, st);
         ggml_vec_mul_f32(nc, st, st, s1);
 
-
+        float st_sum = 0;
+        ggml_vec_sum_f32(nc, &st_sum, st);
+        sums[ith] += st_sum;
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -15392,7 +15584,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
         return;
     }
 
-    const
+    const double eps = 1e-9;
 
     // TODO: handle transposed/permuted matrices
     const int64_t nc = src0->ne[0];
@@ -15411,7 +15603,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
         float * ds0 = (float *)((char *) dst->data  + i1*dst->nb[1]);
         float * s0  = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float * sm  = (float *) params->wdata + ith*nc;
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -15420,54 +15611,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
             assert(!isnan(s1[i]));
         }
 #endif
-        // step by step explanation:
-        {
-            //float * sums = (float *) params->wdata;
-
-            // forward pass with annotated gradients from backward pass
-            // (built by going in reverse operation order, adding to gradients of current operation args)
-            // st0 = exp(s0-max(s0))                      grad[st0] = grad[st1]*(1.0 - eps)/sum
-            // from softmax_back:                         grad[s0]  = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
-            // ggml_vec_scale_f32(nc, st, sum);           // st1 = st0*/sum = softmax(s0)  grad[st1] = grad[st2]*(1.0 - eps)
-            // ggml_vec_scale_f32(nc, st, (1.0f - eps));  // st2 = st1*(1.0 - eps)         grad[st2] = grad[st3]
-            // ggml_vec_add1_f32(nc, st, st, eps);        // st3 = st2 + eps               grad[st3] = grad[st4]/st3
-            // ggml_vec_log_f32(nc, st, st);              // st4 = log(st3)                grad[st4] = grad[st5] * s1
-            // ggml_vec_mul_f32(nc, st, st, s1);          // st5 = st4 * s1                grad[st5] = grad[sums[ith]]
-            // ggml_vec_sum_f32(nc, sums + ith, st);      // sums[ith] = st5               grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
-
-            // substitute into grad[st1], because we can reuse softmax_back from this point on
-            // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
-            // postorder:
-            // grad[st1] := softmax(s0)
-            // grad[st1] := grad[st1]*(1.0 - eps)
-            // grad[st1] := grad[st1] + eps
-            // grad[st1] := s1 / grad[st1]
-            // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
-
-            // src0 gradients by going through softmax_back
-            // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
-            // from softmax_back:
-            // dxk = yk * (dyk - dot(y, dy))
-            // dot_y_dy := dot(y, dy)
-            // dx := dy
-            // dx := dx - dot_y_dy
-            // dx := dx * y
-            // postorder:
-            // dot_st1_dst1 := dot(st1, grad[st1])
-            // grad[s0] := grad[st1]
-            // grad[s0] := grad[s0] - dot_st1_dst1
-            // grad[s0] := grad[s0] * st1
-
-            // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
-            // sm           := softmax(s0)
-            // grad[s0]     := sm*(1.0 - eps)
-            // grad[s0]     := grad[s0] + eps
-            // grad[s0]     := s1 / grad[s0]
-            // grad[s0]     := grad[s0]*(1.0-eps)*-grad[cel]
-            // dot_st1_dst1 := dot(sm, grad[s0])
-            // grad[s0]     := grad[s0] - dot_st1_dst1
-            // grad[s0]     := grad[s0] * sm
-        }
 
         // soft_max
         ggml_float sum = 0.0;
@@ -15475,39 +15618,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
             float max = -INFINITY;
             ggml_vec_max_f32(nc, &max, s0);
 
-            uint16_t scvt;
+            uint16_t scvt; UNUSED(scvt);
             for (int i = 0; i < nc; i++) {
                 if (s0[i] == -INFINITY) {
-
+                    ds0[i] = 0.0f;
                 } else {
-
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+                    const float s = s0[i] - max;
+                    const float val = expf(s);
+#else
                     ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                     memcpy(&scvt, &s, sizeof(scvt));
                     const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
                     sum += (ggml_float)val;
-
+                    ds0[i] = val;
                 }
             }
 
             assert(sum > 0.0);
-            sum = 1.0/sum;
+            sum = (1.0 - eps)/sum;
         }
 
-
-        ggml_vec_scale_f32(nc,
-
-
-
-
-        ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
-        ggml_vec_dot_f32  (nc, &dot_st1_dst1, sm, ds0);
-        ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
-        ggml_vec_mul_f32  (nc, ds0, ds0, sm);
+        // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
+        ggml_vec_scale_f32(nc, ds0, sum);
+        ggml_vec_add1_f32(nc, ds0, ds0, eps);
+        ggml_vec_sub_f32(nc, ds0, ds0, s1);
+        ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
+
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
-            assert(!isnan(sm[i]));
-            assert(!isinf(sm[i]));
             assert(!isnan(ds0[i]));
             assert(!isinf(ds0[i]));
         }
@@ -15731,7 +15872,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
-                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor
+                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
        case GGML_OP_POOL_1D:
            {
@@ -16062,9 +16203,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
+                    float eps;
+                    memcpy(&eps, tensor->op_params, sizeof(float));
+
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rms_norm_back(ctx, src0, tensor->grad),
+                            ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
                             inplace);
                 }
             } break;
@@ -16832,9 +16976,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     return result;
 }
 
-
-    struct ggml_cgraph result = *gf;
-
+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
     GGML_ASSERT(gf->n_nodes > 0);
 
     // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16858,15 +17000,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
         }
     }
 
-    for (int i =
+    for (int i = 0; i < gf->n_nodes; i++) {
         struct ggml_tensor * node = gf->nodes[i];
 
         if (node->is_param) {
             GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_expand(
+            ggml_build_forward_expand(gb, node->grad);
         }
     }
+}
 
+struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
+    struct ggml_cgraph result = *gf;
+    ggml_build_backward_expand(ctx, gf, &result, keep);
     return result;
 }
 
```
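
`ggml_build_backward` is now a thin wrapper: the work moved into `ggml_build_backward_expand`, which writes the gradient graph into a caller-provided cgraph instead of returning one by value. A usage sketch based on the signatures visible in this hunk:

```c
#include "ggml.h"

void train_step_graphs(struct ggml_context * ctx, struct ggml_tensor * loss) {
    struct ggml_cgraph gf = ggml_build_forward(loss);
    struct ggml_cgraph gb = gf; // start from a copy of the forward graph
    ggml_build_backward_expand(ctx, &gf, &gb, /*keep=*/true);
    // gf computes the loss; gb additionally computes the gradients of every
    // parameter tensor (those marked with ggml_set_param).
}
```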
```diff
@@ -17542,10 +17688,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
                     {
                         n_tasks = n_threads;
-
-                        size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
-
-                        work_size = MAX(work_size, cur);
                     } break;
                 case GGML_OP_NONE:
                     {
```
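
The remaining hunks rework the Adam optimizer: `ggml_opt_adam` gains a progress callback plus user pointer, drops the `x`/`g1`/`g2`/`mh`/`vh` scratch tensors, reads `gclip` and `decay_min_ndim` from the params, and runs the graph through an explicit `ggml_cplan` work buffer. The callback typedef itself is not shown in this diff; judging from the call site `callback(callback_data, &sched)` it receives the opaque user pointer and a mutable schedule multiplier. A hypothetical decay callback:

```c
#include <stdio.h>

// Hypothetical ggml_opt_callback: halve the Adam schedule multiplier every
// 100 evaluations. The shape below is inferred from the call site shown in
// the next hunk, not from a published typedef.
struct decay_state { int calls; };

static void decay_callback(void * data, float * sched) {
    struct decay_state * st = (struct decay_state *) data;
    if (++st->calls % 100 == 0) {
        *sched *= 0.5f;
        printf("eval %d: sched -> %g\n", st->calls, *sched);
    }
}
// threaded through as the new trailing arguments of ggml_opt_adam:
//   ggml_opt_adam(..., gf, gb, decay_callback, &state);
```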
@@ -18423,14 +18565,16 @@ static enum ggml_opt_result ggml_opt_adam(
         struct ggml_opt_params params,
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb
+        struct ggml_cgraph * gb,
+        ggml_opt_callback callback,
+        void * callback_data) {
     GGML_ASSERT(ggml_is_scalar(f));
 
     // these will store the parameters we want to optimize
     struct ggml_tensor * ps[GGML_MAX_PARAMS];
 
     int np = 0;
-
+    int64_t nx = 0;
     for (int i = 0; i < gf->n_nodes; ++i) {
         if (gf->nodes[i]->is_param) {
             GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -18449,31 +18593,32 @@ static enum ggml_opt_result ggml_opt_adam(
     }
 
     // constants
-
-    const float
-    const float
+    float sched = params.adam.sched;
+    const float alpha = params.adam.alpha;
+    const float decay = params.adam.decay * alpha;
     const float beta1 = params.adam.beta1;
     const float beta2 = params.adam.beta2;
     const float eps   = params.adam.eps;
+    const float gclip = params.adam.gclip;
+    const int decay_min_ndim = params.adam.decay_min_ndim;
 
-    float * x  = opt->adam.x->data;  // view of the parameters
-    float * g1 = opt->adam.g1->data; // gradient
-    float * g2 = opt->adam.g2->data; // gradient squared
     float * m  = opt->adam.m->data;  // first moment
     float * v  = opt->adam.v->data;  // second moment
-    float * mh = opt->adam.mh->data; // first moment hat
-    float * vh = opt->adam.vh->data; // second moment hat
 
     float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
 
-
-
+    if (callback) {
+        callback(callback_data, &sched);
+    }
 
     // compute the function value
     ggml_graph_reset (gf);
     ggml_set_f32     (f->grad, 1.0f);
 
-
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+    ggml_graph_compute(gb, &cplan);
 
     opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
     opt->adam.fx_best = opt->adam.fx_prev;
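
The callback threaded through above is invoked with a mutable pointer to the schedule multiplier before every evaluation, so learning-rate schedules can live entirely in caller code. One possible callback, sketched under the assumption that the caller owns a step counter (names are hypothetical, not from this diff):

    // linear warm-up: ramp the schedule multiplier over the first 100 steps
    static void warmup_callback(void * data, float * sched) {
        int * step = (int *) data;   // caller-owned step counter
        *sched = *step < 100 ? (float) *step / 100.0f : 1.0f;
        (*step)++;
    }
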
@@ -18481,6 +18626,9 @@ static enum ggml_opt_result ggml_opt_adam(
         pf[opt->iter % params.past] = opt->adam.fx_prev;
     }
 
+    opt->loss_before = opt->adam.fx_prev;
+    opt->loss_after  = opt->adam.fx_prev;
+
     // initialize
     if (opt->just_initialized) {
         opt->adam.n_no_improvement = 0;
@@ -18513,50 +18661,55 @@ static enum ggml_opt_result ggml_opt_adam(
         UNUSED(t_start_cpu);
 
         {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            float gnorm = 1.0f;
+            if (gclip > 0.0f) {
+                // gradient clipping
+                ggml_float sum = 0.0;
+                for (int p = 0; p < np; ++p) {
+                    const int64_t ne = ggml_nelements(ps[p]);
+                    for (int64_t j = 0; j < ne; ++j) {
+                        float g = ggml_get_f32_1d(ps[p]->grad, j);
+                        sum += (ggml_float)(g*g);
+                    }
+                }
+                ggml_float norm = sqrt(sum);
+                if (norm > (ggml_float) gclip) {
+                    gnorm = (float) ((ggml_float) gclip / norm);
+                }
+            }
+            const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
+            const float beta2h =        1.0f/(1.0f - powf(beta2, opt->iter));
+            int64_t i = 0;
+            for (int p = 0; p < np; ++p) {
+                const int64_t ne = ggml_nelements(ps[p]);
+                const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+                for (int64_t j = 0; j < ne; ++j) {
+                    float x = ggml_get_f32_1d(ps[p], j);
+                    float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
+                    m[i] = m[i]*beta1 +    g*(1.0f - beta1);
+                    v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
+                    float mh = m[i]*beta1h;
+                    float vh = v[i]*beta2h;
+                    vh = sqrtf(vh) + eps;
+                    x  = x*(1.0f - p_decay) - mh/vh;
+                    ggml_set_f32_1d(ps[p], j, x);
+                    ++i;
+                }
+            }
+        }
 
-
-
+        if (callback) {
+            callback(callback_data, &sched);
         }
 
         ggml_graph_reset  (gf);
         ggml_set_f32      (f->grad, 1.0f);
 
-
+        ggml_graph_compute(gb, &cplan);
 
         const float fx = ggml_get_f32_1d(f, 0);
+        opt->loss_after = fx;
+
 
         // check convergence
         if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
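
For reference, the fused loop above is AdamW with an optional global-norm gradient clip. With t = opt->iter, \lambda = p_decay, and g pre-scaled by \min(1, \text{gclip}/\lVert g\rVert_2), each scalar parameter follows

    m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g^2

    x_t = x_{t-1}\,(1-\lambda) - \alpha \cdot \text{sched} \cdot \frac{m_t/(1-\beta_1^t)}{\sqrt{v_t/(1-\beta_2^t)} + \epsilon}

i.e. the bias correction is folded into beta1h and beta2h rather than materialized in the mh/vh tensors that this version removes.
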
@@ -18625,7 +18778,6 @@ struct ggml_lbfgs_iteration_data {
 };
 
 static enum ggml_opt_result linesearch_backtracking(
-        struct ggml_context * ctx,
         const struct ggml_opt_params * params,
         int nx,
         float * x,
@@ -18637,8 +18789,11 @@ static enum ggml_opt_result linesearch_backtracking(
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
         struct ggml_cgraph * gb,
+        struct ggml_cplan  * cplan,
         const int np,
-        struct ggml_tensor * ps[]
+        struct ggml_tensor * ps[],
+        ggml_opt_callback callback,
+        void * callback_data) {
     int count = 0;
 
     float width = 0.0f;
@@ -18667,6 +18822,12 @@ static enum ggml_opt_result linesearch_backtracking(
     dgtest = params->lbfgs.ftol*dginit;
 
     while (true) {
+        if (callback) {
+            // LBFG-S does not support learning rate -> ignore learning schedule
+            float sched = 0;
+            callback(callback_data, &sched);
+        }
+
         ggml_vec_cpy_f32(nx, x, xp);
         ggml_vec_mad_f32(nx, x, d, *step);
 
@@ -18677,7 +18838,7 @@ static enum ggml_opt_result linesearch_backtracking(
         ggml_graph_reset (gf);
         ggml_set_f32     (f->grad, 1.0f);
 
-
+        ggml_graph_compute(gb, cplan);
 
         ggml_opt_get_grad(np, ps, g);
 
@@ -18737,7 +18898,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         struct ggml_opt_params params,
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb
+        struct ggml_cgraph * gb,
+        ggml_opt_callback callback,
+        void * callback_data) {
     if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
         params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
         if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -18769,6 +18932,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         opt->iter = iter;
     }
 
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+
     float * x  = opt->lbfgs.x->data;  // current parameters
     float * xp = opt->lbfgs.xp->data; // previous parameters
     float * g  = opt->lbfgs.g->data;  // current gradient
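
As in the Adam path, the L-BFGS path now sizes its scratch memory through the explicit plan API instead of an implicit work buffer. Outside the optimizers, where no ggml context object is at hand, the same pattern can be written with a caller-owned buffer (a sketch under that assumption; 'graph' and 'n_threads' are placeholders):

    struct ggml_cplan cplan = ggml_graph_plan(graph, n_threads);
    uint8_t * work = NULL;
    if (cplan.work_size > 0) {
        work = malloc(cplan.work_size); // caller-owned scratch instead of a ctx object
        cplan.work_data = work;
    }
    ggml_graph_compute(graph, &cplan);
    free(work);
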
@@ -18790,6 +18957,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     float * lm_s = opt->lbfgs.lms->data;
     float * lm_y = opt->lbfgs.lmy->data;
 
+    if (callback) {
+        // LBFG-S does not support learning rate -> ignore learning schedule
+        float sched = 0;
+        callback(callback_data, &sched);
+    }
+
     // evaluate the function value and its gradient
     {
         ggml_opt_set_params(np, ps, x);
@@ -18797,11 +18970,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_graph_reset (gf);
         ggml_set_f32     (f->grad, 1.0f);
 
-
+        ggml_graph_compute(gb, &cplan);
 
         ggml_opt_get_grad(np, ps, g);
 
         fx = ggml_get_f32_1d(f, 0);
+
+        opt->loss_before = fx;
+        opt->loss_after  = fx;
     }
 
     // search direction = -gradient
@@ -18856,7 +19032,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
 
-        ls = linesearch_backtracking(
+        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
 
         if (ls < 0) {
             // linesearch failed - go back to the previous point and return
@@ -18866,6 +19042,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             return ls;
         }
 
+        opt->loss_after = fx;
+
         ggml_vec_norm_f32(nx, &xnorm, x);
         ggml_vec_norm_f32(nx, &gnorm, g);
 
@@ -18923,7 +19101,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         // ys = y^t \cdot s -> 1 / \rho.
         // yy = y^t \cdot y.
         //
-        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]
+        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
         ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
 
         lm_ys[end[0]] = ys;
@@ -18986,13 +19164,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
             .adam = {
                 .n_iter = 10000,
                 .sched  = 1.000f,
-                .decay  = 0.
+                .decay  = 0.0f,
+                .decay_min_ndim = 2,
                 .alpha  = 0.001f,
                 .beta1  = 0.9f,
                 .beta2  = 0.999f,
                 .eps    = 1e-8f,
                 .eps_f  = 1e-5f,
                 .eps_g  = 1e-3f,
+                .gclip  = 0.0f,
             },
         };
     } break;
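
A short sketch of how the new fields compose with caller overrides (the values here are illustrative, not defaults):

    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
    params.adam.decay = 0.1f; // decoupled weight decay; only applied to tensors
                              // with n_dims >= decay_min_ndim (2 by default)
    params.adam.gclip = 1.0f; // clip the global gradient norm at 1.0 (0.0f = off)
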
@@ -19042,23 +19222,13 @@ GGML_API void ggml_opt_init(
     switch (opt->params.type) {
         case GGML_OPT_ADAM:
             {
-                opt->adam.x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                 opt->adam.m  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                 opt->adam.v  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                 opt->adam.pf = params.past > 0
                     ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
                     : NULL;
-                ggml_set_zero(opt->adam.x);
-                ggml_set_zero(opt->adam.g1);
-                ggml_set_zero(opt->adam.g2);
                 ggml_set_zero(opt->adam.m);
                 ggml_set_zero(opt->adam.v);
-                ggml_set_zero(opt->adam.mh);
-                ggml_set_zero(opt->adam.vh);
                 if (opt->adam.pf) {
                     ggml_set_zero(opt->adam.pf);
                 }
@@ -19142,7 +19312,7 @@ enum ggml_opt_result ggml_opt_resume(
     *gf = ggml_build_forward (f);
     *gb = ggml_build_backward(ctx, gf, true);
 
-    return ggml_opt_resume_g(ctx, opt, f, gf, gb);
+    return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
 }
 
 enum ggml_opt_result ggml_opt_resume_g(
@@ -19150,7 +19320,9 @@ enum ggml_opt_result ggml_opt_resume_g(
         struct ggml_opt_context * opt,
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb
+        struct ggml_cgraph * gb,
+        ggml_opt_callback callback,
+        void * callback_data) {
 
     // build forward + backward compute graphs
     enum ggml_opt_result result = GGML_OPT_OK;
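
Putting the pieces together, a caller that wants per-iteration control can now pass a callback and its state straight through (a hypothetical sketch reusing the warmup_callback idea from above; 'loss', 'gf', and 'gb' are assumed to exist):

    int step = 0;
    enum ggml_opt_result res =
        ggml_opt_resume_g(ctx, &opt, loss, &gf, &gb, warmup_callback, &step);
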
@@ -19158,11 +19330,11 @@ enum ggml_opt_result ggml_opt_resume_g(
     switch (opt->params.type) {
         case GGML_OPT_ADAM:
             {
-                result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
+                result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
             } break;
         case GGML_OPT_LBFGS:
             {
-                result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
+                result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
             } break;
     }
 
@@ -19394,7 +19566,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 ////////////////////////////////////////////////////////////////////////////////
 
 struct gguf_str {
-
+    uint64_t n;  // GGUFv2
     char * data;
 };
 
@@ -19408,9 +19580,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
     [GGUF_TYPE_FLOAT32] = sizeof(float),
     [GGUF_TYPE_BOOL]    = sizeof(bool),
     [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
+    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
+    [GGUF_TYPE_INT64]   = sizeof(int64_t),
+    [GGUF_TYPE_FLOAT64] = sizeof(double),
     [GGUF_TYPE_ARRAY]   = 0, // undefined
 };
-static_assert(GGUF_TYPE_COUNT ==
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 
 static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
     [GGUF_TYPE_UINT8]  = "u8",
@@ -19423,8 +19598,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
     [GGUF_TYPE_BOOL]    = "bool",
     [GGUF_TYPE_STRING]  = "str",
     [GGUF_TYPE_ARRAY]   = "arr",
+    [GGUF_TYPE_UINT64]  = "u64",
+    [GGUF_TYPE_INT64]   = "i64",
+    [GGUF_TYPE_FLOAT64] = "f64",
 };
-static_assert(GGUF_TYPE_COUNT ==
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 
 union gguf_value {
     uint8_t  uint8;
@@ -19434,6 +19612,9 @@ union gguf_value {
     uint32_t uint32;
     int32_t  int32;
     float    float32;
+    uint64_t uint64;
+    int64_t  int64;
+    double   float64;
     bool     bool_;
 
     struct gguf_str str;
@@ -19441,7 +19622,7 @@ union gguf_value {
     struct {
         enum gguf_type type;
 
-
+        uint64_t n;  // GGUFv2
         void * data;
     } arr;
 };
@@ -19449,8 +19630,6 @@ union gguf_value {
 struct gguf_kv {
     struct gguf_str key;
 
-    uint32_t n_bytes; // TODO: is this actually needed?
-
     enum gguf_type type;
     union gguf_value value;
 };
@@ -19458,15 +19637,15 @@ struct gguf_kv {
 struct gguf_header {
     uint32_t magic;
     uint32_t version;
-
-
+    uint64_t n_tensors; // GGUFv2
+    uint64_t n_kv;      // GGUFv2
 };
 
 struct gguf_tensor_info {
     struct gguf_str name;
 
     uint32_t n_dims;
-
+    uint64_t ne[GGML_MAX_DIMS];
 
     enum ggml_type type;
 
@@ -19497,19 +19676,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
     return n == size;
 }
 
-
+// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
     p->n    = 0;
     p->data = NULL;
 
     bool ok = true;
 
-    // TODO: how to avoid mallocs for strings?
     ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
     ok = ok && gguf_fread_el(file, p->data, p->n, offset);
 
     return ok;
 }
 
+static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
+    p->n    = 0;
+    p->data = NULL;
+
+    bool ok = true;
+
+    uint32_t n = 0;
+    ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
+    ok = ok && gguf_fread_el(file, p->data, p->n, offset);
+
+    return ok;
+}
+
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -19565,8 +19757,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     ctx->data = NULL;
 
     ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
-
-
+
+    if (ctx->header.version == 1) {
+        // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+        uint32_t n_tensors = 0;
+        uint32_t n_kv      = 0;
+
+        ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &n_kv,      sizeof(n_kv),      &offset);
+
+        ctx->header.n_tensors = n_tensors;
+        ctx->header.n_kv      = n_kv;
+    } else {
+        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
+    }
 
     if (!ok) {
         fprintf(stderr, "%s: failed to read header\n", __func__);
@@ -19576,18 +19781,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
     }
 
+    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+    bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
+    if (ctx->header.version == 1) {
+        gguf_fread_str = gguf_fread_str_v1;
+    }
+
     // read the kv pairs
     {
-        ctx->kv =
+        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
 
         for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
 
-            ok = ok && gguf_fread_str(file, &kv->key,
-
-            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
+            ok = ok && gguf_fread_str(file, &kv->key,                    &offset);
+            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
 
             //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
 
@@ -19599,12 +19809,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
                 case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
                 case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
+                case GGUF_TYPE_UINT64:  ok = ok && gguf_fread_el (file, &kv->value.uint64,  sizeof(kv->value.uint64),  &offset); break;
+                case GGUF_TYPE_INT64:   ok = ok && gguf_fread_el (file, &kv->value.int64,   sizeof(kv->value.int64),   &offset); break;
+                case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
                 case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
                 case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
                 case GGUF_TYPE_ARRAY:
                     {
                         ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-
+
+                        if (ctx->header.version == 1) {
+                            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+                            uint32_t n = 0;
+                            ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
+                            kv->value.arr.n = n;
+                        } else {
+                            ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
+                        }
 
                         switch (kv->value.arr.type) {
                             case GGUF_TYPE_UINT8:
@@ -19614,6 +19835,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             case GGUF_TYPE_UINT32:
                             case GGUF_TYPE_INT32:
                             case GGUF_TYPE_FLOAT32:
+                            case GGUF_TYPE_UINT64:
+                            case GGUF_TYPE_INT64:
+                            case GGUF_TYPE_FLOAT64:
                             case GGUF_TYPE_BOOL:
                                 {
                                     kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -19648,7 +19872,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos =
+        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
 
         for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19660,7 +19884,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             ok = ok && gguf_fread_str(file, &info->name,                          &offset);
             ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
             for (uint32_t j = 0; j < info->n_dims; ++j) {
-
+                if (ctx->header.version == 1) {
+                    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+                    uint32_t t = 0;
+                    ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
+                    info->ne[j] = t;
+                } else {
+                    ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+                }
             }
             ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
@@ -19744,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     struct ggml_tensor * data = NULL;
 
-    if (params.no_alloc
+    if (!params.no_alloc) {
         data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
 
        ok = ok && data != NULL;
@@ -19785,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     }
 
     // point the data member to the appropriate location in the binary blob using the tensor infos
-    if (params.no_alloc
+    if (!params.no_alloc) {
         //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
         cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
     }
@@ -19842,7 +20073,7 @@ void gguf_free(struct gguf_context * ctx) {
             }
         }
 
-
+        free(ctx->kv);
     }
 
     if (ctx->infos) {
@@ -19854,7 +20085,7 @@ void gguf_free(struct gguf_context * ctx) {
             }
         }
 
-
+        free(ctx->infos);
     }
 
     GGML_ALIGNED_FREE(ctx);
@@ -19954,6 +20185,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.float32;
 }
 
+uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint64;
+}
+
+int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int64;
+}
+
+double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.float64;
+}
+
 bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.bool_;
 }
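
A read-side usage sketch for the new 64-bit getters (the key name is hypothetical; gguf_find_key and gguf_get_kv_type are existing accessors of this API):

    const int idx = gguf_find_key(ctx, "example.sample_count"); // hypothetical key
    if (idx >= 0 && gguf_get_kv_type(ctx, idx) == GGUF_TYPE_UINT64) {
        const uint64_t n_samples = gguf_get_val_u64(ctx, idx);
        printf("samples: %llu\n", (unsigned long long) n_samples);
    }
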
@@ -20000,7 +20243,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int n_kv = gguf_get_n_kv(ctx);
 
     ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
-    ctx->kv[n_kv].key.n = strlen(key)
+    ctx->kv[n_kv].key.n    = strlen(key);
     ctx->kv[n_kv].key.data = strdup(key);
     ctx->header.n_kv++;
 
@@ -20056,6 +20299,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
     ctx->kv[idx].value.float32 = val;
 }
 
+void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT64;
+    ctx->kv[idx].value.uint64 = val;
+}
+
+void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT64;
+    ctx->kv[idx].value.int64 = val;
+}
+
+void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type          = GGUF_TYPE_FLOAT64;
+    ctx->kv[idx].value.float64 = val;
+}
+
 void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
     const int idx = gguf_get_or_add_key(ctx, key);
 
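
And the corresponding write side (the key names and the gguf_write_to_file call are an assumed usage, not taken from this diff):

    struct gguf_context * out = gguf_init_empty();
    gguf_set_val_u64(out, "example.sample_count", 1234567890123ULL); // hypothetical key
    gguf_set_val_f64(out, "example.avg_loss", 2.345);                // hypothetical key
    gguf_write_to_file(out, "out.gguf", /*only_meta =*/ true);
    gguf_free(out);
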
@@ -20067,7 +20331,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
     const int idx = gguf_get_or_add_key(ctx, key);
 
     ctx->kv[idx].type           = GGUF_TYPE_STRING;
-    ctx->kv[idx].value.str.n = strlen(val)
+    ctx->kv[idx].value.str.n    = strlen(val);
     ctx->kv[idx].value.str.data = strdup(val);
 }
 
@@ -20090,7 +20354,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
     ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
     for (int i = 0; i < n; i++) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
-        str->n = strlen(data[i])
+        str->n    = strlen(data[i]);
         str->data = strdup(data[i]);
     }
 }
@@ -20106,6 +20370,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
             case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
             case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
+            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);   break;
+            case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);    break;
+            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64);  break;
             case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
             case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
             case GGUF_TYPE_ARRAY:
@@ -20134,7 +20401,7 @@ void gguf_add_tensor(
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
-    ctx->infos[idx].name.n = strlen(tensor->name)
+    ctx->infos[idx].name.n    = strlen(tensor->name);
     ctx->infos[idx].name.data = strdup(tensor->name);
 
     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -20267,6 +20534,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
             case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
             case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
             case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
+            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
+            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
             case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
             case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
             case GGUF_TYPE_ARRAY:
@@ -20282,6 +20552,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
                     case GGUF_TYPE_UINT32:
                     case GGUF_TYPE_INT32:
                     case GGUF_TYPE_FLOAT32:
+                    case GGUF_TYPE_UINT64:
+                    case GGUF_TYPE_INT64:
+                    case GGUF_TYPE_FLOAT64:
                     case GGUF_TYPE_BOOL:
                         {
                             gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@@ -20516,6 +20789,14 @@ int ggml_cpu_has_sse3(void) {
 #endif
 }
 
+int ggml_cpu_has_ssse3(void) {
+#if defined(__SSSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;