llama_cpp 0.0.2 → 0.0.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -2
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +97 -3
- data/ext/llama_cpp/src/ggml.c +1254 -670
- data/ext/llama_cpp/src/ggml.h +110 -42
- data/ext/llama_cpp/src/llama.cpp +878 -757
- data/ext/llama_cpp/src/llama.h +42 -1
- data/ext/llama_cpp/src/llama_util.h +389 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -1
- data/sig/llama_cpp.rbs +55 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.c CHANGED
@@ -1,4 +1,4 @@
-// Defines CLOCK_MONOTONIC
+// Defines CLOCK_MONOTONIC on Linux
 #define _GNU_SOURCE
 
 #include "ggml.h"
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <float.h>
 
@@ -25,14 +26,9 @@
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 
-#if defined
+#if defined(_WIN32)
 
-#if !defined(__MINGW32__)
-#include <Windows.h>
-#else
-// ref: https://github.com/ggerganov/whisper.cpp/issues/168
 #include <windows.h>
-#endif
 
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
@@ -54,6 +50,7 @@ typedef HANDLE pthread_t;
 
 typedef DWORD thread_ret_t;
 static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+    (void) unused;
     HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
     if (handle == NULL)
     {
@@ -65,6 +62,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void
 }
 
 static int pthread_join(pthread_t thread, void* unused) {
+    (void) unused;
     return (int) WaitForSingleObject(thread, INFINITE);
 }
 
@@ -96,17 +94,6 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
 
-#define GGML_MLOCK_SUPPORT 0
-
-#ifdef __has_include
-#if __has_include(<sys/mman.h>)
-#undef GGML_MLOCK_SUPPORT
-#define GGML_MLOCK_SUPPORT 1
-#include <sys/mman.h>
-#endif
-#endif
-
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -127,6 +114,14 @@ typedef void* thread_ret_t;
 #define GGML_MEM_ALIGN 16
 #endif
 
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
+#else
+#define GGML_ALIGNED_MALLOC(size)  aligned_alloc(GGML_MEM_ALIGN, size)
+#define GGML_ALIGNED_FREE(ptr)     free(ptr)
+#endif
+
 #define UNUSED(x) (void)(x)
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
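A note on the `GGML_ALIGNED_MALLOC`/`GGML_ALIGNED_FREE` pair introduced above: memory from `_aligned_malloc` must be released with `_aligned_free`, and C11 `aligned_alloc` requires the size to be a multiple of the alignment. A minimal standalone sketch of the same wrapper pattern (the `MEM_ALIGN`/`ALIGNED_*` names here are illustrative, not part of ggml):

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MEM_ALIGN 16

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h>
#define ALIGNED_MALLOC(size) _aligned_malloc(size, MEM_ALIGN)
#define ALIGNED_FREE(ptr)    _aligned_free(ptr)
#else
#define ALIGNED_MALLOC(size) aligned_alloc(MEM_ALIGN, size)
#define ALIGNED_FREE(ptr)    free(ptr)
#endif

int main(void) {
    // size kept a multiple of MEM_ALIGN to satisfy aligned_alloc
    void * buf = ALIGNED_MALLOC(1024);
    printf("16-byte aligned: %s\n", ((uintptr_t) buf % MEM_ALIGN) == 0 ? "yes" : "no");
    ALIGNED_FREE(buf);
    return 0;
}
```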
@@ -241,12 +236,12 @@ static inline float fp32_from_bits(uint32_t w) {
 }
 
 static inline uint32_t fp32_to_bits(float f) {
-	union {
-		float as_value;
-		uint32_t as_bits;
-	} fp32;
-	fp32.as_value = f;
-	return fp32.as_bits;
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
 }
 
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
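The `fp32_to_bits` body above relies on union type punning, which C (unlike C++) permits for reading back an object's representation. A self-contained sketch comparing it with the equivalent `memcpy` idiom (both compile down to a plain move):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Union-based punning, as in fp32_to_bits above.
static uint32_t bits_via_union(float f) {
    union { float as_value; uint32_t as_bits; } u;
    u.as_value = f;
    return u.as_bits;
}

// Equivalent memcpy idiom, also free of strict-aliasing issues.
static uint32_t bits_via_memcpy(float f) {
    uint32_t b;
    memcpy(&b, &f, sizeof(b));
    return b;
}

int main(void) {
    printf("%08x %08x\n", bits_via_union(1.0f), bits_via_memcpy(1.0f)); // 3f800000 3f800000
    return 0;
}
```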
@@ -496,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
 }
 #endif
 
+#if __ARM_NEON
+
+#if !defined(__aarch64__)
+
+inline static uint16_t vaddvq_u8(uint8x16_t v) {
+    return
+        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
+        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
+        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
+        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
+        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
+        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
+        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
+        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
+}
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static uint32_t vaddvq_u16(uint16x8_t v) {
+    return
+        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
+        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
+        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
+        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline float vminvq_f32(float32x4_t v) {
+    return
+        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
+    return vget_low_s8(vcombine_s8(a, b));
+}
+
+inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
+    return vget_high_s8(vcombine_s8(a, b));
+}
+
+inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_low_u8(vcombine_u8(a, b));
+}
+
+inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_high_u8(vcombine_u8(a, b));
+}
+
+#endif
+#endif
+
 // method 5
 // blocks of QK elements
 // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
@@ -609,10 +675,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
         for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
 
-
-        const float amax = MAX(
-                MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
-                MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
+        const float amax = vmaxvq_f32(amaxv[0]);
 
         const float d = amax / ((1 << 3) - 1);
         const float id = d ? 1.0f/d : 0.0f;
@@ -934,7 +997,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
         float32x4_t minv[8];
         float32x4_t maxv[8];
 
-        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l);
 
         for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
         for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
@@ -957,7 +1020,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
 
         for (int l = 0; l < 8; l++) {
            const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
-            const int32x4_t vi = vcvtq_s32_f32(v);
+            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest
+            const int32x4_t vi = vcvtq_s32_f32(vf);
 
             y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
             y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
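The `+ 0.5f` added above matters because `vcvtq_s32_f32` truncates toward zero; since the quantized q4_1 values at this point are non-negative, adding 0.5 first turns truncation into round-to-nearest. A scalar model of the same fix:

```c
#include <stdio.h>

// Truncating conversion, as vcvtq_s32_f32 does per lane.
static int convert_trunc(float x)   { return (int) x; }

// With the +0.5f pre-bias: round-to-nearest for x >= 0.
static int convert_nearest(float x) { return (int) (x + 0.5f); }

int main(void) {
    printf("%d %d\n", convert_trunc(2.9f),  convert_nearest(2.9f));  // 2 3
    printf("%d %d\n", convert_trunc(14.4f), convert_nearest(14.4f)); // 14 14
    return 0;
}
```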
@@ -1225,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
 #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
 #define GGML_F32x4_ADD          vaddq_f32
 #define GGML_F32x4_MUL          vmulq_f32
-#if defined(__ARM_FEATURE_QRDMX)
-#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#else
-#define GGML_F32x4_REDUCE_ONE(x) \
-    (vgetq_lane_f32(x, 0) +      \
-     vgetq_lane_f32(x, 1) +      \
-     vgetq_lane_f32(x, 2) +      \
-     vgetq_lane_f32(x, 3))
-#endif
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x) \
 { \
     for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
@@ -1856,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         // 4-bit -> 8-bit
         const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
         const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
-
         const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
         const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
 
         const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
         const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
-
         const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
         const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
 
         // sub 8
         const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
         const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
-
         const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
         const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
 
         const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
         const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
-
         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
         const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
 
 #if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into
+        // dot product into int32x4_t
         int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
         int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
 
         p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
         p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
 
-
-
-        sum0 += x0->d * y0->d * vaddvq_s32(p_0);
-        sum1 += x1->d * y1->d * vaddvq_s32(p_1);
-#else
-        sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
-        sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
-#endif
+        sum0 += x0->d*y0->d*vaddvq_s32(p_0);
+        sum1 += x1->d*y1->d*vaddvq_s32(p_1);
 #else
-
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-
         const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
         const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
 
         const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
         const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
         const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
         const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
 
@@ -1917,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
         const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
 
-
-
-        sum0 += x0->d * y0->d * vaddvq_s16(p_0);
-        sum1 += x1->d * y1->d * vaddvq_s16(p_1);
-#else
-        sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-        sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
-#endif
+        sum0 += x0->d*y0->d*vaddvq_s16(p_0);
+        sum1 += x1->d*y1->d*vaddvq_s16(p_1);
 #endif
     }
 
@@ -1961,41 +1999,68 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
-
-
-
-#pragma GCC unroll 16
-#endif
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        __m256i bx = bytesFromNibbles( x[i].qs );
-        __m256i by = bytesFromNibbles( y[i].qs );
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
-        by = _mm256_sub_epi8( by, off );
-
-        // Get absolute values of x vectors
-        const __m256i ax = _mm256_sign_epi8(bx, bx);
-
-        // Sign the values of the y vectors
-        const __m256i sy = _mm256_sign_epi8(by, bx);
-
-        // Perform multiplication and create 16-bit values
-        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-
-        const __m256i ones = _mm256_set1_epi16(1);
-        const __m256i i32 = _mm256_madd_epi16(ones, dot);
+    /* Prepare the constants we will need during execution */
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    const __m256i offset_8 = _mm256_set1_epi16( 8 );
 
-
-
+#define UNROLL_COUNT 8
+    // make sure we only unroll multiples of the block count
+    assert(nb % UNROLL_COUNT == 0);
 
-
-
+    // Main loop
+    for (int i = 0; i < nb; i+=UNROLL_COUNT) {
+        // This loop will be unrolled by the compiler
+        for (int u=0;u<UNROLL_COUNT;u++)  {
+            /* Compute combined scale for the block */
+            const __m256 scale = _mm256_mul_ps(
+                    _mm256_broadcast_ss( &x[i+u].d ),
+                    _mm256_broadcast_ss( &y[i+u].d ) );
+
+            /* get input from x
+               Input: 32 Nibbles (16 bytes) at *x[i+u]
+               Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
+            /* Unpack values into individual bytes */
+            __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
+            const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
+            __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
+            x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
+
+            /* get input from y
+               Input: 32 Nibbles (16 bytes) at *y[i+u]
+               Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
+            /* Unpack values into individual bytes */
+            const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
+            __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
+            __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
+            y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
+
+            /* Compute products of int16_t integers, add pairwise, store as int32_t */
+            __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
+            __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
+
+            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
+            __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
+
+            /* Convert to vectore of 8 int32_t to 8 floats */
+            __m256 q = _mm256_cvtepi32_ps( xy_q );
+
+            /* Multiply q with scale and accumulate */
+            acc = _mm256_fmadd_ps( scale, q, acc );
+        }
     }
 
     // Return horizontal sum of the acc vector
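The unrolled AVX2 loop above leans on `_mm256_madd_epi16` to do the multiply-and-pairwise-add in one step. A scalar model of that intrinsic (a sketch for readers without the Intel intrinsics guide at hand; the names here are illustrative):

```c
#include <stdint.h>
#include <stdio.h>

// Scalar model of _mm256_madd_epi16: multiply adjacent int16 pairs
// and sum each pair into one int32 lane.
static void madd_epi16_model(const int16_t a[16], const int16_t b[16], int32_t out[8]) {
    for (int i = 0; i < 8; i++) {
        out[i] = (int32_t) a[2*i] * b[2*i] + (int32_t) a[2*i+1] * b[2*i+1];
    }
}

int main(void) {
    int16_t a[16] = {0}, b[16] = {0};
    a[0] = 3; a[1] = -5; b[0] = 2; b[1] = 4; // first pair: 3*2 + (-5)*4 = -14
    int32_t out[8];
    madd_epi16_model(a, b, out);
    printf("%d\n", out[0]); // -14
    return 0;
}
```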
@@ -2025,7 +2090,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         bx = _mm_sub_epi8( bx, off );
         by = _mm_sub_epi8( by, off );
 
-
+        // Get absolute values of x vectors
         const __m128i ax = _mm_sign_epi8(bx, bx);
 
         // Sign the values of the y vectors
@@ -2057,18 +2122,18 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     float sum1 = 0.0f;
 
     for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &
-        const block_q4_0 * restrict y0 = &
-        const block_q4_0 * restrict x1 = &
-        const block_q4_0 * restrict y1 = &
+        const block_q4_0 * restrict x0 = &x[i + 0];
+        const block_q4_0 * restrict y0 = &y[i + 0];
+        const block_q4_0 * restrict x1 = &x[i + 1];
+        const block_q4_0 * restrict y1 = &y[i + 1];
 
         const v128_t m4b = wasm_u8x16_splat(0xf);
         const v128_t s8b = wasm_i8x16_splat(0x8);
 
-        const v128_t v0_0 = wasm_v128_load(x0
-        const v128_t v0_1 = wasm_v128_load(y0
-        const v128_t v1_0 = wasm_v128_load(x1
-        const v128_t v1_1 = wasm_v128_load(y1
+        const v128_t v0_0 = wasm_v128_load(x0->qs);
+        const v128_t v0_1 = wasm_v128_load(y0->qs);
+        const v128_t v1_0 = wasm_v128_load(x1->qs);
+        const v128_t v1_1 = wasm_v128_load(y1->qs);
 
         // 4-bit -> 8-bit
         const v128_t v0_0l = wasm_v128_and(v0_0, m4b);
@@ -2140,18 +2205,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         const uint8_t * restrict p0 = x[i].qs;
         const uint8_t * restrict p1 = y[i].qs;
 
+        int sumi = 0;
         for (int j = 0; j < QK/2; j++) {
             const uint8_t v0 = p0[j];
             const uint8_t v1 = p1[j];
 
-            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
-            const float f1 = d0*((int8_t) (v0 >> 4) - 8);
+            const int8_t i0 = (int8_t) (v0 & 0xf) - 8;
+            const int8_t i1 = (int8_t) (v0 >> 4) - 8;
 
-            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
-            const float f3 = d1*((int8_t) (v1 >> 4) - 8);
+            const int8_t i2 = (int8_t) (v1 & 0xf) - 8;
+            const int8_t i3 = (int8_t) (v1 >> 4) - 8;
 
-            sumf += f0*f2 + f1*f3;
+            sumi += i0*i2 + i1*i3;
         }
+        sumf += d0 * d1 * sumi;
     }
 #endif
 
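For readers following the scalar fallback above: each q4_0 byte packs two 4-bit codes in [0, 15], and subtracting 8 recenters them to [-8, 7]. A tiny standalone check of that unpacking:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // One packed q4_0 byte: low nibble 0xA, high nibble 0x3.
    const uint8_t v0 = 0x3A;
    const int8_t  i0 = (int8_t) (v0 & 0xf) - 8; // 10 - 8 =  2
    const int8_t  i1 = (int8_t) (v0 >> 4) - 8;  //  3 - 8 = -5
    printf("%d %d\n", i0, i1);
    return 0;
}
```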
@@ -2243,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     float sum10 = 0.0f;
     float sum11 = 0.0f;
 
-    for (int i = 0; i < nb; i++) {
+    for (int i = 0; i < nb; i += 2) {
         const block_q4_1 * restrict x0 = &x[i + 0];
         const block_q4_1 * restrict y0 = &y[i + 0];
+        const block_q4_1 * restrict x1 = &x[i + 1];
+        const block_q4_1 * restrict y1 = &y[i + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0xf);
 
         const uint8x16_t v0_0 = vld1q_u8(x0->qs);
         const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+        const uint8x16_t v1_1 = vld1q_u8(y1->qs);
 
-        //
+        // 4-bit -> 8-bit
         const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
         const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
-
         const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
         const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
 
-
+        const uint8x16_t v0_1l = vandq_u8(v0_1, m4b);
+        const uint8x16_t v1_1l = vandq_u8(v1_1, m4b);
+        const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4);
+        const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4);
+
+        sum00 += x0->m*y0->m;
+        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+
+        sum00 += x1->m*y1->m;
+        sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h));
+        sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h));
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        // dot product into int32x4_t
+        uint32x4_t p_0 = vdotq_u32(vdupq_n_u32(0), v0_0l, v1_0l);
+        uint32x4_t p_1 = vdotq_u32(vdupq_n_u32(0), v0_1l, v1_1l);
+
+        p_0 = vdotq_u32(p_0, v0_0h, v1_0h);
+        p_1 = vdotq_u32(p_1, v0_1h, v1_1h);
+
+        sum11 += x0->d*y0->d*vaddvq_u32(p_0);
+        sum11 += x1->d*y1->d*vaddvq_u32(p_1);
+#else
         const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
         const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
-
         const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
         const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
 
-        const uint16x8_t
-        const uint16x8_t
+        const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
+        const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
+        const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
+        const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
 
-
-
-
-
+        const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
+        const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);
+
+        const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h);
+        const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h);
+
+        const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0);
+        const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1);
+
+        sum11 += x0->d*y0->d*vaddvq_u16(p_0);
+        sum11 += x1->d*y1->d*vaddvq_u16(p_1);
+#endif
     }
 
     sumf = QK*sum00 + sum01 + sum10 + sum11;
@@ -2548,29 +2650,38 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
 //
 
 static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    QK,
-    QK,
-    1,
-    1,
-    1,
-    1,
-    1,
+    [GGML_TYPE_F32]  = 1,
+    [GGML_TYPE_F16]  = 1,
+    [GGML_TYPE_Q4_0] = QK,
+    [GGML_TYPE_Q4_1] = QK,
+    [GGML_TYPE_I8]   = 1,
+    [GGML_TYPE_I16]  = 1,
+    [GGML_TYPE_I32]  = 1,
 };
-
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    sizeof(block_q4_0),
-    sizeof(block_q4_1),
-    sizeof(int8_t),
-    sizeof(int16_t),
-    sizeof(int32_t),
-    sizeof(ggml_fp16_t),
-    sizeof(float),
+    [GGML_TYPE_F32]  = sizeof(float),
+    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
+    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
+    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
+    [GGML_TYPE_I8]   = sizeof(int8_t),
+    [GGML_TYPE_I16]  = sizeof(int16_t),
+    [GGML_TYPE_I32]  = sizeof(int32_t),
 };
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
 
-
-
+
+static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_F32]  = "f32",
+    [GGML_TYPE_F16]  = "f16",
+    [GGML_TYPE_Q4_0] = "q4_0",
+    [GGML_TYPE_Q4_1] = "q4_1",
+    [GGML_TYPE_I8]   = "i8",
+    [GGML_TYPE_I16]  = "i16",
+    [GGML_TYPE_I32]  = "i32",
+};
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
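The pattern introduced above is worth noting: per-type lookup tables switch to designated initializers indexed by the enum, and each table is pinned with a `static_assert` so adding a type without updating every table fails at compile time. A miniature standalone sketch of the same guard (the `DEMO_*` names are illustrative):

```c
#include <assert.h>

enum demo_type { DEMO_F32, DEMO_F16, DEMO_TYPE_COUNT };

// Designated initializers keep entries correct even if the enum is reordered.
static const int DEMO_BLCK_SIZE[DEMO_TYPE_COUNT] = {
    [DEMO_F32] = 1,
    [DEMO_F16] = 1,
};
// Breaks the build if a new enum member is added but the table is not updated.
static_assert(DEMO_TYPE_COUNT == 2, "DEMO_BLCK_SIZE is outdated");

int main(void) {
    return DEMO_BLCK_SIZE[DEMO_F32] == 1 ? 0 : 1;
}
```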
@@ -2599,6 +2710,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 
     "SCALE",
     "CPY",
+    "CONT",
     "RESHAPE",
     "VIEW",
     "PERMUTE",
@@ -2612,9 +2724,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 
     "FLASH_ATTN",
     "FLASH_FF",
+
+    "MAP_UNARY",
+    "MAP_BINARY",
 };
 
-static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2643,6 +2758,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "x*v",
     "x-\\>y",
+    "cont(x)",
     "reshape(x)",
     "view(x)",
     "permute(x)",
@@ -2656,24 +2772,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "flash_attn(x)",
     "flash_ff(x)",
-};
 
-
-
-//
-// ggml object
-//
-
-struct ggml_object {
-    size_t offs;
-    size_t size;
-
-    struct ggml_object * next;
-
-    char padding[8];
+    "f(x)",
+    "f(x,y)",
 };
 
-
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -2686,7 +2790,6 @@ struct ggml_context {
     size_t mem_size;
     void * mem_buffer;
     bool   mem_buffer_owned;
-    bool   mem_buffer_mlocked;
     bool   no_alloc;
 
     int    n_objects;
@@ -2774,7 +2877,7 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-int ggml_nelements(const struct ggml_tensor * tensor) {
+int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -2804,6 +2907,11 @@ float ggml_type_sizef(enum ggml_type type) {
     return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
 }
 
+const char * ggml_type_name(enum ggml_type type) {
+    return GGML_TYPE_NAME[type];
+}
+
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
@@ -2969,11 +3077,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
+
     *ctx = (struct ggml_context) {
-        /*.mem_size           =*/ params.mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+        /*.mem_size           =*/ mem_size,
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.mem_buffer_mlocked =*/ false,
         /*.no_alloc           =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
         /*.objects_begin      =*/ NULL,
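The new `mem_size` expression above is the standard power-of-two round-up: `(n + align - 1) & ~(align - 1)` rounds `n` up to the next multiple of `align`. Isolated as a small checkable program:

```c
#include <assert.h>
#include <stddef.h>

// Round n up to the next multiple of align; valid when align is a power of two.
static size_t round_up(size_t n, size_t align) {
    return (n + align - 1) & ~(align - 1);
}

int main(void) {
    assert(round_up(1000, 16) == 1008);
    assert(round_up(1024, 16) == 1024);
    return 0;
}
```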
@@ -2982,7 +3091,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.scratch_save       =*/ { 0, 0, NULL, },
     };
 
-    GGML_ASSERT(ctx->mem_buffer != NULL);
+    GGML_ASSERT(ctx->mem_buffer != NULL);
 
     ggml_assert_aligned(ctx->mem_buffer);
 
@@ -3006,16 +3115,8 @@ void ggml_free(struct ggml_context * ctx) {
             GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
                     __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
 
-#if GGML_MLOCK_SUPPORT
-            if (ctx->mem_buffer_mlocked) {
-                if (munlock(ctx->mem_buffer, ctx->mem_size)) {
-                    fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
-                }
-            }
-#endif
-
             if (ctx->mem_buffer_owned) {
-                free(ctx->mem_buffer);
+                GGML_ALIGNED_FREE(ctx->mem_buffer);
             }
 
             found = true;
@@ -3042,55 +3143,13 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
-#ifdef __APPLE__
-#define MLOCK_SUGGESTION \
-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-#else
-#define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-#endif
-
-bool ggml_mlock_supported(void) {
-    return GGML_MLOCK_SUPPORT;
-}
-
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p) {
-    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        return true;
-    }
-    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
-            (opt_extra_len &&
-             mlock(opt_extra_addr, opt_extra_len))) {
-        if ((*err_p = malloc(1024))) {
-            snprintf(*err_p, 1024,
-                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
-                     ctx->mem_size + opt_extra_len,
-                     strerror(errno));
-        }
-        return false;
-    }
-    ctx->mem_buffer_mlocked = true;
-    return true;
-#else // GGML_MLOCK_SUPPORT
-    *err_p = strdup("can't mlock because it's not supported on this system");
-    return false;
-#endif // GGML_MLOCK_SUPPORT
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
-        const int* ne,
+        const int64_t* ne,
         void* data) {
     // always insert objects at the end of the context's memory pool
     struct ggml_object * obj_cur = ctx->objects_end;
@@ -3189,7 +3248,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.pad          =*/ { 0 },
     };
 
-    ggml_assert_aligned(result->data);
+    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
+    //ggml_assert_aligned(result->data);
 
     for (int i = 0; i < n_dims; i++) {
         result->ne[i] = ne[i];
@@ -3210,44 +3270,44 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
-        const int * ne) {
+        const int64_t * ne) {
     return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0) {
+        int64_t ne0) {
     return ggml_new_tensor(ctx, type, 1, &ne0);
 }
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0,
-        int ne1) {
-    const int ne[2] = { ne0, ne1 };
+        int64_t ne0,
+        int64_t ne1) {
+    const int64_t ne[2] = { ne0, ne1 };
     return ggml_new_tensor(ctx, type, 2, ne);
 }
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0,
-        int ne1,
-        int ne2) {
-    const int ne[3] = { ne0, ne1, ne2 };
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2) {
+    const int64_t ne[3] = { ne0, ne1, ne2 };
     return ggml_new_tensor(ctx, type, 3, ne);
 }
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum ggml_type type,
-        int ne0,
-        int ne1,
-        int ne2,
-        int ne3) {
-    const int ne[4] = { ne0, ne1, ne2, ne3 };
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3) {
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     return ggml_new_tensor(ctx, type, 4, ne);
 }
 
@@ -3590,7 +3650,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
-    return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+
+    result->nb[0] = src->nb[0];
+    result->nb[1] = src->nb[1];
+    result->nb[2] = src->nb[2];
+    result->nb[3] = src->nb[3];
+
+    return result;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -3894,7 +3961,7 @@ struct ggml_tensor * ggml_mean(
         is_node = true;
     }
 
-    int ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+    int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
 
     result->op   = GGML_OP_MEAN;
@@ -4255,7 +4322,7 @@ struct ggml_tensor * ggml_mul_mat(
         is_node = true;
     }
 
-    const int ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
+    const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
 
     result->op   = GGML_OP_MUL_MAT;
@@ -4350,6 +4417,41 @@ struct ggml_tensor * ggml_cpy_inplace(
     return ggml_cpy_impl(ctx, a, b, true);
 }
 
+// ggml_cont
+
+struct ggml_tensor * ggml_cont_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_CONT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_cont_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_cont_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_cont_impl(ctx, a, true);
+}
+
 // ggml_reshape
 
 struct ggml_tensor * ggml_reshape(
@@ -4380,8 +4482,8 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
-        int ne1) {
+        int64_t ne0,
+        int64_t ne1) {
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
 
@@ -4392,7 +4494,7 @@ struct ggml_tensor * ggml_reshape_2d(
         is_node = true;
     }
 
-    const int ne[2] = { ne0, ne1 };
+    const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
 
     result->op   = GGML_OP_RESHAPE;
@@ -4406,9 +4508,9 @@ struct ggml_tensor * ggml_reshape_2d(
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
-        int ne1,
-        int ne2) {
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2) {
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
 
@@ -4419,7 +4521,7 @@ struct ggml_tensor * ggml_reshape_3d(
         is_node = true;
     }
 
-    const int ne[3] = { ne0, ne1, ne2 };
+    const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
 
     result->op   = GGML_OP_RESHAPE;
@@ -4435,7 +4537,7 @@ struct ggml_tensor * ggml_reshape_3d(
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
+        int64_t ne0,
         size_t offset) {
     if (a->grad) {
         GGML_ASSERT(false); // gradient propagation is not supported
@@ -4456,15 +4558,15 @@ struct ggml_tensor * ggml_view_1d(
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int ne0,
-        int ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1,
         size_t offset) {
     if (a->grad) {
         GGML_ASSERT(false); // gradient propagation is not supported
     }
 
-    const int ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
@@ -4480,6 +4582,37 @@ struct ggml_tensor * ggml_view_2d(
     return result;
 }
 
+// ggml_view_3d
+
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t nb1,
+        size_t nb2,
+        size_t offset) {
+    if (a->grad) {
+        GGML_ASSERT(false); // gradient propagation is not supported
+    }
+
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = nb2;
+    result->nb[3] = result->nb[2]*ne2;
+
+    result->op   = GGML_OP_VIEW;
+    result->grad = NULL;
+    result->src0 = a;
+    result->src1 = NULL; // TODO: maybe store the offset here?
+
+    return result;
+}
+
 // ggml_permute
 
 struct ggml_tensor * ggml_permute(
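For orientation on the `nb[]` fields set by the new `ggml_view_3d` above: in ggml, `nb[0]` is the element size and, for a contiguous tensor, each higher stride is the previous stride times that dimension's extent, so the byte offset of an element is the dot product of its indices with `nb[]`. A small standalone model of that convention:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne[4] = { 8, 4, 2, 1 }; // extents per dimension
    size_t nb[4];
    nb[0] = sizeof(float);                // element size
    for (int i = 1; i < 4; i++) {
        nb[i] = nb[i-1] * (size_t) ne[i-1];
    }
    // byte offset of element (i0=3, i1=2, i2=1, i3=0)
    const size_t off = 3*nb[0] + 2*nb[1] + 1*nb[2] + 0*nb[3];
    printf("%zu\n", off); // 3*4 + 2*32 + 1*128 = 204
    return 0;
}
```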
@@ -4695,7 +4828,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
         is_node = true;
     }
 
-    const int ne[4] = { b->ne[0], a->ne[2], 1, 1, };
+    const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     result->op   = GGML_OP_CONV_1D_1S;
@@ -4722,7 +4855,7 @@ struct ggml_tensor * ggml_conv_1d_2s(
         is_node = true;
     }
 
-    const int ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
+    const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     result->op   = GGML_OP_CONV_1D_2S;
@@ -4797,6 +4930,90 @@ struct ggml_tensor * ggml_flash_ff(
     return result;
 }
 
+// ggml_map_unary
+
+struct ggml_tensor * ggml_map_unary_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_MAP_UNARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_unary_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_binary
+
+struct ggml_tensor * ggml_map_binary_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun,
+        bool inplace) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_MAP_BINARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_binary_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_set_param(
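The new map ops above stash a caller-supplied function pointer in the raw bytes of a small I32 tensor, writing and reading it back through the same double cast. A standalone sketch of that round-trip with a plain buffer standing in for `addr_tensor->data` (the names `square`/`storage` are illustrative):

```c
#include <stdio.h>

typedef float (*unary_f32_t)(float);

static float square(float x) { return x * x; }

int main(void) {
    // Stand-in for the tensor's data bytes; aligned for a function pointer.
    _Alignas(void (*)(void)) char storage[sizeof(void (*)(void))];
    *((void (**)(void)) storage) = (void (*)(void)) square;       // stash
    unary_f32_t fun = (unary_f32_t) *((void (**)(void)) storage); // recover
    printf("%f\n", (double) fun(3.0f)); // 9.000000
    return 0;
}
```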
@@ -4815,102 +5032,191 @@ static void ggml_compute_forward_dup_f16(
     const struct ggml_tensor * src0,
     struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }
 
-    if (src0->
-
-
-
+    if (src0->type == dst->type &&
+        src0->ne[0] == dst->ne[0]  &&
+        src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
 
-
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
-                const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                char * dst_ptr = (char *) dst->data + id*rs;
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
 
-
+    if (ggml_is_contiguous(dst)) {
+        if (src0->nb[0] == sizeof(ggml_fp16_t)) {
+            if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                const size_t rs = ne00*nb00;
 
-
-
-
-
-
-        size_t id = 0;
-        float * dst_ptr = (float *) dst->data;
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            char * dst_ptr = (char *) dst->data + id*rs;
 
-
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
-                for (int i00 = 0; i00 < ne00; i00++) {
-                    const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                            memcpy(dst_ptr, src0_ptr, rs);
 
-                    dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
                             id++;
                         }
                     }
                 }
+            } else if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
             }
         } else {
-
-    }
-} else {
-    //printf("%s: this is not optimal - fix me\n", __func__);
+            //printf("%s: this is not optimal - fix me\n", __func__);
 
-
-
-
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
 
-
-
-
-
-
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
-
-
+                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
                         }
                     }
                 }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
             }
-}
-
-
-
-for (int i03 = 0; i03 < ne03; i03++) {
-    for (int i02 = 0; i02 < ne02; i02++) {
-        for (int i01 = 0; i01 < ne01; i01++) {
-            for (int i00 = 0; i00 < ne00; i00++) {
-                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+        }
+        return;
+    }
 
-
-
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
                         }
                     }
                 }
             }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
+        }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
     }
 }
 
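A note on the "dst counters" in the rewritten dup above: `i10..i13` walk the destination in row-major order, each counter rolling over into the next like an odometer, which lets the loop write into a destination whose strides differ from the source's. A small standalone model of that rollover:

```c
#include <stdio.h>

int main(void) {
    const int ne00 = 2, ne01 = 3; // two destination extents
    int i10 = 0, i11 = 0;
    for (int step = 0; step < ne00*ne01; step++) {
        printf("(%d,%d) ", i10, i11);
        if (++i10 == ne00) {      // roll over into the next counter
            i10 = 0;
            if (++i11 == ne01) {
                i11 = 0;
            }
        }
    }
    printf("\n"); // (0,0) (1,0) (0,1) (1,1) (0,2) (1,2)
    return 0;
}
```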
@@ -4919,102 +5225,191 @@ static void ggml_compute_forward_dup_f32(
     const struct ggml_tensor * src0,
     struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }
 
-    if (src0->
-
-
-
-
-
-
-
-
-
-
-
-            id++;
-        }
+    if (src0->type == dst->type &&
+        src0->ne[0] == dst->ne[0]  &&
+        src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
                 }
             }
-    }
-
-
+        }
+        return;
+    }
+
+    if (ggml_is_contiguous(dst)) {
+        // TODO: simplify
+        if (src0->nb[0] == sizeof(float)) {
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                const size_t rs = ne00*nb00;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            char * dst_ptr = (char *) dst->data + id*rs;
 
-
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
-                for (int i00 = 0; i00 < ne00; i00++) {
-                    const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                            memcpy(dst_ptr, src0_ptr, rs);
 
-                    dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
                             id++;
                         }
                     }
                 }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
             }
         } else {
-
-    }
-} else {
-    //printf("%s: this is not optimal - fix me\n", __func__);
+            //printf("%s: this is not optimal - fix me\n", __func__);
 
-
-
-
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
 
-
-
-
-
-
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
-
-
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
                         }
                     }
                 }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
             }
-        }
-        size_t id = 0;
-        ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+        }
 
-
|
5004
|
-
|
5005
|
-
for (int i01 = 0; i01 < ne01; i01++) {
|
5006
|
-
for (int i00 = 0; i00 < ne00; i00++) {
|
5007
|
-
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
5350
|
+
return;
|
5351
|
+
}
|
5008
5352
|
|
5009
|
-
|
5010
|
-
|
5353
|
+
// dst counters
|
5354
|
+
int64_t i10 = 0;
|
5355
|
+
int64_t i11 = 0;
|
5356
|
+
int64_t i12 = 0;
|
5357
|
+
int64_t i13 = 0;
|
5358
|
+
|
5359
|
+
if (dst->type == GGML_TYPE_F32) {
|
5360
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
5361
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
5362
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
5363
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
5364
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
5365
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
5366
|
+
|
5367
|
+
memcpy(dst_ptr, src0_ptr, sizeof(float));
|
5368
|
+
|
5369
|
+
if (++i10 == dst->ne[0]) {
|
5370
|
+
i10 = 0;
|
5371
|
+
if (++i11 == dst->ne[1]) {
|
5372
|
+
i11 = 0;
|
5373
|
+
if (++i12 == dst->ne[2]) {
|
5374
|
+
i12 = 0;
|
5375
|
+
if (++i13 == dst->ne[3]) {
|
5376
|
+
i13 = 0;
|
5377
|
+
}
|
5378
|
+
}
|
5379
|
+
}
|
5380
|
+
}
|
5381
|
+
}
|
5382
|
+
}
|
5383
|
+
}
|
5384
|
+
}
|
5385
|
+
} else if (dst->type == GGML_TYPE_F16) {
|
5386
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
5387
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
5388
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
5389
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
5390
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
5391
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
5392
|
+
|
5393
|
+
*(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
|
5394
|
+
|
5395
|
+
if (++i10 == dst->ne[0]) {
|
5396
|
+
i10 = 0;
|
5397
|
+
if (++i11 == dst->ne[1]) {
|
5398
|
+
i11 = 0;
|
5399
|
+
if (++i12 == dst->ne[2]) {
|
5400
|
+
i12 = 0;
|
5401
|
+
if (++i13 == dst->ne[3]) {
|
5402
|
+
i13 = 0;
|
5403
|
+
}
|
5404
|
+
}
|
5405
|
+
}
|
5011
5406
|
}
|
5012
5407
|
}
|
5013
5408
|
}
|
5014
5409
|
}
|
5015
|
-
} else {
|
5016
|
-
GGML_ASSERT(false); // TODO: implement
|
5017
5410
|
}
|
5411
|
+
} else {
|
5412
|
+
GGML_ASSERT(false); // TODO: implement
|
5018
5413
|
}
|
5019
5414
|
}
|
5020
5415
|
|
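Note: the non-contiguous `dup` paths above walk src0 in its natural i03..i00 order while a second set of flattened counters (i10..i13) tracks the write position in dst, carrying over like odometer digits whenever a dimension wraps. A minimal standalone sketch of the pattern (shape and names here are illustrative, not part of ggml):

```c
#include <stdint.h>
#include <stdio.h>

// Odometer-style index propagation, as used by the dup kernels above:
// after each element, advance the fastest-moving dst counter and carry
// into the slower dimensions when it wraps.
int main(void) {
    const int64_t ne[4] = {2, 3, 1, 1};          // hypothetical dst shape
    int64_t i10 = 0, i11 = 0, i12 = 0, i13 = 0;  // dst counters

    for (int64_t n = 0; n < ne[0]*ne[1]*ne[2]*ne[3]; n++) {
        printf("dst element (%lld, %lld, %lld, %lld)\n",
               (long long) i10, (long long) i11, (long long) i12, (long long) i13);

        if (++i10 == ne[0]) {
            i10 = 0;
            if (++i11 == ne[1]) {
                i11 = 0;
                if (++i12 == ne[2]) {
                    i12 = 0;
                    if (++i13 == ne[3]) {
                        i13 = 0;
                    }
                }
            }
        }
    }
    return 0;
}
```

The payoff over recomputing indices with division/modulo per element is that each step costs only a few compares and increments.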
@@ -5075,14 +5470,18 @@ static void ggml_compute_forward_add_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (nb10 == sizeof(float)) {
-        const int j0 = (n/nth)*ith;
-        const int j1 = ith == nth - 1 ? n : j0 + (n/nth);
-
-        for (int j = j0; j < j1; j++) {
+        for (int j = ith; j < n; j += nth) {
+#ifdef GGML_USE_ACCELERATE
+            vDSP_vadd(
+                    (float *) ((char *) src0->data + j*nb01), 1,
+                    (float *) ((char *) src1->data + j*nb11), 1,
+                    (float *) ((char *) dst->data  + j*nb1),  1, nc);
+#else
             ggml_vec_add_f32(nc,
                     (float *) ((char *) dst->data  + j*nb1),
                     (float *) ((char *) src0->data + j*nb01),
                     (float *) ((char *) src1->data + j*nb11));
+#endif
         }
     } else {
         // src1 is not contiguous
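With GGML_USE_ACCELERATE defined, the row-wise add is routed to Apple's vDSP instead of ggml_vec_add_f32; both compute dst[j] = src0[j] + src1[j] over one row of nc floats (the vDSP_vadd argument order — pointer/stride pairs, then the element count — is the Accelerate framework's). A portable sketch of what one iteration of the loop above does, with illustrative names:

```c
#include <stddef.h>

// Equivalent of one row of the hunk above: add two rows of nc floats.
// This is the work that either ggml_vec_add_f32 or vDSP_vadd performs.
static void row_add_f32(size_t nc, float * dst, const float * a, const float * b) {
    for (size_t i = 0; i < nc; i++) {
        dst[i] = a[i] + b[i];
    }
}
```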
@@ -5389,18 +5788,18 @@ static void ggml_compute_forward_sum_f32(
     assert(ggml_is_scalar(dst));
     assert(src0->nb[0] == sizeof(float));
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-    for (int i03 = 0; i03 < ne03; i03++) {
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 ggml_vec_sum_f32(ne00,
                         (float *) (dst->data),
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5445,19 +5844,19 @@ static void ggml_compute_forward_mean_f32(
 
     assert(src0->nb[0] == sizeof(float));
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
 
     assert(ne0 == 1);
     assert(ne1 == ne01);
@@ -5473,9 +5872,9 @@ static void ggml_compute_forward_mean_f32(
     const size_t nb2 = dst->nb[2];
     const size_t nb3 = dst->nb[3];
 
-    for (int i03 = 0; i03 < ne03; i03++) {
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 ggml_vec_sum_f32(ne00,
                         (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5962,10 +6361,10 @@ static void ggml_compute_forward_norm_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
@@ -5978,13 +6377,13 @@ static void ggml_compute_forward_norm_f32(
     const float eps = 1e-5f; // TODO: make this a parameter
 
     // TODO: optimize
-    for (int i03 = 0; i03 < ne03; i03++) {
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = ith; i01 < ne01; i01 += nth) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                 const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 
                 ggml_float sum = 0.0;
-                for (int i00 = 0; i00 < ne00; i00++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     sum += (ggml_float)x[i00];
                 }
 
@@ -5993,7 +6392,7 @@ static void ggml_compute_forward_norm_f32(
                 float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
 
                 ggml_float sum2 = 0.0;
-                for (int i00 = 0; i00 < ne00; i00++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
                     sum2 += (ggml_float)(v*v);
@@ -6045,10 +6444,10 @@ static void ggml_compute_forward_rms_norm_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
@@ -6061,13 +6460,13 @@ static void ggml_compute_forward_rms_norm_f32(
     const float eps = 1e-6f; // TODO: make this a parameter
 
     // TODO: optimize
-    for (int i03 = 0; i03 < ne03; i03++) {
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = ith; i01 < ne01; i01 += nth) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                 const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 
                 ggml_float sum = 0.0;
-                for (int i00 = 0; i00 < ne00; i00++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     sum += (ggml_float)(x[i00] * x[i00]);
                 }
 
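The recurring `const int` → `const int64_t` edits in these hunks all serve one purpose: element counts (the `ne` values) and the loop indices derived from them can exceed the 32-bit range once a tensor grows past ~2^31 elements, and products such as `i01*nb01` overflow even earlier. A small illustration of the failure mode being avoided (the shape is hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // A hypothetical 100000 x 100000 tensor: 10^10 elements.
    const int64_t ne0 = 100000, ne1 = 100000;

    int     n32 = (int) (ne0*ne1);  // does not fit in 32 bits: value is mangled
    int64_t n64 =        ne0*ne1;   // exact

    printf("as int: %d, as int64_t: %lld\n", n32, (long long) n64);
    return 0;
}
```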
@@ -6120,13 +6519,13 @@ static bool ggml_compute_forward_mul_mat_use_blas(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    //const int ne00 = src0->ne[0];
-    //const int ne01 = src0->ne[1];
+    //const int64_t ne00 = src0->ne[0];
+    //const int64_t ne01 = src0->ne[1];
 
-    const int ne10 = src1->ne[0];
+    const int64_t ne10 = src1->ne[0];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
@@ -6148,23 +6547,23 @@ static void ggml_compute_forward_mul_mat_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    const int ne10 = src1->ne[0];
+    const int64_t ne10 = src1->ne[0];
 #endif
-    const int ne11 = src1->ne[1];
+    const int64_t ne11 = src1->ne[1];
 #ifndef NDEBUG
-    const int ne12 = src1->ne[2];
-    const int ne13 = src1->ne[3];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
 
     const int nb00 = src0->nb[0];
 #endif
@@ -6224,8 +6623,8 @@ static void ggml_compute_forward_mul_mat_f32(
         return;
     }
 
-    for (int i03 = 0; i03 < ne03; i03++) {
-        for (int i02 = 0; i02 < ne02; i02++) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
             const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
             const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -6235,7 +6634,7 @@ static void ggml_compute_forward_mul_mat_f32(
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                     ne11, ne01, ne10,
                     1.0f,    y, ne10,
-                             x, ne10,
+                             x, ne00,
                     0.0f,    d, ne01);
         }
     }
@@ -6272,7 +6671,7 @@ static void ggml_compute_forward_mul_mat_f32(
         const int i02 = (ir - i03*ne02*ne01)/ne01;
         const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
 
-        for (int ic = 0; ic < ne11; ++ic) {
+        for (int64_t ic = 0; ic < ne11; ++ic) {
             // src1 indices
             const int i13 = i03;
             const int i12 = i02;
@@ -6313,21 +6712,21 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
-    const int ne10 = src1->ne[0];
-    const int ne11 = src1->ne[1];
-    const int ne12 = src1->ne[2];
-    const int ne13 = src1->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+    //const int64_t ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -6387,12 +6786,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
         float * const wdata = params->wdata;
 
-        for (int i03 = 0; i03 < ne03; i03++) {
-            for (int i02 = 0; i02 < ne02; i02++) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
                 {
                     size_t id = 0;
-                    for (int i01 = 0; i01 < ne01; ++i01) {
-                        for (int i00 = 0; i00 < ne00; ++i00) {
+                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
+                        for (int64_t i00 = 0; i00 < ne00; ++i00) {
                             wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                         }
                     }
@@ -6407,7 +6806,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                     ne11, ne01, ne10,
                     1.0f,    y, ne10,
-                             x, ne10,
+                             x, ne00,
                     0.0f,    d, ne01);
         }
     }
@@ -6422,10 +6821,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         ggml_fp16_t * const wdata = params->wdata;
 
         size_t id = 0;
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    for (int i10 = 0; i10 < ne10; ++i10) {
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    for (int64_t i10 = 0; i10 < ne10; ++i10) {
                         wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
                     }
                 }
@@ -6477,7 +6876,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
         float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
 
-        for (int ic = 0; ic < ne11; ++ic) {
+        for (int64_t ic = 0; ic < ne11; ++ic) {
             ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
         }
     }
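For orientation in the BLAS branches above: the call multiplies A = y (ne11 x ne10) by B = x (ne01 x ne00) transposed into C = d (ne11 x ne01); the leading dimension passed for x is now spelled ne00 (the row length of src0), which for a valid mul_mat equals ne10 but matches the matrix it describes. A naive reference with the same index conventions (a sketch, not the library's API):

```c
#include <stdint.h>

// Reference for the sgemm call above: d[i][j] = sum_k y[i][k] * x[j][k],
// with y: ne11 x ne10, x: ne01 x ne00 (ne00 == ne10), d: ne11 x ne01.
static void mul_mat_ref(int64_t ne11, int64_t ne01, int64_t ne10,
                        const float * y, const float * x, float * d) {
    for (int64_t i = 0; i < ne11; i++) {
        for (int64_t j = 0; j < ne01; j++) {
            float sum = 0.0f;
            for (int64_t k = 0; k < ne10; k++) {
                sum += y[i*ne10 + k] * x[j*ne10 + k];  // x used transposed
            }
            d[i*ne01 + j] = sum;
        }
    }
}
```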
@@ -6495,29 +6894,27 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
-typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
-typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
-
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    vec_dot_q_t        vec_dot_q;
-} quantize_fns_t;
-
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q = dequantize_row_q4_0,
-        .quantize_row_q   = quantize_row_q4_0,
-        .vec_dot_q        = ggml_vec_dot_q4_0,
+        .dequantize_row_q         = dequantize_row_q4_0,
+        .quantize_row_q           = quantize_row_q4_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_0,
     },
     [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q = dequantize_row_q4_1,
-        .quantize_row_q   = quantize_row_q4_1,
-        .vec_dot_q        = ggml_vec_dot_q4_1,
+        .dequantize_row_q         = dequantize_row_q4_1,
+        .quantize_row_q           = quantize_row_q4_1,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_1,
     },
 };
 
+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+    GGML_ASSERT(i < GGML_TYPE_COUNT);
+    return quantize_fns[i];
+}
+
 static void ggml_compute_forward_mul_mat_q_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
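quantize_fns is a per-type dispatch table: the quantized kernels look up the dequantize/quantize/dot routines for GGML_TYPE_Q4_0 or GGML_TYPE_Q4_1 instead of branching on the type inline, and the new ggml_internal_get_quantize_fn exposes the table so tests can exercise each entry (the removed typedefs now live in the header, which is why the table still compiles here). A hedged usage sketch, assuming only what the hunk shows; `row`, `out`, and `k` are illustrative:

```c
// Sketch: dequantize one row of a Q4_0 tensor through the dispatch table.
void dequantize_row_example(const void * row, float * out, int k) {
    quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
    fns.dequantize_row_q(row, out, k);
}
```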
@@ -6526,20 +6923,20 @@ static void ggml_compute_forward_mul_mat_q_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
-    const int ne10 = src1->ne[0];
-    const int ne11 = src1->ne[1];
-    const int ne12 = src1->ne[2];
-    const int ne13 = src1->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -6603,11 +7000,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 
-        for (int i03 = 0; i03 < ne03; i03++) {
-            for (int i02 = 0; i02 < ne02; i02++) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
                 {
                     size_t id = 0;
-                    for (int i01 = 0; i01 < ne01; ++i01) {
+                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
                         dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }
@@ -6622,7 +7019,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                     ne11, ne01, ne10,
                     1.0f,    y, ne10,
-                             x, ne10,
+                             x, ne00,
                     0.0f,    d, ne01);
         }
     }
@@ -6637,9 +7034,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
         char * wdata = params->wdata;
         const size_t row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type];
 
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
                     quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
                     wdata += row_size;
                 }
@@ -6688,7 +7085,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
         assert(ne00 % 32 == 0);
 
-        for (int ic = 0; ic < ne11; ++ic) {
+        for (int64_t ic = 0; ic < ne11; ++ic) {
             vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
         }
     }
@@ -6832,6 +7229,15 @@ static void ggml_compute_forward_cpy(
     ggml_compute_forward_dup(params, src0, dst);
 }
 
+// ggml_compute_forward_cont
+
+static void ggml_compute_forward_cont(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, src0, dst);
+}
+
 // ggml_compute_forward_reshape
 
 static void ggml_compute_forward_reshape(
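The new GGML_OP_CONT op reuses the dup kernel: its forward pass simply copies src0 into a contiguous dst, which is exactly what ggml_compute_forward_dup now handles even for non-contiguous destinations. It exists so strided views (transposes, permutes) can be materialized before ops that require contiguous rows; the backward pass further down uses it for the transposed src0 in mul_mat's gradient. A usage sketch, assuming the ggml_cont constructor this release adds to the header (ctx, a, b set up elsewhere):

```c
// Sketch: make a transposed view contiguous before multiplying.
struct ggml_tensor * t  = ggml_transpose(ctx, a);    // strided view only
struct ggml_tensor * tc = ggml_cont(ctx, t);         // GGML_OP_CONT: real copy
struct ggml_tensor * y  = ggml_mul_mat(ctx, tc, b);  // safe to feed onward
```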
@@ -7169,7 +7575,6 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
     assert(ggml_nelements(src1) == 3);
 
@@ -7181,10 +7586,10 @@ static void ggml_compute_forward_rope_f32(
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
 
-    //const int ne0 = src0->ne[0];
-    const int ne1 = src0->ne[1];
-    const int ne2 = src0->ne[2];
-    const int ne3 = src0->ne[3];
+    //const int64_t ne0 = src0->ne[0];
+    const int64_t ne1 = src0->ne[1];
+    const int64_t ne2 = src0->ne[2];
+    const int64_t ne3 = src0->ne[3];
 
     const int nb0 = src0->nb[0];
     const int nb1 = src0->nb[1];
@@ -7196,16 +7601,37 @@ static void ggml_compute_forward_rope_f32(
 
     assert(nb0 == sizeof(float));
 
-    // TODO: optimize
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
-            for (int i1 = 0; i1 < ne1; i1++) {
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                float theta = (float)p;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float theta = powf(10000.0, ((float)-i0)/n_dims);
+                    const float cos_theta = cosf(theta);
+                    const float sin_theta = sinf(theta);
 
-                    const float cos_theta = cosf(p*theta);
-                    const float sin_theta = sinf(p*theta);
+                    theta *= theta_scale;
 
                     const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                     float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -7226,7 +7652,6 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
     assert(ggml_nelements(src1) == 3);
 
@@ -7238,10 +7663,10 @@ static void ggml_compute_forward_rope_f16(
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
 
-    //const int ne0 = src0->ne[0];
-    const int ne1 = src0->ne[1];
-    const int ne2 = src0->ne[2];
-    const int ne3 = src0->ne[3];
+    //const int64_t ne0 = src0->ne[0];
+    const int64_t ne1 = src0->ne[1];
+    const int64_t ne2 = src0->ne[2];
+    const int64_t ne3 = src0->ne[3];
 
     const int nb0 = src0->nb[0];
     const int nb1 = src0->nb[1];
@@ -7253,15 +7678,37 @@ static void ggml_compute_forward_rope_f16(
 
     assert(nb0 == sizeof(ggml_fp16_t));
 
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
-            for (int i1 = 0; i1 < ne1; i1++) {
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                float theta = (float)p;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float theta = powf(10000.0, ((float)-i0)/n_dims);
+                    const float cos_theta = cosf(theta);
+                    const float sin_theta = sinf(theta);
 
-                    const float cos_theta = cosf(p*theta);
-                    const float sin_theta = sinf(p*theta);
+                    theta *= theta_scale;
 
                     const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                     ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
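Two independent improvements land in the rope hunks. First, the single-thread assert is gone: rows are now split across workers with the dr/ir0/ir1 arithmetic, each thread skipping rows outside its [ir0, ir1) window. Second, the per-dimension-pair powf is replaced by one powf for theta_scale plus a running multiply, since theta for pair i0+2 equals theta(i0) * theta_scale. A compact sketch of both ideas with illustrative parameters:

```c
#include <math.h>

// Sketch of the rope loop structure after this change: nr rows are split
// into nth contiguous chunks, and theta advances geometrically instead of
// calling powf once per dimension pair.
void rope_rows_sketch(int nr, int ith, int nth, int p, int n_dims) {
    const int dr  = (nr + nth - 1)/nth;  // rows per thread (ceiling)
    const int ir0 = dr*ith;
    const int ir1 = (ir0 + dr < nr) ? ir0 + dr : nr;

    const float theta_scale = powf(10000.0f, -2.0f/n_dims);

    for (int ir = ir0; ir < ir1; ir++) {
        float theta = (float) p;
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);
            (void) cos_theta; (void) sin_theta;  // rotate the (i0, i0+1) pair here
            theta *= theta_scale;                // == p * 10000^(-(i0+2)/n_dims) next
        }
    }
}
```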
@@ -7317,21 +7764,21 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    //const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    //const int64_t ne03 = src0->ne[3];
 
-    const int ne10 = src1->ne[0];
-    const int ne11 = src1->ne[1];
-    //const int ne12 = src1->ne[2];
-    //const int ne13 = src1->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    //const int64_t ne12 = src1->ne[2];
+    //const int64_t ne13 = src1->ne[3];
 
-    //const int ne0 = dst->ne[0];
-    //const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
+    //const int64_t ne0 = dst->ne[0];
+    //const int64_t ne1 = dst->ne[1];
+    //const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
+    //const int64_t ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -7368,11 +7815,11 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     {
         ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
                 ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
-                for (int i00 = 0; i00 < ne00; i00++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     dst_data[i00*ew0 + i01] = src[i00];
                 }
             }
@@ -7383,10 +7830,10 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     {
         ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
 
-        for (int i11 = 0; i11 < ne11; i11++) {
+        for (int64_t i11 = 0; i11 < ne11; i11++) {
             const float * const src = (float *)((char *) src1->data + i11*nb11);
             ggml_fp16_t * dst_data = wdata;
-            for (int i10 = 0; i10 < ne10; i10++) {
+            for (int64_t i10 = 0; i10 < ne10; i10++) {
                 dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
             }
         }
@@ -7411,7 +7858,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int i0 = 0; i0 < ne10; ++i0) {
+        for (int64_t i0 = 0; i0 < ne10; ++i0) {
             dst_data[i0] = 0;
             for (int k = -nh; k <= nh; k++) {
                 float v = 0.0f;
@@ -7437,21 +7884,21 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    //const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    //const int64_t ne03 = src0->ne[3];
 
-    const int ne10 = src1->ne[0];
-    const int ne11 = src1->ne[1];
-    //const int ne12 = src1->ne[2];
-    //const int ne13 = src1->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    //const int64_t ne12 = src1->ne[2];
+    //const int64_t ne13 = src1->ne[3];
 
-    //const int ne0 = dst->ne[0];
-    //const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
+    //const int64_t ne0 = dst->ne[0];
+    //const int64_t ne1 = dst->ne[1];
+    //const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
+    //const int64_t ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -7488,11 +7935,11 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     {
         float * const wdata = (float *) params->wdata + 0;
 
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
                 float * dst_data = wdata + i02*ew0*ne00;
-                for (int i00 = 0; i00 < ne00; i00++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     dst_data[i00*ew0 + i01] = src[i00];
                 }
             }
@@ -7503,10 +7950,10 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     {
         float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
 
-        for (int i11 = 0; i11 < ne11; i11++) {
+        for (int64_t i11 = 0; i11 < ne11; i11++) {
            const float * const src = (float *)((char *) src1->data + i11*nb11);
             float * dst_data = wdata;
-            for (int i10 = 0; i10 < ne10; i10++) {
+            for (int64_t i10 = 0; i10 < ne10; i10++) {
                 dst_data[(i10 + nh)*ew0 + i11] = src[i10];
             }
         }
@@ -7531,7 +7978,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int i0 = 0; i0 < ne10; ++i0) {
+        for (int64_t i0 = 0; i0 < ne10; ++i0) {
             dst_data[i0] = 0;
             for (int k = -nh; k <= nh; k++) {
                 float v = 0.0f;
@@ -7585,21 +8032,21 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    //const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    //const int64_t ne03 = src0->ne[3];
 
-    const int ne10 = src1->ne[0];
-    const int ne11 = src1->ne[1];
-    //const int ne12 = src1->ne[2];
-    //const int ne13 = src1->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    //const int64_t ne12 = src1->ne[2];
+    //const int64_t ne13 = src1->ne[3];
 
-    //const int ne0 = dst->ne[0];
-    //const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
+    //const int64_t ne0 = dst->ne[0];
+    //const int64_t ne1 = dst->ne[1];
+    //const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
+    //const int64_t ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -7636,11 +8083,11 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     {
         ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
                 ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
-                for (int i00 = 0; i00 < ne00; i00++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     dst_data[i00*ew0 + i01] = src[i00];
                 }
             }
@@ -7651,10 +8098,10 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     {
         ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
 
-        for (int i11 = 0; i11 < ne11; i11++) {
            const float * const src = (float *)((char *) src1->data + i11*nb11);
+        for (int64_t i11 = 0; i11 < ne11; i11++) {
             ggml_fp16_t * dst_data = wdata;
-            for (int i10 = 0; i10 < ne10; i10++) {
+            for (int64_t i10 = 0; i10 < ne10; i10++) {
                 dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
             }
         }
@@ -7679,7 +8126,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int i0 = 0; i0 < ne10; i0 += 2) {
+        for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
             dst_data[i0/2] = 0;
             for (int k = -nh; k <= nh; k++) {
                 float v = 0.0f;
@@ -7705,21 +8152,21 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    //const int ne03 = src0->ne[3];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    //const int64_t ne03 = src0->ne[3];
 
-    const int ne10 = src1->ne[0];
-    const int ne11 = src1->ne[1];
-    //const int ne12 = src1->ne[2];
-    //const int ne13 = src1->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    //const int64_t ne12 = src1->ne[2];
+    //const int64_t ne13 = src1->ne[3];
 
-    //const int ne0 = dst->ne[0];
-    //const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
+    //const int64_t ne0 = dst->ne[0];
+    //const int64_t ne1 = dst->ne[1];
+    //const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
+    //const int64_t ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -7756,11 +8203,11 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     {
         float * const wdata = (float *) params->wdata + 0;
 
-        for (int i02 = 0; i02 < ne02; i02++) {
-            for (int i01 = 0; i01 < ne01; i01++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
                 float * dst_data = wdata + i02*ew0*ne00;
-                for (int i00 = 0; i00 < ne00; i00++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     dst_data[i00*ew0 + i01] = src[i00];
                 }
             }
@@ -7771,10 +8218,10 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     {
         float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
 
-        for (int i11 = 0; i11 < ne11; i11++) {
+        for (int64_t i11 = 0; i11 < ne11; i11++) {
             const float * const src = (float *)((char *) src1->data + i11*nb11);
             float * dst_data = wdata;
-            for (int i10 = 0; i10 < ne10; i10++) {
+            for (int64_t i10 = 0; i10 < ne10; i10++) {
                 dst_data[(i10 + nh)*ew0 + i11] = src[i10];
             }
         }
@@ -7799,7 +8246,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
 
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int i0 = 0; i0 < ne10; i0 += 2) {
+        for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
            dst_data[i0/2] = 0;
             for (int k = -nh; k <= nh; k++) {
                 float v = 0.0f;
@@ -7851,25 +8298,25 @@ static void ggml_compute_forward_flash_attn_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int neq0 = q->ne[0];
-    const int neq1 = q->ne[1];
-    const int neq2 = q->ne[2];
-    const int neq3 = q->ne[3];
+    const int64_t neq0 = q->ne[0];
+    const int64_t neq1 = q->ne[1];
+    const int64_t neq2 = q->ne[2];
+    const int64_t neq3 = q->ne[3];
 
-    const int nek0 = k->ne[0];
-    const int nek1 = k->ne[1];
-    //const int nek2 = k->ne[2];
-    //const int nek3 = k->ne[3];
+    const int64_t nek0 = k->ne[0];
+    const int64_t nek1 = k->ne[1];
+    //const int64_t nek2 = k->ne[2];
+    //const int64_t nek3 = k->ne[3];
 
-    //const int nev0 = v->ne[0];
-    const int nev1 = v->ne[1];
-    //const int nev2 = v->ne[2];
-    //const int nev3 = v->ne[3];
+    //const int64_t nev0 = v->ne[0];
+    const int64_t nev1 = v->ne[1];
+    //const int64_t nev2 = v->ne[2];
+    //const int64_t nev3 = v->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    //const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
 
     const int nbk0 = k->nb[0];
     const int nbk1 = k->nb[1];
@@ -7894,10 +8341,10 @@ static void ggml_compute_forward_flash_attn_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int D = neq0;
-    const int N = neq1;
-    const int P = nek1 - N;
-    const int M = P + N;
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+    const int64_t P = nek1 - N;
+    const int64_t M = P + N;
 
     const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
 
@@ -7959,7 +8406,7 @@ static void ggml_compute_forward_flash_attn_f32(
             S[i] = -INFINITY;
         }
 
-        for (int ic = 0; ic < nek1; ++ic) {
+        for (int64_t ic = 0; ic < nek1; ++ic) {
            // k indices
            const int ik3 = iq3;
            const int ik2 = iq2;
@@ -7978,7 +8425,7 @@ static void ggml_compute_forward_flash_attn_f32(
         ggml_vec_scale_f32(nek1, S, scale);
 
         if (masked) {
-            for (int i = P; i < M; i++) {
+            for (int64_t i = P; i < M; i++) {
                 if (i > P + iq1) {
                     S[i] = -INFINITY;
                 }
@@ -8036,7 +8483,7 @@ static void ggml_compute_forward_flash_attn_f32(
 #endif
         }
 
-        for (int ic = 0; ic < nev1; ++ic) {
+        for (int64_t ic = 0; ic < nev1; ++ic) {
             // dst indices
             const int i1 = iq1;
             const int i2 = iq2;
@@ -8060,25 +8507,25 @@ static void ggml_compute_forward_flash_attn_f16(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int neq0 = q->ne[0];
-    const int neq1 = q->ne[1];
-    const int neq2 = q->ne[2];
-    const int neq3 = q->ne[3];
+    const int64_t neq0 = q->ne[0];
+    const int64_t neq1 = q->ne[1];
+    const int64_t neq2 = q->ne[2];
+    const int64_t neq3 = q->ne[3];
 
-    const int nek0 = k->ne[0];
-    const int nek1 = k->ne[1];
-    //const int nek2 = k->ne[2];
-    //const int nek3 = k->ne[3];
+    const int64_t nek0 = k->ne[0];
+    const int64_t nek1 = k->ne[1];
+    //const int64_t nek2 = k->ne[2];
+    //const int64_t nek3 = k->ne[3];
 
-    //const int nev0 = v->ne[0];
-    const int nev1 = v->ne[1];
-    //const int nev2 = v->ne[2];
-    //const int nev3 = v->ne[3];
+    //const int64_t nev0 = v->ne[0];
+    const int64_t nev1 = v->ne[1];
+    //const int64_t nev2 = v->ne[2];
+    //const int64_t nev3 = v->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    //const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
 
     const int nbk0 = k->nb[0];
     const int nbk1 = k->nb[1];
@@ -8103,10 +8550,10 @@ static void ggml_compute_forward_flash_attn_f16(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int D = neq0;
-    const int N = neq1;
-    const int P = nek1 - N;
-    const int M = P + N;
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+    const int64_t P = nek1 - N;
+    const int64_t M = P + N;
 
     const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
 
@@ -8169,7 +8616,7 @@ static void ggml_compute_forward_flash_attn_f16(
         }
 
         if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-            for (int ic = 0; ic < nek1; ++ic) {
+            for (int64_t ic = 0; ic < nek1; ++ic) {
                 // k indices
                 const int ik3 = iq3;
                 const int ik2 = iq2;
@@ -8184,7 +8631,7 @@ static void ggml_compute_forward_flash_attn_f16(
                         (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
             }
         } else {
-            for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
+            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
                // k indices
                const int ik3 = iq3;
                const int ik2 = iq2;
@@ -8204,7 +8651,7 @@ static void ggml_compute_forward_flash_attn_f16(
         ggml_vec_scale_f32(nek1, S, scale);
 
         if (masked) {
-            for (int i = P; i < M; i++) {
+            for (int64_t i = P; i < M; i++) {
                 if (i > P + iq1) {
                     S[i] = -INFINITY;
                 }
@@ -8264,12 +8711,12 @@ static void ggml_compute_forward_flash_attn_f16(
 
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
 
-        for (int i = 0; i < M; i++) {
+        for (int64_t i = 0; i < M; i++) {
            S16[i] = GGML_FP32_TO_FP16(S[i]);
        }
 
         if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
-            for (int ic = 0; ic < nev1; ++ic) {
+            for (int64_t ic = 0; ic < nev1; ++ic) {
                 // dst indices
                const int i1 = iq1;
                const int i2 = iq2;
@@ -8281,7 +8728,7 @@ static void ggml_compute_forward_flash_attn_f16(
                         S16);
             }
         } else {
-            for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
+            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
                 // dst indices
                 const int i1 = iq1;
                 const int i2 = iq2;
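For orientation in the flash-attention hunks: D = neq0 is the head dimension, N = neq1 the number of query rows, P = nek1 - N the number of previously cached key rows, and M = P + N the total attention span; the masked branch writes -INFINITY beyond P + iq1 so a query cannot attend to later positions. A sketch restating the size relations with the names from the hunks:

```c
#include <stdint.h>
#include <assert.h>

// The size relations used by the flash-attention kernels above.
static void attn_dims(int64_t neq0, int64_t neq1, int64_t nek1) {
    const int64_t D = neq0;      // head dimension
    const int64_t N = neq1;      // new (query) positions
    const int64_t P = nek1 - N;  // previously cached positions
    const int64_t M = P + N;     // total attended positions
    assert(M == nek1);
    (void) D;
}
```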
@@ -8337,35 +8784,35 @@ static void ggml_compute_forward_flash_ff_f16(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int nea0 = a->ne[0];
-    const int nea1 = a->ne[1];
-    const int nea2 = a->ne[2];
-    const int nea3 = a->ne[3];
+    const int64_t nea0 = a->ne[0];
+    const int64_t nea1 = a->ne[1];
+    const int64_t nea2 = a->ne[2];
+    const int64_t nea3 = a->ne[3];
 
-    const int neb00 = b0->ne[0];
-    const int neb01 = b0->ne[1];
-    //const int neb02 = b0->ne[2];
-    //const int neb03 = b0->ne[3];
+    const int64_t neb00 = b0->ne[0];
+    const int64_t neb01 = b0->ne[1];
+    //const int64_t neb02 = b0->ne[2];
+    //const int64_t neb03 = b0->ne[3];
 
-    const int neb10 = b1->ne[0];
-    const int neb11 = b1->ne[1];
-    //const int neb12 = b1->ne[2];
-    //const int neb13 = b1->ne[3];
+    const int64_t neb10 = b1->ne[0];
+    const int64_t neb11 = b1->ne[1];
+    //const int64_t neb12 = b1->ne[2];
+    //const int64_t neb13 = b1->ne[3];
 
-    const int nec00 = c0->ne[0];
-    const int nec01 = c0->ne[1];
-    //const int nec02 = c0->ne[2];
-    //const int nec03 = c0->ne[3];
+    const int64_t nec00 = c0->ne[0];
+    const int64_t nec01 = c0->ne[1];
+    //const int64_t nec02 = c0->ne[2];
+    //const int64_t nec03 = c0->ne[3];
 
-    const int nec10 = c1->ne[0];
-    const int nec11 = c1->ne[1];
-    //const int nec12 = c1->ne[2];
-    //const int nec13 = c1->ne[3];
+    const int64_t nec10 = c1->ne[0];
+    const int64_t nec11 = c1->ne[1];
+    //const int64_t nec12 = c1->ne[2];
+    //const int64_t nec13 = c1->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
 
     const int nba0 = a->nb[0];
     const int nba1 = a->nb[1];
@@ -8400,9 +8847,9 @@ static void ggml_compute_forward_flash_ff_f16(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int D = nea0;
-    //const int N = nea1;
-    const int M = neb01;
+    const int64_t D = nea0;
+    //const int64_t N = nea1;
+    const int64_t M = neb01;
 
     GGML_ASSERT(ne0 == nea0);
     GGML_ASSERT(ne1 == nea1);
@@ -8458,7 +8905,7 @@ static void ggml_compute_forward_flash_ff_f16(
 
         float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
 
-        for (int ic = 0; ic < neb01; ++ic) {
+        for (int64_t ic = 0; ic < neb01; ++ic) {
             // b0 indices
            const int ib03 = ia3;
            const int ib02 = ia2;
@@ -8478,7 +8925,7 @@ static void ggml_compute_forward_flash_ff_f16(
 
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
 
-        for (int i = 0; i < M; i++) {
+        for (int64_t i = 0; i < M; i++) {
             S16[i] = GGML_FP32_TO_FP16(S[i]);
         }
 
@@ -8490,7 +8937,7 @@ static void ggml_compute_forward_flash_ff_f16(
         const int i2 = ia2;
         const int i3 = ia3;
 
-        for (int ic = 0; ic < nec01; ++ic) {
+        for (int64_t ic = 0; ic < nec01; ++ic) {
 
             ggml_vec_dot_f16(neb01,
                     (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
@@ -8535,6 +8982,111 @@ static void ggml_compute_forward_flash_ff(
     }
 }
 
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+    assert(src1->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_binary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 /////////////////////////////////
 
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
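The new map ops let user code run an arbitrary f32 function over tensors without defining a dedicated op: the function pointer travels through tensor->opt[0]->data (see the dispatch hunks below) and is applied row by row. A sketch of a function with the required shape — ggml_unary_op_f32_t receives the row length, a destination row, and a source row; the softplus body here is purely illustrative:

```c
#include <math.h>

// A custom unary op matching the ggml_unary_op_f32_t signature used above:
// y[i] = softplus(x[i]) over one row of n floats.
static void softplus_row_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; i++) {
        y[i] = logf(1.0f + expf(x[i]));
    }
}
```

Such a function would be attached to a graph through the map-op constructors this release adds to the header alongside GGML_OP_MAP_UNARY and GGML_OP_MAP_BINARY.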
@@ -8629,6 +9181,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             {
                 ggml_compute_forward_cpy(params, tensor->src0, tensor);
             } break;
+        case GGML_OP_CONT:
+            {
+                ggml_compute_forward_cont(params, tensor->src0, tensor);
+            } break;
         case GGML_OP_RESHAPE:
             {
                 ggml_compute_forward_reshape(params, tensor->src0, tensor);
@@ -8680,6 +9236,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             {
                 ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
             } break;
+        case GGML_OP_MAP_UNARY:
+            {
+                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_BINARY:
+            {
+                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -8873,8 +9441,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
                 src1->grad =
                     ggml_add_impl(ctx,
                         src1->grad,
-                        // TODO: fix transpose, the node will break the graph connections
-                        ggml_mul_mat(ctx, ggml_transpose(ctx, src0), tensor->grad),
+                        ggml_mul_mat(ctx,
+                            ggml_cont(ctx, ggml_transpose(ctx, src0)),
+                            tensor->grad),
                         inplace);
             }
         } break;
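This backward hunk is the consumer of the new cont op: for y = mul_mat(A, B), the gradient with respect to B is mul_mat(A^T, dy), but ggml_transpose only yields a strided view, and the mul_mat kernels want contiguous rows — so the transpose is materialized first. A comment-annotated restatement of the expression built above (names as in the hunk):

```c
// Gradient of y = mul_mat(src0, src1) w.r.t. src1, as constructed above:
//   d(src1) += mul_mat(cont(transpose(src0)), dy)
// ggml_cont turns the strided transpose view into a real tensor so the
// mul_mat kernels can assume contiguous rows.
src1->grad = ggml_add_impl(ctx,
        src1->grad,
        ggml_mul_mat(ctx,
            ggml_cont(ctx, ggml_transpose(ctx, src0)),
            tensor->grad),
        inplace);
```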
@@ -8886,6 +9455,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CONT:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_RESHAPE:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -8934,6 +9507,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
             {
                 GGML_ASSERT(false); // not supported
             } break;
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+            {
+                GGML_ASSERT(false); // not supported
+            } break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -9024,7 +9602,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|
9024
9602
|
struct ggml_cgraph result = {
|
9025
9603
|
/*.n_nodes =*/ 0,
|
9026
9604
|
/*.n_leafs =*/ 0,
|
9027
|
-
/*.n_threads =*/
|
9605
|
+
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
|
9028
9606
|
/*.work_size =*/ 0,
|
9029
9607
|
/*.work =*/ NULL,
|
9030
9608
|
/*.nodes =*/ { NULL },
|
@@ -9340,6 +9918,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
9340
9918
|
node->n_tasks = n_threads;
|
9341
9919
|
} break;
|
9342
9920
|
case GGML_OP_CPY:
|
9921
|
+
case GGML_OP_CONT:
|
9343
9922
|
case GGML_OP_RESHAPE:
|
9344
9923
|
case GGML_OP_VIEW:
|
9345
9924
|
case GGML_OP_PERMUTE:
|
@@ -9355,7 +9934,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
9355
9934
|
} break;
|
9356
9935
|
case GGML_OP_ROPE:
|
9357
9936
|
{
|
9358
|
-
node->n_tasks =
|
9937
|
+
node->n_tasks = n_threads;
|
9359
9938
|
} break;
|
9360
9939
|
case GGML_OP_CONV_1D_1S:
|
9361
9940
|
case GGML_OP_CONV_1D_2S:
|
@@ -9393,7 +9972,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)

                         size_t cur = 0;

-                        const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
+                        const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);

                         if (node->src1->type == GGML_TYPE_F32) {
                             cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
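
The scratch-size computation relies on ggml_up, which rounds its first
argument up to the next multiple of its second (GGML_SOFT_MAX_UNROLL, a small
power of two), so each task's softmax scratch row is padded to the unroll
width. A one-liner consistent with that contract (a sketch, not the verbatim
definition):

    // Round n up to a multiple of m, where m is assumed a power of two;
    // e.g. for m == 4: 1023 -> 1024, 1024 -> 1024, 1025 -> 1028.
    static inline int64_t ggml_up_sketch(int64_t n, int64_t m) {
        return (n + m - 1) & ~(m - 1);
    }
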
@@ -9425,6 +10004,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)

                         work_size = MAX(work_size, cur);
                     } break;
+                case GGML_OP_MAP_UNARY:
+                case GGML_OP_MAP_BINARY:
+                    {
+                        node->n_tasks = 1;
+                    } break;
                 case GGML_OP_NONE:
                     {
                         node->n_tasks = 1;
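
The user-supplied map callbacks are pinned to a single task because ggml
cannot know whether they are thread-safe, let alone how to split their work.
The forward path these cases feed is essentially a per-row loop; a sketch
under that assumption (not the verbatim kernel):

    static void map_unary_sketch(const struct ggml_tensor * src,
                                 struct ggml_tensor * dst,
                                 ggml_unary_op_f32_t fun) {
        const int64_t n  = ggml_nrows(src);   // number of rows
        const int     nc = (int) src->ne[0];  // elements per row
        for (int64_t i = 0; i < n; i++) {     // single-threaded: n_tasks == 1
            fun(nc,
                (float *) ((char *) dst->data + i*dst->nb[1]),
                (float *) ((char *) src->data + i*src->nb[1]));
        }
    }
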
@@ -9643,8 +10227,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {

     GGML_PRINT("=== GRAPH ===\n");

-    GGML_PRINT_DEBUG("n_threads = %d\n",cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size);
+    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
+    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);

     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -9652,7 +10236,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {

         perf_total_per_op_us[node->op] += node->perf_time_us;

-        GGML_PRINT(" - %3d: [ %d, %d, %d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
+        GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
                 GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
@@ -9666,7 +10250,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];

-        GGML_PRINT(" - %3d: [ %d, %d] %8s\n",
+        GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
                 GGML_OP_LABEL[node->op]);
@@ -9737,7 +10321,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph

         fprintf(fp, " \"%p\" [ \
 style = filled; fillcolor = %s; shape = record; \
-label=\"%d [%d, %d] | <x>%s",
+label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
                 (void *) node, color,
                 i, node->ne[0], node->ne[1],
                 GGML_OP_SYMBOL[node->op]);
@@ -9762,7 +10346,7 @@ label=\"<x>%.1e\"; ]\n",
         } else {
             fprintf(fp, " \"%p\" [ \
 style = filled; fillcolor = %s; shape = record; \
-label=\"<x>CONST %d [%d, %d]\"; ]\n",
+label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
                 (void *) node, color,
                 i, node->ne[0], node->ne[1]);
         }
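
All four print-format changes follow from widening ne[] from int to int64_t:
passing an int64_t where "%d" expects an int is undefined behavior on any
platform where int is 32-bit, so the code switches to the PRId64 macro from
<inttypes.h> (the header this change adds at the top of the file). A
self-contained illustration:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne0 = 4096, ne1 = 32000;
        // "%" PRId64 expands to the right conversion for int64_t on each
        // platform (e.g. "%lld" on Windows, "%ld" on most 64-bit Unixes).
        printf("[ %" PRId64 ", %" PRId64 " ]\n", ne0, ne1);
        return 0;
    }
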
@@ -9826,9 +10410,9 @@ label=\"<x>CONST %d [%d, %d]\"; ]\n",
 static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
-        const int ne = ggml_nelements(ps[p]) ;
+        const int64_t ne = ggml_nelements(ps[p]) ;
         // TODO: add function to set tensor from array
-        for (int j = 0; j < ne; ++j) {
+        for (int64_t j = 0; j < ne; ++j) {
             ggml_set_f32_1d(ps[p], j, x[i++]);
         }
     }
@@ -9837,9 +10421,9 @@ static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const f
 static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
-        const int ne = ggml_nelements(ps[p]) ;
+        const int64_t ne = ggml_nelements(ps[p]) ;
         // TODO: add function to get all elements at once
-        for (int j = 0; j < ne; ++j) {
+        for (int64_t j = 0; j < ne; ++j) {
             x[i++] = ggml_get_f32_1d(ps[p], j);
         }
     }
@@ -9848,9 +10432,9 @@ static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float *
 static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
-        const int ne = ggml_nelements(ps[p]) ;
+        const int64_t ne = ggml_nelements(ps[p]) ;
         // TODO: add function to get all elements at once
-        for (int j = 0; j < ne; ++j) {
+        for (int64_t j = 0; j < ne; ++j) {
             g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
         }
     }
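
The int64_t loop counters in these three optimizer helpers matter because
ggml_nelements now returns int64_t: a tensor may hold more than 2^31 elements,
and an int counter would overflow before reaching ne. Any caller that walks a
tensor element-wise wants the same pattern, e.g. (a sketch, with t some f32
tensor):

    double sum = 0.0;
    const int64_t ne = ggml_nelements(t); // may exceed INT_MAX
    for (int64_t j = 0; j < ne; ++j) {
        sum += ggml_get_f32_1d(t, j);
    }
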