llama_cpp 0.9.1 → 0.9.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
 #include <hbwmalloc.h>
 #endif
 
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
+    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
+
+#include <sys/wait.h>
+
+void ggml_print_backtrace(void) {
+    /*
+    #include <execinfo.h>
+    #include <dlfcn.h>
+
+    void * trace[100];
+
+    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
+
+    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
+    */
+
+    // backtrack_symbols does not show line numbers, use gdb instead
+    char attach[32];
+    snprintf(attach, sizeof(attach), "attach %d", getpid());
+    int pid = fork();
+    if (pid == 0) {
+        execlp("gdb", "gdb", "--batch",
+            "-ex", "set style enabled on",
+            "-ex", attach,
+            "-ex", "bt -frame-info source-and-location",
+            "-ex", "detach",
+            "-ex", "quit",
+            NULL);
+    } else {
+        waitpid(pid, NULL, 0);
+    }
+}
+#else
+void ggml_print_backtrace(void) {
+    // platform not supported
+}
+#endif
+
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
 // floating point type used to accumulate sums
 typedef double ggml_float;
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 //
 // global data
 //
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
+#if defined(__ARM_NEON)
+#if !defined(__aarch64__)
+
+// 64-bit compatibility
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+#endif
+#endif
+
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
 
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ROPE_BACK",
     "ALIBI",
     "CLAMP",
-    "CONV_1D",
-    "CONV_1D_STAGE_0",
-    "CONV_1D_STAGE_1",
     "CONV_TRANSPOSE_1D",
-    "CONV_2D",
-    "CONV_2D_STAGE_0",
-    "CONV_2D_STAGE_1",
+    "IM2COL",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope_back(x)",
     "alibi(x)",
     "clamp(x)",
-    "conv_1d(x)",
-    "conv_1d_stage_0(x)",
-    "conv_1d_stage_1(x)",
     "conv_transpose_1d(x)",
-    "conv_2d(x)",
-    "conv_2d_stage_0(x)",
-    "conv_2d_stage_1(x)",
+    "im2col(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
         p[GGML_OP_GET_ROWS_BACK        ] = true;
         p[GGML_OP_DIAG_MASK_INF        ] = true;
         p[GGML_OP_DIAG_MASK_ZERO       ] = true;
-        p[GGML_OP_CONV_1D              ] = true;
-        p[GGML_OP_CONV_1D_STAGE_0      ] = true;
-        p[GGML_OP_CONV_1D_STAGE_1      ] = true;
         p[GGML_OP_CONV_TRANSPOSE_1D    ] = true;
-        p[GGML_OP_CONV_2D              ] = true;
-        p[GGML_OP_CONV_2D_STAGE_0      ] = true;
-        p[GGML_OP_CONV_2D_STAGE_1      ] = true;
         p[GGML_OP_CONV_TRANSPOSE_2D    ] = true;
         p[GGML_OP_FLASH_ATTN_BACK      ] = true;
         p[GGML_OP_CROSS_ENTROPY_LOSS   ] = true;
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }
 
+// ggml_leaky
+
+struct ggml_tensor * ggml_leaky(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
+}
+
 // ggml_gelu
 
 struct ggml_tensor * ggml_gelu(
@@ -4970,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
+        int                   n_orig_ctx,
         float                 freq_base,
         float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
         float                 xpos_base,
         bool                  xpos_down) {
     GGML_ASSERT(ggml_is_vector(b));
|
@@ -4988,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
|
|
4988
5047
|
|
4989
5048
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
4990
5049
|
|
4991
|
-
int32_t params[
|
4992
|
-
memcpy(params +
|
4993
|
-
memcpy(params +
|
4994
|
-
memcpy(params +
|
4995
|
-
memcpy(params +
|
5050
|
+
int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
|
5051
|
+
memcpy(params + 5, &freq_base, sizeof(float));
|
5052
|
+
memcpy(params + 6, &freq_scale, sizeof(float));
|
5053
|
+
memcpy(params + 7, &ext_factor, sizeof(float));
|
5054
|
+
memcpy(params + 8, &attn_factor, sizeof(float));
|
5055
|
+
memcpy(params + 9, &beta_fast, sizeof(float));
|
5056
|
+
memcpy(params + 10, &beta_slow, sizeof(float));
|
5057
|
+
memcpy(params + 11, &xpos_base, sizeof(float));
|
5058
|
+
memcpy(params + 12, &xpos_down, sizeof(bool));
|
4996
5059
|
ggml_set_op_params(result, params, sizeof(params));
|
4997
5060
|
|
4998
5061
|
result->op = GGML_OP_ROPE_BACK;
|
@@ -5067,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
     return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
 }
 
-// im2col: [N, IC, IL] => [N, OL, IC*K]
-// a: [OC,IC, K]
-// b: [N, IC, IL]
-// result: [N, OL, IC*K]
-static struct ggml_tensor * ggml_conv_1d_stage_0(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s0,
-        int                   p0,
-        int                   d0) {
-    GGML_ASSERT(a->ne[1] == b->ne[1]);
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
-
-    const int64_t ne[4] = {
-        a->ne[1] * a->ne[0],
-        OL,
-        b->ne[2],
-        1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
-
-    int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_CONV_1D_STAGE_0;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_conv_1d_stage_1
-
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// a: [OC, IC, K]
-// b: [N, OL, IC * K]
-// result: [N, OC, OL]
-static struct ggml_tensor * ggml_conv_1d_stage_1(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        b->ne[1],
-        a->ne[2],
-        b->ne[2],
-        1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op = GGML_OP_CONV_1D_STAGE_1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_conv_1d
-
 GGML_API struct ggml_tensor * ggml_conv_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -5150,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         int                   s0,
         int                   p0,
         int                   d0) {
-    struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
-    result = ggml_conv_1d_stage_1(ctx, a, result);
-    return result;
-}
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
 
-
-
-//
-//
-//    int s0,
-//    int p0,
-//    int d0) {
-//    GGML_ASSERT(ggml_is_matrix(b));
-//    GGML_ASSERT(a->ne[1] == b->ne[1]);
-//    bool is_node = false;
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
+                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC,IC, K] => [OC, IC * K]
 
-
-//        GGML_ASSERT(false); // TODO: implement backward
-//        is_node = true;
-//    }
+    result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
 
-
-
-//        a->ne[2], 1, 1,
-//    };
-//    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
-
-//    int32_t params[] = { s0, p0, d0 };
-//    ggml_set_op_params(result, params, sizeof(params));
-
-//    result->op = GGML_OP_CONV_1D;
-//    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-//    result->src[0] = a;
-//    result->src[1] = b;
-
-//    return result;
-// }
+    return result;
+}
 
 // ggml_conv_1d_ph
 
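The hunk above replaces the old two-stage CONV_1D path with ggml_im2col followed by ggml_mul_mat and a reshape. As a rough shape check, here is a minimal usage sketch; the tensor sizes, the 16 MB context and the main() harness are illustrative assumptions, not code from the gem.

/*
 * Hypothetical shape walk-through for the new ggml_conv_1d lowering.
 * All sizes are made-up illustration values.
 */
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // kernel a: ne = [K = 3, IC = 4, OC = 8], input b: ne = [IL = 32, IC = 4, N = 1]
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F16,  3, 4, 8);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 32, 4, 1);

    // s0 = 1, p0 = 1, d0 = 1  ->  OL = (32 + 2*1 - 1*(3 - 1) - 1)/1 + 1 = 32
    // internally: im2col is [IC*K = 12, OL = 32, N = 1], then mul_mat + reshape
    struct ggml_tensor * c = ggml_conv_1d(ctx, a, b, 1, 1, 1);

    // prints [32, 8, 1], i.e. [OL, OC, N]
    printf("conv_1d result ne = [%d, %d, %d]\n",
           (int) c->ne[0], (int) c->ne[1], (int) c->ne[2]);

    ggml_free(ctx);
    return 0;
}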
@@ -5249,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
 // a: [OC,IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OH, OW, IC*KH*KW]
-static struct ggml_tensor * ggml_conv_2d_stage_0(
+struct ggml_tensor * ggml_im2col(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -5258,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
         int                   p0,
         int                   p1,
         int                   d0,
-        int                   d1) {
+        int                   d1,
+        bool                  is_2D) {
 
-    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    if(is_2D) {
+        GGML_ASSERT(a->ne[2] == b->ne[2]);
+    } else {
+        GGML_ASSERT(a->ne[1] == b->ne[1]);
+    }
     bool is_node = false;
 
     if (a->grad || b->grad) {
|
         is_node = true;
     }
 
-    const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
-    const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
+    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
 
     const int64_t ne[4] = {
-        a->ne[2] * a->ne[1] * a->ne[0],
+        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
         OW,
-        OH,
-        b->ne[3],
+        is_2D ? OH : b->ne[2],
+        is_2D ? b->ne[3] : 1,
     };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
 
-    int32_t params[] = { s0, s1, p0, p1, d0, d1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_CONV_2D_STAGE_0;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-
-}
-
-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// a: [OC, IC, KH, KW]
-// b: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static struct ggml_tensor * ggml_conv_2d_stage_1(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        b->ne[1],
-        b->ne[2],
-        a->ne[3],
-        b->ne[3],
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op = GGML_OP_CONV_2D_STAGE_1;
+    result->op = GGML_OP_IM2COL;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
 
     return result;
-
 }
 
 // a: [OC,IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OC, OH, OW]
 struct ggml_tensor * ggml_conv_2d(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b,
-    int                   s0,
-    int                   s1,
-    int                   p0,
-    int                   p1,
-    int                   d0,
-    int                   d1) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
 
-    struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1);
-    result = ggml_conv_2d_stage_1(ctx, a, result);
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
+                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3]));                       // [OC,IC, KH, KW] => [OC, IC * KH * KW]
 
-    return result;
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
 
+    return result;
 }
 
 // ggml_conv_2d_sk_p0
@@ -5402,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
 
 // ggml_pool_*
 
-static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
+static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
     return (ins + 2 * p - ks) / s + 1;
 }
 
@@ -5449,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
         int                   k1,
         int                   s0,
         int                   s1,
-        int                   p0,
-        int                   p1) {
+        float                 p0,
+        float                 p1) {
 
     bool is_node = false;
 
@@ -8912,6 +8848,48 @@ static void ggml_compute_forward_silu(
     }
 }
 
+// ggml_compute_forward_leaky
+
+static void ggml_compute_forward_leaky_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_leaky_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_leaky(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_leaky_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_silu_back
 
 static void ggml_compute_forward_silu_back_f32(
@@ -9395,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
+        src0->type == GGML_TYPE_F32 &&
+        src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9433,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -10974,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
@@ -11033,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
     const int32_t * pos = (const int32_t *) src1->data;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
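The new comment above relies on a standard identity: the 2-D rotation RoPE applies is orthogonal, so its inverse is its transpose, which is the rotation by the negated angle. Sketched in LaTeX (not part of the diff):

R(\theta) = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix},
\qquad
R(\theta)^{-1} = R(\theta)^{\top} = R(-\theta).

Since \cos(-\theta) = \cos\theta and \sin(-\theta) = -\sin\theta, the backward pass only has to flip the sign of sin_theta, which is exactly what sin_sign = -1.0f does below.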
@@ -11049,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
             float block_theta = MAX(p - (n_ctx - 2), 0);
             for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                 const float cos_theta = cosf(theta_base);
-                const float sin_theta = sinf(theta_base);
+                const float sin_theta = sinf(theta_base) * sin_sign;
                 const float cos_block_theta = cosf(block_theta);
-                const float sin_block_theta = sinf(block_theta);
+                const float sin_block_theta = sinf(block_theta) * sin_sign;
 
                 theta_base *= theta_scale;
                 block_theta *= theta_scale;
@@ -11075,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
                 rope_yarn(
                     theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                 );
+                sin_theta *= sin_sign;
 
                 // zeta scaling for xPos only:
                 float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11105,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
                         theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                         &cos_theta, &sin_theta
                     );
+                    sin_theta *= sin_sign;
 
                     theta_base *= theta_scale;
 
@@ -11130,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
@@ -11182,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
     const int32_t * pos = (const int32_t *) src1->data;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11198,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
             float block_theta = MAX(p - (n_ctx - 2), 0);
             for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                 const float cos_theta = cosf(theta_base);
-                const float sin_theta = sinf(theta_base);
+                const float sin_theta = sinf(theta_base) * sin_sign;
                 const float cos_block_theta = cosf(block_theta);
-                const float sin_block_theta = sinf(block_theta);
+                const float sin_block_theta = sinf(block_theta) * sin_sign;
 
                 theta_base *= theta_scale;
                 block_theta *= theta_scale;
@@ -11224,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
                 rope_yarn(
                     theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                 );
+                sin_theta *= sin_sign;
 
                 theta_base *= theta_scale;
 
@@ -11250,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
                         theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                         &cos_theta, &sin_theta
                     );
+                    sin_theta *= sin_sign;
 
                     theta_base *= theta_scale;
 
@@ -11279,11 +11275,11 @@ static void ggml_compute_forward_rope(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
             } break;
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
             } break;
         default:
             {
@@ -11294,693 +11290,73 @@ static void ggml_compute_forward_rope(
 
 // ggml_compute_forward_rope_back
 
-static void ggml_compute_forward_rope_back_f32(
+static void ggml_compute_forward_rope_back(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
     }
+}
 
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    float freq_base;
-    float freq_scale;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool  xpos_down;
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
-    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
+// ggml_compute_forward_conv_transpose_1d
 
-    GGML_TENSOR_UNARY_OP_LOCALS
+static void ggml_compute_forward_conv_transpose_1d_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
 
-    assert(nb0 == sizeof(float));
+    GGML_TENSOR_BINARY_OP_LOCALS
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int nr = ggml_nrows(dst);
+    const int nk = ne00*ne01*ne02;
 
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
 
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
+    if (params->type == GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
 
-    // row index used to determine which thread to use
-    int ir = 0;
+        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
+                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        dst_data[i00*ne02 + i02] = src[i00];
+                    }
+                }
+            }
+        }
 
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir > ir1) break;
-
-                float theta_base = freq_scale * (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        theta_base *= theta_scale;
-
-                        const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                        float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                        const float dy0 = dy[0];
-                        const float dy1 = dy[1];
-
-                        dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
-                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                            const float dy0 = dy[0];
-                            const float dy1 = dy[n_dims/2];
-
-                            dx[0] = dy0*cos_theta + dy1*sin_theta;
-                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        theta_base *= theta_scale;
-
-                        const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                        ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
-
-                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
-
-                            dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_1d
-
-static void ggml_compute_forward_conv_1d_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-
-    // size of the convolution row - the kernel size unrolled across all input channels
-    const int ew0 = nk*ne01;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-        for (int64_t i11 = 0; i11 < ne11; i11++) {
-            const float * const src = (float *)((char *) src1->data + i11*nb11);
-            ggml_fp16_t * dst_data = wdata;
-
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                for (int64_t ik = 0; ik < nk; ik++) {
-                    const int idx0 = i0*s0 + ik*d0 - p0;
-
-                    if(!(idx0 < 0 || idx0 >= ne10)) {
-                        dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne2;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-    for (int i2 = 0; i2 < ne2; i2++) {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
-
-            for (int i0 = 0; i0 < ne0; i0++) {
-                ggml_vec_dot_f16(ew0, dst_data + i0,
-                    (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
-                    (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_1d_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-
-    const int ew0 = nk*ne01;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        float * const wdata = (float *) params->wdata + 0;
-
-        for (int64_t i11 = 0; i11 < ne11; i11++) {
-            const float * const src = (float *)((char *) src1->data + i11*nb11);
-            float * dst_data = wdata;
-
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                for (int64_t ik = 0; ik < nk; ik++) {
-                    const int idx0 = i0*s0 + ik*d0 - p0;
-
-                    if(!(idx0 < 0 || idx0 >= ne10)) {
-                        dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne02;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * const wdata = (float *) params->wdata + 0;
-
-    for (int i2 = 0; i2 < ne2; i2++) {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
-
-            for (int i0 = 0; i0 < ne0; i0++) {
-                ggml_vec_dot_f32(ew0, dst_data + i0,
-                    (float *) ((char *) src0->data + i1*nb02),
-                    (float *) wdata + i2*nb2 + i0*ew0);
-            }
-        }
-    }
-}
-
-// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
-static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
-        ggml_fp16_t * A,
-        ggml_fp16_t * B,
-        float * C,
-        const int ith, const int nth) {
-    // does not seem to make a difference
-    int64_t m0, m1, n0, n1;
-    // patches per thread
-    if (m > n) {
-        n0 = 0;
-        n1 = n;
-
-        // total patches in dst
-        const int np = m;
-
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-
-        // patch range for this thread
-        m0 = dp*ith;
-        m1 = MIN(m0 + dp, np);
-    } else {
-        m0 = 0;
-        m1 = m;
-
-        // total patches in dst
-        const int np = n;
-
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-
-        // patch range for this thread
-        n0 = dp*ith;
-        n1 = MIN(n0 + dp, np);
-    }
-
-    // block-tiling attempt
-    int64_t blck_n = 16;
-    int64_t blck_m = 16;
-
-    // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
-    // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
-    // if (blck_size > 0) {
-    //     blck_0 = 4;
-    //     blck_1 = blck_size / blck_0;
-    //     if (blck_1 < 0) {
-    //         blck_1 = 1;
-    //     }
-    //     // blck_0 = (int64_t)sqrt(blck_size);
-    //     // blck_1 = blck_0;
-    // }
-    // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
-
-    for (int j = n0; j < n1; j+=blck_n) {
-        for (int i = m0; i < m1; i+=blck_m) {
-            // printf("i j k => %d %d %d\n", i, j, K);
-            for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
-                for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
-                    ggml_vec_dot_f16(k,
-                        C + ii*n + jj,
-                        A + ii * k,
-                        B + jj * k);
-                }
-            }
-        }
-    }
-}
-
-// src0: kernel [OC, IC, K]
-// src1: signal [N, IC, IL]
-// dst: result [N, OL, IC*K]
-static void ggml_compute_forward_conv_1d_stage_0_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int64_t N = ne12;
-    const int64_t IC = ne11;
-    const int64_t IL = ne10;
-
-    const int64_t K = ne00;
-
-    const int64_t OL = ne1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // im2col: [N, IC, IL] => [N, OL, IC*K]
-    {
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t iol = 0; iol < OL; iol++) {
-                for (int64_t iic = ith; iic < IC; iic+=nth) {
-
-                    // micro kernel
-                    ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
-                    const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
-
-                    for (int64_t ik = 0; ik < K; ik++) {
-                        const int64_t iil = iol*s0 + ik*d0 - p0;
-
-                        if (!(iil < 0 || iil >= IL)) {
-                            dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// src0: [OC, IC, K]
-// src1: [N, OL, IC * K]
-// result: [N, OC, OL]
-static void ggml_compute_forward_conv_1d_stage_1_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb0 == sizeof(float));
-
-    const int N = ne12;
-    const int OL = ne11;
-
-    const int OC = ne02;
-    const int IC = ne01;
-    const int K = ne00;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    int64_t m = OC;
-    int64_t n = OL;
-    int64_t k = IC * K;
-
-    // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-    for (int i = 0; i < N; i++) {
-        ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
-        ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m, n]
-
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-
-static void ggml_compute_forward_conv_1d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch(src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_conv_1d_stage_0(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch(src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_conv_1d_stage_1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch(src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_transpose_1d
-
-static void ggml_compute_forward_conv_transpose_1d_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (L x Cin) to (Cin x L)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            ggml_fp16_t * dst_data = wdata;
+        // permute source data (src1) from (L x Cin) to (Cin x L)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            ggml_fp16_t * dst_data = wdata;
 
             for (int64_t i11 = 0; i11 < ne11; i11++) {
                 const float * const src = (float *)((char *) src1->data + i11*nb11);
@@ -12146,12 +11522,10 @@ static void ggml_compute_forward_conv_transpose_1d(
     }
 }
 
-// ggml_compute_forward_conv_2d
-
 // src0: kernel [OC, IC, KH, KW]
 // src1: image [N, IC, IH, IW]
 // dst: result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_conv_2d_stage_0_f32(
+static void ggml_compute_forward_im2col_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12165,34 +11539,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
 
     GGML_TENSOR_BINARY_OP_LOCALS;
 
-    const int64_t N  = ne13;
-    const int64_t IC = ne12;
-    const int64_t IH = ne11;
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
     const int64_t IW = ne10;
 
-    // const int64_t OC = ne03;
-    // const int64_t IC = ne02;
-    const int64_t KH = ne01;
+    const int64_t KH = is_2D ? ne01 : 1;
     const int64_t KW = ne00;
 
-    const int64_t OH = ne2;
+    const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
 
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
         return;
     }
 
@@ -12205,20 +11580,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
     ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
 
     for (int64_t in = 0; in < N; in++) {
-        for (int64_t ioh = 0; ioh < OH; ioh++) {
+        for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
             for (int64_t iow = 0; iow < OW; iow++) {
-                for (int64_t iic = ith; iic < IC; iic+=nth) {
+                for (int64_t iic = ith; iic < IC; iic += nth) {
 
                     // micro kernel
                     ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                    const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
+                    const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
 
-                    for (int64_t ikh = 0; ikh < KH; ikh++) {
+                    for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
                         for (int64_t ikw = 0; ikw < KW; ikw++) {
                             const int64_t iiw = iow*s0 + ikw*d0 - p0;
                             const int64_t iih = ioh*s1 + ikh*d1 - p1;
 
-                            if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
+                            if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                            } else {
                                 dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
                             }
                         }
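For orientation, here is a minimal standalone sketch (not part of ggml.c; illustrative names, single channel, plain float instead of ggml_fp16_t) of the gather that ggml_compute_forward_im2col_f16 performs above: each (oh, ow) output row collects the KH*KW taps of the input window, and taps that fall outside the image are written as zero.

// Sketch only: im2col for one channel of one image, zero-padded.
static void im2col_2d_one_channel(
        const float * src, float * dst,
        int IW, int IH, int OW, int OH,
        int KW, int KH, int s0, int s1, int p0, int p1, int d0, int d1) {
    for (int oh = 0; oh < OH; oh++) {
        for (int ow = 0; ow < OW; ow++) {
            for (int kh = 0; kh < KH; kh++) {
                for (int kw = 0; kw < KW; kw++) {
                    const int iw = ow*s0 + kw*d0 - p0;   // same index math as the kernel above
                    const int ih = oh*s1 + kh*d1 - p1;
                    float v = 0.0f;                      // out-of-range taps read as zero padding
                    if (iw >= 0 && iw < IW && ih >= 0 && ih < IH) {
                        v = src[ih*IW + iw];
                    }
                    dst[((oh*OW + ow)*KH + kh)*KW + kw] = v;
                }
            }
        }
    }
}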
@@ -12230,223 +11607,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
|
|
12230
11607
|
}
|
12231
11608
|
}
|
12232
11609
|
|
12233
|
-
|
12234
|
-
// src0: [OC, IC, KH, KW]
|
12235
|
-
// src1: [N, OH, OW, IC * KH * KW]
|
12236
|
-
// result: [N, OC, OH, OW]
|
12237
|
-
static void ggml_compute_forward_conv_2d_stage_1_f16(
|
12238
|
-
const struct ggml_compute_params * params,
|
12239
|
-
const struct ggml_tensor * src0,
|
12240
|
-
const struct ggml_tensor * src1,
|
12241
|
-
struct ggml_tensor * dst) {
|
12242
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
12243
|
-
GGML_ASSERT(src1->type == GGML_TYPE_F16);
|
12244
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
12245
|
-
|
12246
|
-
int64_t t0 = ggml_perf_time_us();
|
12247
|
-
UNUSED(t0);
|
12248
|
-
|
12249
|
-
if (params->type == GGML_TASK_INIT) {
|
12250
|
-
return;
|
12251
|
-
}
|
12252
|
-
|
12253
|
-
if (params->type == GGML_TASK_FINALIZE) {
|
12254
|
-
return;
|
12255
|
-
}
|
12256
|
-
|
12257
|
-
GGML_TENSOR_BINARY_OP_LOCALS;
|
12258
|
-
|
12259
|
-
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
12260
|
-
GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
|
12261
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
12262
|
-
|
12263
|
-
const int N = ne13;
|
12264
|
-
const int OH = ne12;
|
12265
|
-
const int OW = ne11;
|
12266
|
-
|
12267
|
-
const int OC = ne03;
|
12268
|
-
const int IC = ne02;
|
12269
|
-
const int KH = ne01;
|
12270
|
-
const int KW = ne00;
|
12271
|
-
|
12272
|
-
const int ith = params->ith;
|
12273
|
-
const int nth = params->nth;
|
12274
|
-
|
12275
|
-
int64_t m = OC;
|
12276
|
-
int64_t n = OH * OW;
|
12277
|
-
int64_t k = IC * KH * KW;
|
12278
|
-
|
12279
|
-
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
12280
|
-
for (int i = 0; i < N; i++) {
|
12281
|
-
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
|
12282
|
-
ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
|
12283
|
-
float * C = (float *)dst->data + i * m * n; // [m, n]
|
12284
|
-
|
12285
|
-
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
|
12286
|
-
}
|
12287
|
-
}
|
12288
|
-
|
12289
|
-
static void ggml_compute_forward_conv_2d_f16_f32(
|
12290
|
-
const struct ggml_compute_params * params,
|
12291
|
-
const struct ggml_tensor * src0,
|
12292
|
-
const struct ggml_tensor * src1,
|
12293
|
-
struct ggml_tensor * dst) {
|
12294
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
12295
|
-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
12296
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
12297
|
-
|
12298
|
-
int64_t t0 = ggml_perf_time_us();
|
12299
|
-
UNUSED(t0);
|
12300
|
-
|
12301
|
-
GGML_TENSOR_BINARY_OP_LOCALS
|
12302
|
-
|
12303
|
-
// src1: image [N, IC, IH, IW]
|
12304
|
-
// src0: kernel [OC, IC, KH, KW]
|
12305
|
-
// dst: result [N, OC, OH, OW]
|
12306
|
-
// ne12: IC
|
12307
|
-
// ne0: OW
|
12308
|
-
// ne1: OH
|
12309
|
-
// nk0: KW
|
12310
|
-
// nk1: KH
|
12311
|
-
// ne13: N
|
12312
|
-
|
12313
|
-
const int N = ne13;
|
12314
|
-
const int IC = ne12;
|
12315
|
-
const int IH = ne11;
|
12316
|
-
const int IW = ne10;
|
12317
|
-
|
12318
|
-
const int OC = ne03;
|
12319
|
-
// const int IC = ne02;
|
12320
|
-
const int KH = ne01;
|
12321
|
-
const int KW = ne00;
|
12322
|
-
|
12323
|
-
const int OH = ne1;
|
12324
|
-
const int OW = ne0;
|
12325
|
-
|
12326
|
-
const int ith = params->ith;
|
12327
|
-
const int nth = params->nth;
|
12328
|
-
|
12329
|
-
// const int nk0 = ne00;
|
12330
|
-
// const int nk1 = ne01;
|
12331
|
-
|
12332
|
-
// size of the convolution row - the kernel size unrolled across all channels
|
12333
|
-
// const int ew0 = nk0*nk1*ne02;
|
12334
|
-
// ew0: IC*KH*KW
|
12335
|
-
|
12336
|
-
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
12337
|
-
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
12338
|
-
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
|
12339
|
-
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
|
12340
|
-
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
|
12341
|
-
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
|
12342
|
-
|
12343
|
-
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
12344
|
-
GGML_ASSERT(nb10 == sizeof(float));
|
12345
|
-
|
12346
|
-
if (params->type == GGML_TASK_INIT) {
|
12347
|
-
memset(params->wdata, 0, params->wsize);
|
12348
|
-
|
12349
|
-
// prepare source data (src1)
|
12350
|
-
// im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
|
12351
|
-
|
12352
|
-
{
|
12353
|
-
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
12354
|
-
|
12355
|
-
for (int in = 0; in < N; in++) {
|
12356
|
-
for (int iic = 0; iic < IC; iic++) {
|
12357
|
-
for (int ioh = 0; ioh < OH; ioh++) {
|
12358
|
-
for (int iow = 0; iow < OW; iow++) {
|
12359
|
-
|
12360
|
-
// micro kernel
|
12361
|
-
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
12362
|
-
const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
|
12363
|
-
|
12364
|
-
for (int ikh = 0; ikh < KH; ikh++) {
|
12365
|
-
for (int ikw = 0; ikw < KW; ikw++) {
|
12366
|
-
const int iiw = iow*s0 + ikw*d0 - p0;
|
12367
|
-
const int iih = ioh*s1 + ikh*d1 - p1;
|
12368
|
-
|
12369
|
-
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
|
12370
|
-
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
|
12371
|
-
}
|
12372
|
-
}
|
12373
|
-
}
|
12374
|
-
}
|
12375
|
-
}
|
12376
|
-
}
|
12377
|
-
}
|
12378
|
-
}
|
12379
|
-
|
12380
|
-
return;
|
12381
|
-
}
|
12382
|
-
|
12383
|
-
if (params->type == GGML_TASK_FINALIZE) {
|
12384
|
-
return;
|
12385
|
-
}
|
12386
|
-
|
12387
|
-
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
12388
|
-
// wdata: [N*OH*OW, IC*KH*KW]
|
12389
|
-
// dst: result [N, OC, OH, OW]
|
12390
|
-
// src0: kernel [OC, IC, KH, KW]
|
12391
|
-
|
12392
|
-
int64_t m = OC;
|
12393
|
-
int64_t n = OH * OW;
|
12394
|
-
int64_t k = IC * KH * KW;
|
12395
|
-
|
12396
|
-
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
12397
|
-
for (int i = 0; i < N; i++) {
|
12398
|
-
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
|
12399
|
-
ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
|
12400
|
-
float * C = (float *)dst->data + i * m * n; // [m * k]
|
12401
|
-
|
12402
|
-
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
|
12403
|
-
}
|
12404
|
-
}
|
12405
|
-
|
12406
|
-
static void ggml_compute_forward_conv_2d(
|
12407
|
-
const struct ggml_compute_params * params,
|
12408
|
-
const struct ggml_tensor * src0,
|
12409
|
-
const struct ggml_tensor * src1,
|
12410
|
-
struct ggml_tensor * dst) {
|
12411
|
-
switch (src0->type) {
|
12412
|
-
case GGML_TYPE_F16:
|
12413
|
-
{
|
12414
|
-
ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
|
12415
|
-
} break;
|
12416
|
-
case GGML_TYPE_F32:
|
12417
|
-
{
|
12418
|
-
//ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
|
12419
|
-
GGML_ASSERT(false);
|
12420
|
-
} break;
|
12421
|
-
default:
|
12422
|
-
{
|
12423
|
-
GGML_ASSERT(false);
|
12424
|
-
} break;
|
12425
|
-
}
|
12426
|
-
}
|
12427
|
-
|
12428
|
-
static void ggml_compute_forward_conv_2d_stage_0(
|
12429
|
-
const struct ggml_compute_params * params,
|
12430
|
-
const struct ggml_tensor * src0,
|
12431
|
-
const struct ggml_tensor * src1,
|
12432
|
-
struct ggml_tensor * dst) {
|
12433
|
-
switch (src0->type) {
|
12434
|
-
case GGML_TYPE_F16:
|
12435
|
-
{
|
12436
|
-
ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
|
12437
|
-
} break;
|
12438
|
-
case GGML_TYPE_F32:
|
12439
|
-
{
|
12440
|
-
GGML_ASSERT(false);
|
12441
|
-
} break;
|
12442
|
-
default:
|
12443
|
-
{
|
12444
|
-
GGML_ASSERT(false);
|
12445
|
-
} break;
|
12446
|
-
}
|
12447
|
-
}
|
12448
|
-
|
12449
|
-
static void ggml_compute_forward_conv_2d_stage_1(
|
11610
|
+
static void ggml_compute_forward_im2col(
|
12450
11611
|
const struct ggml_compute_params * params,
|
12451
11612
|
const struct ggml_tensor * src0,
|
12452
11613
|
const struct ggml_tensor * src1,
|
@@ -12454,7 +11615,7 @@ static void ggml_compute_forward_conv_2d_stage_1(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
+                ggml_compute_forward_im2col_f16(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
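With the two-stage conv_2d kernels removed, a 2D convolution is assembled at graph-build time from the new op plus a matrix multiplication. The sketch below assumes the ggml_im2col(), ggml_mul_mat() and ggml_reshape_2d() graph API of this version and is illustrative only; the exact reshapes used by ggml_conv_2d() in ggml.c may differ.

// Sketch (assumed API): convolution as im2col followed by a GEMM.
// kernel: [OC, IC, KH, KW], image: [N, IC, IH, IW]
static struct ggml_tensor * conv2d_via_im2col(
        struct ggml_context * ctx,
        struct ggml_tensor * kernel, struct ggml_tensor * image,
        int s0, int s1, int p0, int p1, int d0, int d1) {
    // cols: [N, OH, OW, IC*KH*KW]
    struct ggml_tensor * cols = ggml_im2col(ctx, kernel, image, s0, s1, p0, p1, d0, d1, true);
    // flatten both operands so the contraction axis (IC*KH*KW) lines up
    struct ggml_tensor * a = ggml_reshape_2d(ctx, cols,
            cols->ne[0], cols->ne[1]*cols->ne[2]*cols->ne[3]);           // [IC*KH*KW, N*OH*OW]
    struct ggml_tensor * b = ggml_reshape_2d(ctx, kernel,
            kernel->ne[0]*kernel->ne[1]*kernel->ne[2], kernel->ne[3]);   // [IC*KH*KW, OC]
    // result: [OC, N*OH*OW]; reshape/permute to [N, OC, OH, OW] as needed
    return ggml_mul_mat(ctx, b, a);
}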
@@ -12639,14 +11800,11 @@ static void ggml_compute_forward_pool_1d(
     ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
 }
 
-// ggml_compute_forward_pool_2d_sk_p0
+// ggml_compute_forward_pool_2d
 
-static void ggml_compute_forward_pool_2d_sk_p0(
+static void ggml_compute_forward_pool_2d(
         const struct ggml_compute_params * params,
-        const enum ggml_op_pool op,
         const struct ggml_tensor * src,
-        const int k0,
-        const int k1,
         struct ggml_tensor * dst) {
     assert(src->type == GGML_TYPE_F32);
     assert(params->ith == 0);
@@ -12655,6 +11813,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
         return;
     }
 
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
     const char * cdata = (const char*)src->data;
     const char * const data_end = cdata + ggml_nbytes(src);
 
@@ -12665,6 +11831,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
     float * dplane = (float *)dst->data;
 
     const int ka = k0 * k1;
+    const int offset0 = -p0;
+    const int offset1 = -p1;
 
     while (cdata < data_end) {
         for (int oy = 0; oy < py; ++oy) {
@@ -12677,13 +11845,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
                     case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
                 }
 
-                const int ix = ox * k0;
-                const int iy = oy * k1;
+                const int ix = offset0 + ox * s0;
+                const int iy = offset1 + oy * s1;
 
                 for (int ky = 0; ky < k1; ++ky) {
+                    if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
                     const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
                     for (int kx = 0; kx < k0; ++kx) {
                         int j = ix + kx;
+                        if (j < 0 || j >= src->ne[0]) continue;
                         switch (op) {
                             case GGML_OP_POOL_AVG: *out += srow[j]; break;
                             case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12700,31 +11870,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
         }
 
         cdata += src->nb[2];
-        dplane += pa;
-    }
-}
-
-// ggml_compute_forward_pool_2d
-
-static void ggml_compute_forward_pool_2d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-    GGML_ASSERT(p0 == 0);
-    GGML_ASSERT(p1 == 0); // padding not supported
-    GGML_ASSERT(k0 == s0);
-    GGML_ASSERT(k1 == s1); // only s = k supported
-
-    ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
+        dplane += pa;
+    }
 }
 
 // ggml_compute_forward_upscale
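A standalone sketch (illustrative only, one output cell, float) of the padded pooling introduced above: the window origin is shifted by -p0/-p1 and out-of-range taps are skipped, matching the two continue checks, while the average still divides by the full k0*k1 window.

// Sketch: one output cell of 2D average pooling with padding.
static float pool2d_avg_cell(const float * src, int W, int H,
                             int ox, int oy, int k0, int k1,
                             int s0, int s1, int p0, int p1) {
    const int ix = ox*s0 - p0;
    const int iy = oy*s1 - p1;
    float sum = 0.0f;
    for (int ky = 0; ky < k1; ++ky) {
        if (iy + ky < 0 || iy + ky >= H) continue;   // padded rows contribute 0
        for (int kx = 0; kx < k0; ++kx) {
            const int j = ix + kx;
            if (j < 0 || j >= W) continue;           // padded columns contribute 0
            sum += src[(iy + ky)*W + j];
        }
    }
    return sum / (k0*k1); // divides by the full window, like the kernel above
}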
@@ -13928,6 +13075,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_silu(params, src0, dst);
             } break;
+        case GGML_UNARY_OP_LEAKY:
+            {
+                ggml_compute_forward_leaky(params, src0, dst);
+            } break;
         default:
             {
                 GGML_ASSERT(false);
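For reference, the element-wise operation that the new GGML_UNARY_OP_LEAKY case dispatches to has this shape; the negative slope shown here is an assumed constant, the real value lives in ggml_compute_forward_leaky.

// Illustrative only - leaky ReLU with an assumed negative slope of 0.1f.
static inline float leaky_relu_ref(float x) {
    return x > 0.0f ? x : 0.1f*x;
}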
@@ -14681,33 +13832,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
            {
                ggml_compute_forward_clamp(params, tensor->src[0], tensor);
            } break;
-        case GGML_OP_CONV_1D:
-            {
-                ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_1D_STAGE_0:
-            {
-                ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_1D_STAGE_1:
-            {
-                ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
                ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
            } break;
-        case GGML_OP_CONV_2D:
-            {
-                ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_2D_STAGE_0:
+        case GGML_OP_IM2COL:
            {
-                ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_2D_STAGE_1:
-            {
-                ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
@@ -14836,62 +13967,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-
+static size_t ggml_hash_size(size_t min_sz) {
+    // next primes after powers of two
+    static const size_t primes[] = {
+        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
+        2053, 4099, 8209, 16411, 32771, 65537, 131101,
+        262147, 524309, 1048583, 2097169, 4194319, 8388617,
+        16777259, 33554467, 67108879, 134217757, 268435459,
+        536870923, 1073741827, 2147483659
+    };
+    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
+
+    // find the smallest prime that is larger or equal to min_sz
+    size_t l = 0;
+    size_t r = n_primes;
+    while (l < r) {
+        size_t m = (l + r)/2;
+        if (primes[m] < min_sz) {
+            l = m + 1;
+        } else {
+            r = m;
+        }
+    }
+    size_t sz = l < n_primes ? primes[l] : min_sz | 1;
+    return sz;
+}
 
-static size_t hash(void * p) {
-    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+static size_t ggml_hash(const void * p) {
+    return (size_t)p;
 }
 
-static size_t hash_find(void * hash_table[], void * p) {
-    size_t h = hash(p);
+size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set.size;
 
     // linear probing
     size_t i = h;
-    while (hash_table[i] != NULL && hash_table[i] != p) {
-        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+    while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
+        i = (i + 1) % hash_set.size;
         if (i == h) {
             // visited all hash table entries -> not found
-            return GGML_GRAPH_HASHTABLE_SIZE;
+            return GGML_HASHTABLE_FULL;
         }
     }
     return i;
 }
 
-static bool hash_insert(void * hash_table[], void * p) {
-    size_t i = hash_find(hash_table, p);
+bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
+    return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
+}
+
+size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
 
-    GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
+    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
 
-    if (hash_table[i] == p) {
-        return true;
+    if (hash_set.keys[i] == key) {
+        return GGML_HASHTABLE_ALREADY_EXISTS;
     }
 
     // insert
-    GGML_ASSERT(hash_table[i] == NULL);
-    hash_table[i] = p;
-    return false;
+    GGML_ASSERT(hash_set.keys[i] == NULL);
+    hash_set.keys[i] = key;
+    return i;
+}
+
+size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
+
+    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
+
+    hash_set.keys[i] = key;
+    return i;
+}
+
+static struct ggml_hash_set ggml_hash_set_new(size_t size) {
+    size = ggml_hash_size(size);
+    struct ggml_hash_set result;
+    result.size = size;
+    result.keys = malloc(sizeof(struct ggml_tensor *) * size);
+    memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
+    return result;
 }
 
-static bool hash_contains(void * hash_table[], void * p) {
-    size_t i = hash_find(hash_table, p);
-    return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
+static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
+    free(hash_set.keys);
 }
 
 struct hash_map {
-    void * keys[GGML_GRAPH_HASHTABLE_SIZE];
-    void * vals[GGML_GRAPH_HASHTABLE_SIZE];
+    struct ggml_hash_set set;
+    struct ggml_tensor ** vals;
 };
 
-static struct hash_map * new_hash_map(void) {
+static struct hash_map * ggml_new_hash_map(size_t size) {
     struct hash_map * result = malloc(sizeof(struct hash_map));
-    for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
-        result->keys[i] = NULL;
-        result->vals[i] = NULL;
-    }
+    result->set = ggml_hash_set_new(size);
+    result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
+    memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
     return result;
 }
 
-static void free_hash_map(struct hash_map * map) {
+static void ggml_hash_map_free(struct hash_map * map) {
+    ggml_hash_set_free(map->set);
+    free(map->vals);
     free(map);
 }
 
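A small usage sketch of the new open-addressing hash set, written as if it lived next to the helpers above (GGML_HASHTABLE_ALREADY_EXISTS and the ggml_hash_set_* functions come from this change; everything else is illustrative):

// Sketch: count distinct tensors in an array with the new hash set helpers.
static int ggml_count_unique_tensors(struct ggml_tensor ** nodes, int n_nodes) {
    struct ggml_hash_set visited = ggml_hash_set_new(2*n_nodes); // rounded up to a prime internally
    int unique = 0;
    for (int i = 0; i < n_nodes; ++i) {
        if (ggml_hash_insert(visited, nodes[i]) != GGML_HASHTABLE_ALREADY_EXISTS) {
            unique++;
        }
    }
    ggml_hash_set_free(visited);
    return unique;
}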
@@ -14911,7 +14089,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return node;
     }
 
-    if (!hash_contains(graph->visited_hash_table, node)) {
+    if (!ggml_hash_contains(graph->visited_hash_table, node)) {
         return node;
     }
 
@@ -14926,17 +14104,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return node;
     }
 
-    size_t i = hash_find(replacements->keys, node);
-    GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
-    if (replacements->keys[i] == node) {
-        return replacements->vals[i];
+    size_t i = ggml_hash_find(replacements->set, node);
+    GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
+    if (replacements->set.keys[i] == node) {
+        return replacements->vals[i];
     }
 
     struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
 
     // insert clone into replacements
-    GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
-    replacements->keys[i] = node;
+    GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
+    replacements->set.keys[i] = node;
     replacements->vals[i] = clone;
 
     clone->op = node->op;
@@ -14973,26 +14151,26 @@ void ggml_build_backward_gradient_checkpointing(
         struct ggml_cgraph  * gb_tmp,
         struct ggml_tensor  * * checkpoints,
         int n_checkpoints) {
-    *gb_tmp = *gf;
+    ggml_graph_cpy(gf, gb_tmp);
     ggml_build_backward_expand(ctx, gf, gb_tmp, true);
 
     if (n_checkpoints <= 0) {
-        *gb = *gb_tmp;
+        ggml_graph_cpy(gb_tmp, gb);
         return;
     }
 
-    struct hash_map * replacements = new_hash_map();
+    struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
 
     // insert checkpoints in replacements
     for (int i = 0; i < n_checkpoints; ++i) {
-        size_t k = hash_find(replacements->keys, checkpoints[i]);
-        GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
-        GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
-        replacements->keys[k] = checkpoints[i];
-        replacements->vals[k] = checkpoints[i];
+        size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
+        GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
+        GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
+        replacements->set.keys[k] = checkpoints[i];
+        replacements->vals[k] = checkpoints[i];
     }
 
-    *gb = *gf;
+    ggml_graph_cpy(gf, gb);
     // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
     // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
     // by recomputing them from checkpoints
@@ -15009,21 +14187,21 @@ void ggml_build_backward_gradient_checkpointing(
         ggml_build_forward_expand(gb, node);
     }
 
-    free_hash_map(replacements);
+    ggml_hash_map_free(replacements);
 }
 
 // functions to change gradients considering the case that input a might be initial gradient with zero value
 
-static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         return b;
     } else {
         return ggml_add_impl(ctx, a, b, false);
     }
 }
 
-static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
     } else {
@@ -15031,23 +14209,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
     }
 }
 
-static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         return ggml_repeat(ctx, b, a);
     } else {
         return ggml_add1_impl(ctx, a, b, false);
     }
 }
 
-static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         return ggml_neg(ctx, b);
     } else {
         return ggml_sub_impl(ctx, a, b, false);
     }
 }
 
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
+static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
     struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
 
@@ -15559,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
             // necessary for llama
             if (src0->grad) {
                 //const int n_past = ((int32_t *) tensor->op_params)[0];
-                const int n_dims = ((int32_t *) tensor->op_params)[1];
-                const int mode   = ((int32_t *) tensor->op_params)[2];
-                const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                float freq_base;
-                float freq_scale;
-                float xpos_base;
-                bool  xpos_down;
-                memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                const int mode       = ((int32_t *) tensor->op_params)[2];
+                const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                 src0->grad = ggml_add_or_set(ctx,
                         src0->grad,
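The backward pass above recovers the extended RoPE parameters from the packed op_params array. A sketch of that packing convention (slot numbers as in the hunk; memcpy is used instead of a cast so the float read stays well defined):

// Sketch: op_params is an array of 4-byte slots; floats are copied in and out.
static float ggml_get_op_params_f32_ref(const struct ggml_tensor * t, int slot) {
    float v;
    memcpy(&v, (const int32_t *) t->op_params + slot, sizeof(float));
    return v;
}
// e.g. freq_base sits in slot 5 and freq_scale in slot 6, as read above.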
@@ -15579,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
                         n_dims,
                         mode,
                         n_ctx,
+                        n_orig_ctx,
                         freq_base,
                         freq_scale,
+                        ext_factor,
+                        attn_factor,
+                        beta_fast,
+                        beta_slow,
                         xpos_base,
                         xpos_down),
                     zero_table);
@@ -15590,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
             {
                 if (src0->grad) {
                     //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
@@ -15609,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
                             src1,
                             n_dims,
                             mode,
-                            0,
                             n_ctx,
+                            n_orig_ctx,
                             freq_base,
                             freq_scale,
-
-
-
-
+                            ext_factor,
+                            attn_factor,
+                            beta_fast,
+                            beta_slow,
                             xpos_base,
                             xpos_down,
                             false),
@@ -15631,31 +14820,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_CONV_1D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_1D_STAGE_0:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_1D_STAGE_1:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_CONV_2D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_2D_STAGE_0:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_2D_STAGE_1:
+        case GGML_OP_IM2COL:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
@@ -15869,7 +15038,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
     }
 
     // check if already visited
-    if (hash_insert(cgraph->visited_hash_table, node)) {
+    if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
         return;
     }
 
@@ -15885,7 +15054,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
 
     if (node->op == GGML_OP_NONE && node->grad == NULL) {
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
-        GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
+        GGML_ASSERT(cgraph->n_leafs < cgraph->size);
 
         if (strlen(node->name) == 0) {
             ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15894,22 +15063,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
-        GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
+        GGML_ASSERT(cgraph->n_nodes < cgraph->size);
 
         if (strlen(node->name) == 0) {
             ggml_format_name(node, "node_%d", cgraph->n_nodes);
         }
 
         cgraph->nodes[cgraph->n_nodes] = node;
-        cgraph->grads[cgraph->n_nodes] = node->grad;
+        if (cgraph->grads) {
+            cgraph->grads[cgraph->n_nodes] = node->grad;
+        }
         cgraph->n_nodes++;
     }
 }
 
 static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
     if (!expand) {
-        cgraph->n_nodes = 0;
-        cgraph->n_leafs = 0;
+        // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
+        ggml_graph_clear(cgraph);
     }
 
     const int n0 = cgraph->n_nodes;
@@ -15930,25 +15101,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
     ggml_build_forward_impl(cgraph, tensor, true);
 }
 
-struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
-    struct ggml_cgraph result = {
-        /*.n_nodes      =*/ 0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ { NULL },
-        /*.grads        =*/ { NULL },
-        /*.leafs        =*/ { NULL },
-        /*.hash_table   =*/ { NULL },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-    };
-
-    ggml_build_forward_impl(&result, tensor, false);
-
-    return result;
-}
-
 void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
     GGML_ASSERT(gf->n_nodes > 0);
 
@@ -15965,11 +15117,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
     }
 
     // remember original gradients which start with zero values
-    void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE);
-    memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
+    struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
     for (int i = 0; i < gf->n_nodes; i++) {
         if (gf->grads[i]) {
-            hash_insert(zero_table, gf->grads[i]);
+            ggml_hash_insert(zero_table, gf->grads[i]);
         }
     }
 
@@ -15992,26 +15143,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
         }
     }
 
-    free(zero_table);
+    ggml_hash_set_free(zero_table);
 }
 
-
-
-size_t ggml_graph_overhead(void) {
-    return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
+static size_t ggml_graph_nbytes(size_t size, bool grads) {
+    size_t nbytes = sizeof(struct ggml_cgraph);
+    nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
+    if (grads) {
+        nbytes += size * sizeof(struct ggml_tensor *); // grads
+    }
+    nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
+    return nbytes;
 }
 
-struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
+size_t ggml_graph_overhead_custom(size_t size, bool grads) {
+    return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
+}
+
+size_t ggml_graph_overhead(void) {
+    return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
+}
+
+struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
+    const size_t obj_size = ggml_graph_nbytes(size, grads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
     struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
 
+    struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
+
+    size_t hash_size = ggml_hash_size(size * 2);
+    struct ggml_tensor ** nodes_ptr = data_start;
+    struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
+    struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
+    struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
+
+    // check that we allocated the correct amount of memory
+    assert(obj_size == (size_t) (
+        (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
+
+    memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
+
     *cgraph = (struct ggml_cgraph) {
+        /*.size         =*/ size,
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ { NULL },
-        /*.grads        =*/ { NULL },
-        /*.leafs        =*/ { NULL },
-        /*.hash_table   =*/ { NULL },
+        /*.nodes        =*/ nodes_ptr,
+        /*.grads        =*/ grads_ptr,
+        /*.leafs        =*/ leafs_ptr,
+        /*.hash_table   =*/ { hash_size, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
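A worked example of the new size computation, assuming 8-byte pointers: for ggml_graph_overhead_custom(2048, true) the payload is sizeof(struct ggml_cgraph) + 2048*8*2 bytes for nodes and leafs + 2048*8 bytes for grads + ggml_hash_size(4096)*8 = 4099*8 bytes for the visited-hash keys, roughly 80 KB before GGML_OBJECT_SIZE and alignment padding are added.

// Sketch: reserve context memory large enough for one custom-sized graph with grads.
size_t mem = ggml_tensor_overhead()*2048 + ggml_graph_overhead_custom(2048, /*grads =*/ true);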
@@ -16021,14 +15200,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return cgraph;
 }
 
-struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
-    struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
-    ggml_build_forward_impl(cgraph, tensor, false);
+struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
+    return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
+}
+
+struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
+    const size_t obj_size = sizeof(struct ggml_cgraph);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
+    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
+
+    *cgraph = (struct ggml_cgraph) {
+        /*.size         =*/ 0,
+        /*.n_nodes      =*/ i1 - i0,
+        /*.n_leafs      =*/ 0,
+        /*.nodes        =*/ cgraph0->nodes + i0,
+        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
+        /*.leafs        =*/ NULL,
+        /*.hash_table   =*/ { 0, NULL },
+        /*.order        =*/ cgraph0->order,
+        /*.perf_runs    =*/ 0,
+        /*.perf_cycles  =*/ 0,
+        /*.perf_time_us =*/ 0,
+    };
+
     return cgraph;
 }
 
-
-
+void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
+    GGML_ASSERT(dst->size >= src->n_leafs);
+    GGML_ASSERT(dst->size >= src->n_nodes);
+    GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
+
+    dst->n_leafs = src->n_leafs;
+    dst->n_nodes = src->n_nodes;
+    dst->order   = src->order;
+
+    for (int i = 0; i < src->n_leafs; ++i) {
+        dst->leafs[i] = src->leafs[i];
+    }
+
+    for (int i = 0; i < src->n_nodes; ++i) {
+        dst->nodes[i] = src->nodes[i];
+    }
+
+    if (src->grads) {
+        GGML_ASSERT(dst->grads != NULL);
+        for (int i = 0; i < src->n_nodes; ++i) {
+            dst->grads[i] = src->grads[i];
+        }
+    }
+
+    for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
+        if (src->visited_hash_table.keys[i]) {
+            ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
+        }
+    }
+}
+
+struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
+    ggml_graph_cpy(cgraph, result);
+    return result;
+}
+
+void ggml_graph_reset(struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(cgraph->grads != NULL);
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * grad = cgraph->grads[i];
+
+        if (grad) {
+            ggml_set_zero(grad);
+        }
+    }
+}
+
+void ggml_graph_clear(struct ggml_cgraph * cgraph) {
+    cgraph->n_leafs = 0;
+    cgraph->n_nodes = 0;
+    memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
 }
 
 //
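Usage sketch for the new sized-graph API added above (ctx and a final tensor named loss are assumed to exist; the context must have been sized with ggml_graph_overhead_custom):

// Sketch: build forward and backward graphs with explicit capacity and grad slots.
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 4096, /*grads =*/ true);
ggml_build_forward_expand(gf, loss);                 // fills nodes, leafs and the visited hash set
struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);   // same capacity, copies nodes and grads
ggml_build_backward_expand(ctx, gf, gb, /*keep =*/ true);
ggml_graph_reset(gb);                                // zero the gradients before accumulating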
@@ -16140,45 +15390,266 @@ static void clear_numa_thread_affinity(void) {
|
|
16140
15390
|
strerror(rv));
|
16141
15391
|
}
|
16142
15392
|
|
16143
|
-
CPU_FREE(cpus);
|
16144
|
-
}
|
16145
|
-
#else
|
16146
|
-
// TODO: Windows etc.
|
16147
|
-
// (the linux implementation may also work on BSD, someone should test)
|
16148
|
-
static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
|
16149
|
-
static void clear_numa_thread_affinity(void) {}
|
16150
|
-
#endif
|
16151
|
-
|
16152
|
-
struct ggml_compute_state_shared {
|
16153
|
-
const struct ggml_cgraph * cgraph;
|
16154
|
-
const struct ggml_cplan * cplan;
|
16155
|
-
|
16156
|
-
int64_t perf_node_start_cycles;
|
16157
|
-
int64_t perf_node_start_time_us;
|
16158
|
-
|
16159
|
-
const int n_threads;
|
16160
|
-
|
16161
|
-
// synchronization primitives
|
16162
|
-
atomic_int n_active; // num active threads
|
16163
|
-
atomic_int node_n; // active graph node
|
16164
|
-
|
16165
|
-
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
16166
|
-
void * abort_callback_data;
|
16167
|
-
};
|
16168
|
-
|
16169
|
-
struct ggml_compute_state {
|
16170
|
-
ggml_thread_t thrd;
|
16171
|
-
int ith;
|
16172
|
-
struct ggml_compute_state_shared * shared;
|
16173
|
-
};
|
16174
|
-
|
16175
|
-
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
16176
|
-
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
16177
|
-
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
|
15393
|
+
CPU_FREE(cpus);
|
15394
|
+
}
|
15395
|
+
#else
|
15396
|
+
// TODO: Windows etc.
|
15397
|
+
// (the linux implementation may also work on BSD, someone should test)
|
15398
|
+
static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
|
15399
|
+
static void clear_numa_thread_affinity(void) {}
|
15400
|
+
#endif
|
15401
|
+
|
15402
|
+
struct ggml_compute_state_shared {
|
15403
|
+
const struct ggml_cgraph * cgraph;
|
15404
|
+
const struct ggml_cplan * cplan;
|
15405
|
+
|
15406
|
+
int64_t perf_node_start_cycles;
|
15407
|
+
int64_t perf_node_start_time_us;
|
15408
|
+
|
15409
|
+
const int n_threads;
|
15410
|
+
|
15411
|
+
// synchronization primitives
|
15412
|
+
atomic_int n_active; // num active threads
|
15413
|
+
atomic_int node_n; // active graph node
|
15414
|
+
|
15415
|
+
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
15416
|
+
void * abort_callback_data;
|
15417
|
+
};
|
15418
|
+
|
15419
|
+
struct ggml_compute_state {
|
15420
|
+
ggml_thread_t thrd;
|
15421
|
+
int ith;
|
15422
|
+
struct ggml_compute_state_shared * shared;
|
15423
|
+
};
|
15424
|
+
|
15425
|
+
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
15426
|
+
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
15427
|
+
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
|
15428
|
+
|
15429
|
+
node->perf_runs++;
|
15430
|
+
node->perf_cycles += cycles_cur;
|
15431
|
+
node->perf_time_us += time_us_cur;
|
15432
|
+
}
|
15433
|
+
|
15434
|
+
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
15435
|
+
int n_tasks = 0;
|
15436
|
+
|
15437
|
+
switch (node->op) {
|
15438
|
+
case GGML_OP_CPY:
|
15439
|
+
case GGML_OP_DUP:
|
15440
|
+
case GGML_OP_ADD:
|
15441
|
+
case GGML_OP_ADD1:
|
15442
|
+
case GGML_OP_ACC:
|
15443
|
+
{
|
15444
|
+
n_tasks = n_threads;
|
15445
|
+
} break;
|
15446
|
+
case GGML_OP_SUB:
|
15447
|
+
case GGML_OP_DIV:
|
15448
|
+
case GGML_OP_SQR:
|
15449
|
+
case GGML_OP_SQRT:
|
15450
|
+
case GGML_OP_LOG:
|
15451
|
+
case GGML_OP_SUM:
|
15452
|
+
case GGML_OP_SUM_ROWS:
|
15453
|
+
case GGML_OP_MEAN:
|
15454
|
+
case GGML_OP_ARGMAX:
|
15455
|
+
case GGML_OP_REPEAT:
|
15456
|
+
case GGML_OP_REPEAT_BACK:
|
15457
|
+
{
|
15458
|
+
n_tasks = 1;
|
15459
|
+
} break;
|
15460
|
+
case GGML_OP_UNARY:
|
15461
|
+
switch (ggml_get_unary_op(node)) {
|
15462
|
+
case GGML_UNARY_OP_ABS:
|
15463
|
+
case GGML_UNARY_OP_SGN:
|
15464
|
+
case GGML_UNARY_OP_NEG:
|
15465
|
+
case GGML_UNARY_OP_STEP:
|
15466
|
+
case GGML_UNARY_OP_TANH:
|
15467
|
+
case GGML_UNARY_OP_ELU:
|
15468
|
+
case GGML_UNARY_OP_RELU:
|
15469
|
+
case GGML_UNARY_OP_LEAKY:
|
15470
|
+
{
|
15471
|
+
n_tasks = 1;
|
15472
|
+
} break;
|
15473
|
+
|
15474
|
+
case GGML_UNARY_OP_GELU:
|
15475
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
15476
|
+
case GGML_UNARY_OP_SILU:
|
15477
|
+
{
|
15478
|
+
n_tasks = n_threads;
|
15479
|
+
} break;
|
15480
|
+
}
|
15481
|
+
break;
|
15482
|
+
case GGML_OP_SILU_BACK:
|
15483
|
+
case GGML_OP_MUL:
|
15484
|
+
case GGML_OP_NORM:
|
15485
|
+
case GGML_OP_RMS_NORM:
|
15486
|
+
case GGML_OP_RMS_NORM_BACK:
|
15487
|
+
case GGML_OP_GROUP_NORM:
|
15488
|
+
case GGML_OP_CONCAT:
|
15489
|
+
{
|
15490
|
+
n_tasks = n_threads;
|
15491
|
+
} break;
|
15492
|
+
case GGML_OP_MUL_MAT:
|
15493
|
+
{
|
15494
|
+
n_tasks = n_threads;
|
15495
|
+
|
15496
|
+
// TODO: use different scheduling for different matrix sizes
|
15497
|
+
//const int nr0 = ggml_nrows(node->src[0]);
|
15498
|
+
//const int nr1 = ggml_nrows(node->src[1]);
|
15499
|
+
|
15500
|
+
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
15501
|
+
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
|
15502
|
+
|
15503
|
+
#if defined(GGML_USE_CUBLAS)
|
15504
|
+
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
|
15505
|
+
n_tasks = 1; // TODO: this actually is doing nothing
|
15506
|
+
// the threads are still spinning
|
15507
|
+
}
|
15508
|
+
#elif defined(GGML_USE_CLBLAST)
|
15509
|
+
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
15510
|
+
n_tasks = 1; // TODO: this actually is doing nothing
|
15511
|
+
// the threads are still spinning
|
15512
|
+
}
|
15513
|
+
#endif
|
15514
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
15515
|
+
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
|
15516
|
+
n_tasks = 1; // TODO: this actually is doing nothing
|
15517
|
+
// the threads are still spinning
|
15518
|
+
}
|
15519
|
+
#endif
|
15520
|
+
} break;
|
15521
|
+
case GGML_OP_OUT_PROD:
|
15522
|
+
{
|
15523
|
+
n_tasks = n_threads;
|
15524
|
+
} break;
|
15525
|
+
case GGML_OP_SCALE:
|
15526
|
+
case GGML_OP_SET:
|
15527
|
+
case GGML_OP_CONT:
|
15528
|
+
case GGML_OP_RESHAPE:
|
15529
|
+
case GGML_OP_VIEW:
|
15530
|
+
case GGML_OP_PERMUTE:
|
15531
|
+
case GGML_OP_TRANSPOSE:
|
15532
|
+
case GGML_OP_GET_ROWS:
|
15533
|
+
case GGML_OP_GET_ROWS_BACK:
|
15534
|
+
case GGML_OP_DIAG:
|
15535
|
+
{
|
15536
|
+
n_tasks = 1;
|
15537
|
+
} break;
|
15538
|
+
case GGML_OP_DIAG_MASK_ZERO:
|
15539
|
+
case GGML_OP_DIAG_MASK_INF:
|
15540
|
+
case GGML_OP_SOFT_MAX:
|
15541
|
+
case GGML_OP_SOFT_MAX_BACK:
|
15542
|
+
case GGML_OP_ROPE:
|
15543
|
+
case GGML_OP_ROPE_BACK:
|
15544
|
+
case GGML_OP_ADD_REL_POS:
|
15545
|
+
{
|
15546
|
+
n_tasks = n_threads;
|
15547
|
+
} break;
|
15548
|
+
case GGML_OP_ALIBI:
|
15549
|
+
{
|
15550
|
+
n_tasks = 1; //TODO
|
15551
|
+
} break;
|
15552
|
+
case GGML_OP_CLAMP:
|
15553
|
+
{
|
15554
|
+
n_tasks = 1; //TODO
|
15555
|
+
} break;
|
15556
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
15557
|
+
{
|
15558
|
+
n_tasks = n_threads;
|
15559
|
+
} break;
|
15560
|
+
case GGML_OP_IM2COL:
|
15561
|
+
{
|
15562
|
+
n_tasks = n_threads;
|
15563
|
+
} break;
|
15564
|
+
case GGML_OP_CONV_TRANSPOSE_2D:
|
15565
|
+
{
|
15566
|
+
n_tasks = n_threads;
|
15567
|
+
} break;
|
15568
|
+
case GGML_OP_POOL_1D:
|
15569
|
+
case GGML_OP_POOL_2D:
|
15570
|
+
{
|
15571
|
+
n_tasks = 1;
|
15572
|
+
} break;
|
15573
|
+
case GGML_OP_UPSCALE:
|
15574
|
+
{
|
15575
|
+
n_tasks = n_threads;
|
15576
|
+
} break;
|
15577
|
+
case GGML_OP_FLASH_ATTN:
|
15578
|
+
{
|
15579
|
+
n_tasks = n_threads;
|
15580
|
+
} break;
|
15581
|
+
case GGML_OP_FLASH_FF:
|
15582
|
+
{
|
15583
|
+
n_tasks = n_threads;
|
15584
|
+
} break;
|
15585
|
+
case GGML_OP_FLASH_ATTN_BACK:
|
15586
|
+
{
|
15587
|
+
n_tasks = n_threads;
|
15588
|
+
} break;
|
15589
|
+
case GGML_OP_WIN_PART:
|
15590
|
+
case GGML_OP_WIN_UNPART:
|
15591
|
+
case GGML_OP_GET_REL_POS:
|
15592
|
+
case GGML_OP_MAP_UNARY:
|
15593
|
+
case GGML_OP_MAP_BINARY:
|
15594
|
+
case GGML_OP_MAP_CUSTOM1_F32:
|
15595
|
+
case GGML_OP_MAP_CUSTOM2_F32:
|
15596
|
+
case GGML_OP_MAP_CUSTOM3_F32:
|
15597
|
+
{
|
15598
|
+
n_tasks = 1;
|
15599
|
+
} break;
|
15600
|
+
case GGML_OP_MAP_CUSTOM1:
|
15601
|
+
{
|
15602
|
+
struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
|
15603
|
+
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
15604
|
+
n_tasks = n_threads;
|
15605
|
+
} else {
|
15606
|
+
n_tasks = MIN(p->n_tasks, n_threads);
|
15607
|
+
}
|
15608
|
+
} break;
|
15609
|
+
case GGML_OP_MAP_CUSTOM2:
|
15610
|
+
{
|
15611
|
+
struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
|
15612
|
+
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
15613
|
+
n_tasks = n_threads;
|
15614
|
+
} else {
|
15615
|
+
n_tasks = MIN(p->n_tasks, n_threads);
|
15616
|
+
}
|
15617
|
+
} break;
|
15618
|
+
case GGML_OP_MAP_CUSTOM3:
|
15619
|
+
{
|
15620
|
+
struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
|
15621
|
+
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
15622
|
+
n_tasks = n_threads;
|
15623
|
+
} else {
|
15624
|
+
n_tasks = MIN(p->n_tasks, n_threads);
|
15625
|
+
}
|
15626
|
+
} break;
|
15627
|
+
case GGML_OP_CROSS_ENTROPY_LOSS:
|
15628
|
+
{
|
15629
|
+
n_tasks = n_threads;
|
15630
|
+
} break;
|
15631
|
+
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
15632
|
+
{
|
15633
|
+
n_tasks = n_threads;
|
15634
|
+
} break;
|
15635
|
+
case GGML_OP_NONE:
|
15636
|
+
{
|
15637
|
+
n_tasks = 1;
|
15638
|
+
} break;
|
15639
|
+
case GGML_OP_COUNT:
|
15640
|
+
{
|
15641
|
+
GGML_ASSERT(false);
|
15642
|
+
} break;
|
15643
|
+
default:
|
15644
|
+
{
|
15645
|
+
printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
|
15646
|
+
GGML_ASSERT(false);
|
15647
|
+
} break;
|
15648
|
+
}
|
15649
|
+
|
15650
|
+
assert(n_tasks > 0);
|
16178
15651
|
|
16179
|
-
|
16180
|
-
node->perf_cycles += cycles_cur;
|
16181
|
-
node->perf_time_us += time_us_cur;
|
15652
|
+
return n_tasks;
|
16182
15653
|
}
|
16183
15654
|
|
16184
15655
|
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
@@ -16187,7 +15658,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
     const struct ggml_cplan  * cplan  = state->shared->cplan;
 
-    const int * n_tasks_arr = cplan->n_tasks;
     const int   n_threads   = state->shared->n_threads;
 
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16212,9 +15682,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (node_n != -1) {
             /* FINALIZE */
-            struct ggml_tensor * node = cgraph->nodes[node_n];
+            struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
-                params.nth = n_tasks_arr[node_n];
+                params.nth = ggml_get_n_tasks(node, n_threads);
                 ggml_compute_forward(&params, node);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16225,7 +15695,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
 
             struct ggml_tensor * node = cgraph->nodes[node_n];
-            const int n_tasks = n_tasks_arr[node_n];
+            const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
             state->shared->perf_node_start_cycles  = ggml_perf_cycles();
             state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -16283,7 +15753,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         /* COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
-        const int n_tasks = n_tasks_arr[node_n];
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
             /*.type  =*/ GGML_TASK_COMPUTE,
@@ -16317,121 +15787,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16317
15787
|
|
16318
15788
|
struct ggml_tensor * node = cgraph->nodes[i];
|
16319
15789
|
|
15790
|
+
size_t cur = 0;
|
15791
|
+
|
16320
15792
|
switch (node->op) {
|
16321
15793
|
case GGML_OP_CPY:
|
16322
15794
|
case GGML_OP_DUP:
|
16323
15795
|
{
|
16324
15796
|
n_tasks = n_threads;
|
16325
15797
|
|
16326
|
-
size_t cur = 0;
|
16327
15798
|
if (ggml_is_quantized(node->type)) {
|
16328
15799
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
16329
15800
|
}
|
16330
|
-
|
16331
|
-
work_size = MAX(work_size, cur);
|
16332
15801
|
} break;
|
16333
15802
|
case GGML_OP_ADD:
|
16334
15803
|
case GGML_OP_ADD1:
|
16335
15804
|
{
|
16336
15805
|
n_tasks = n_threads;
|
16337
15806
|
|
16338
|
-
size_t cur = 0;
|
16339
|
-
|
16340
15807
|
if (ggml_is_quantized(node->src[0]->type)) {
|
16341
15808
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
16342
15809
|
}
|
16343
|
-
|
16344
|
-
work_size = MAX(work_size, cur);
|
16345
15810
|
} break;
|
16346
15811
|
case GGML_OP_ACC:
|
16347
15812
|
{
|
16348
15813
|
n_tasks = n_threads;
|
16349
15814
|
|
16350
|
-
size_t cur = 0;
|
16351
|
-
|
16352
15815
|
if (ggml_is_quantized(node->src[0]->type)) {
|
16353
15816
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
|
16354
15817
|
}
|
16355
|
-
|
16356
|
-
work_size = MAX(work_size, cur);
|
16357
|
-
} break;
|
16358
|
-
case GGML_OP_SUB:
|
16359
|
-
case GGML_OP_DIV:
|
16360
|
-
case GGML_OP_SQR:
|
16361
|
-
case GGML_OP_SQRT:
|
16362
|
-
case GGML_OP_LOG:
|
16363
|
-
case GGML_OP_SUM:
|
16364
|
-
case GGML_OP_SUM_ROWS:
|
16365
|
-
case GGML_OP_MEAN:
|
16366
|
-
case GGML_OP_ARGMAX:
|
16367
|
-
case GGML_OP_REPEAT:
|
16368
|
-
case GGML_OP_REPEAT_BACK:
|
16369
|
-
{
|
16370
|
-
n_tasks = 1;
|
16371
|
-
} break;
|
16372
|
-
|
16373
|
-
case GGML_OP_UNARY:
|
16374
|
-
{
|
16375
|
-
switch (ggml_get_unary_op(node)) {
|
16376
|
-
case GGML_UNARY_OP_ABS:
|
16377
|
-
case GGML_UNARY_OP_SGN:
|
16378
|
-
case GGML_UNARY_OP_NEG:
|
16379
|
-
case GGML_UNARY_OP_STEP:
|
16380
|
-
case GGML_UNARY_OP_TANH:
|
16381
|
-
case GGML_UNARY_OP_ELU:
|
16382
|
-
case GGML_UNARY_OP_RELU:
|
16383
|
-
{
|
16384
|
-
n_tasks = 1;
|
16385
|
-
} break;
|
16386
|
-
|
16387
|
-
case GGML_UNARY_OP_GELU:
|
16388
|
-
case GGML_UNARY_OP_GELU_QUICK:
|
16389
|
-
case GGML_UNARY_OP_SILU:
|
16390
|
-
{
|
16391
|
-
n_tasks = n_threads;
|
16392
|
-
} break;
|
16393
|
-
}
|
16394
15818
|
} break;
|
16395
|
-
case GGML_OP_SILU_BACK:
|
16396
|
-
case GGML_OP_MUL:
|
16397
|
-
case GGML_OP_NORM:
|
16398
|
-
case GGML_OP_RMS_NORM:
|
16399
|
-
case GGML_OP_RMS_NORM_BACK:
|
16400
|
-
case GGML_OP_GROUP_NORM:
|
16401
|
-
{
|
16402
|
-
n_tasks = n_threads;
|
16403
|
-
} break;
|
16404
|
-
case GGML_OP_CONCAT:
|
16405
15819
|
case GGML_OP_MUL_MAT:
|
16406
15820
|
{
|
16407
|
-
n_tasks = n_threads;
|
16408
|
-
|
16409
|
-
// TODO: use different scheduling for different matrix sizes
|
16410
|
-
//const int nr0 = ggml_nrows(node->src[0]);
|
16411
|
-
//const int nr1 = ggml_nrows(node->src[1]);
|
16412
|
-
|
16413
|
-
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
16414
|
-
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
|
16415
|
-
|
16416
|
-
size_t cur = 0;
|
16417
15821
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
16418
15822
|
|
16419
|
-
#if defined(GGML_USE_CUBLAS)
|
16420
|
-
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
|
16421
|
-
n_tasks = 1; // TODO: this actually is doing nothing
|
16422
|
-
// the threads are still spinning
|
16423
|
-
} else
|
16424
|
-
#elif defined(GGML_USE_CLBLAST)
|
15823
|
+
#if defined(GGML_USE_CLBLAST)
|
16425
15824
|
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
16426
|
-
n_tasks = 1; // TODO: this actually is doing nothing
|
16427
|
-
// the threads are still spinning
|
16428
15825
|
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
16429
15826
|
} else
|
16430
15827
|
#endif
|
16431
15828
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
16432
15829
|
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
|
16433
|
-
n_tasks = 1; // TODO: this actually is doing nothing
|
16434
|
-
// the threads are still spinning
|
16435
15830
|
if (node->src[0]->type != GGML_TYPE_F32) {
|
16436
15831
|
// here we need memory just for single 2D matrix from src0
|
16437
15832
|
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
|
@@ -16440,108 +15835,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16440
15835
|
#endif
|
16441
15836
|
if (node->src[1]->type != vec_dot_type) {
|
16442
15837
|
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
|
16443
|
-
} else {
|
16444
|
-
cur = 0;
|
16445
15838
|
}
|
16446
|
-
|
16447
|
-
work_size = MAX(work_size, cur);
|
16448
15839
|
} break;
|
16449
15840
|
case GGML_OP_OUT_PROD:
|
16450
15841
|
{
|
16451
15842
|
n_tasks = n_threads;
|
16452
15843
|
|
16453
|
-
size_t cur = 0;
|
16454
|
-
|
16455
15844
|
if (ggml_is_quantized(node->src[0]->type)) {
|
16456
15845
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
16457
15846
|
}
|
16458
|
-
|
16459
|
-
work_size = MAX(work_size, cur);
|
16460
|
-
} break;
|
16461
|
-
case GGML_OP_SCALE:
|
16462
|
-
{
|
16463
|
-
n_tasks = 1;
|
16464
|
-
} break;
|
16465
|
-
case GGML_OP_SET:
|
16466
|
-
case GGML_OP_CONT:
|
16467
|
-
case GGML_OP_RESHAPE:
|
16468
|
-
case GGML_OP_VIEW:
|
16469
|
-
case GGML_OP_PERMUTE:
|
16470
|
-
case GGML_OP_TRANSPOSE:
|
16471
|
-
case GGML_OP_GET_ROWS:
|
16472
|
-
case GGML_OP_GET_ROWS_BACK:
|
16473
|
-
case GGML_OP_DIAG:
|
16474
|
-
{
|
16475
|
-
n_tasks = 1;
|
16476
|
-
} break;
|
16477
|
-
case GGML_OP_DIAG_MASK_ZERO:
|
16478
|
-
case GGML_OP_DIAG_MASK_INF:
|
16479
|
-
case GGML_OP_SOFT_MAX:
|
16480
|
-
case GGML_OP_SOFT_MAX_BACK:
|
16481
|
-
case GGML_OP_ROPE:
|
16482
|
-
case GGML_OP_ROPE_BACK:
|
16483
|
-
case GGML_OP_ADD_REL_POS:
|
16484
|
-
{
|
16485
|
-
n_tasks = n_threads;
|
16486
|
-
} break;
|
16487
|
-
case GGML_OP_ALIBI:
|
16488
|
-
{
|
16489
|
-
n_tasks = 1; //TODO
|
16490
|
-
} break;
|
16491
|
-
case GGML_OP_CLAMP:
|
16492
|
-
{
|
16493
|
-
n_tasks = 1; //TODO
|
16494
|
-
} break;
|
16495
|
-
case GGML_OP_CONV_1D:
|
16496
|
-
{
|
16497
|
-
n_tasks = n_threads;
|
16498
|
-
|
16499
|
-
GGML_ASSERT(node->src[0]->ne[3] == 1);
|
16500
|
-
GGML_ASSERT(node->src[1]->ne[2] == 1);
|
16501
|
-
GGML_ASSERT(node->src[1]->ne[3] == 1);
|
16502
|
-
|
16503
|
-
const int64_t ne00 = node->src[0]->ne[0];
|
16504
|
-
const int64_t ne01 = node->src[0]->ne[1];
|
16505
|
-
const int64_t ne02 = node->src[0]->ne[2];
|
16506
|
-
|
16507
|
-
const int64_t ne10 = node->src[1]->ne[0];
|
16508
|
-
const int64_t ne11 = node->src[1]->ne[1];
|
16509
|
-
|
16510
|
-
const int64_t ne0 = node->ne[0];
|
16511
|
-
const int64_t ne1 = node->ne[1];
|
16512
|
-
const int64_t nk = ne00;
|
16513
|
-
const int64_t ew0 = nk * ne01;
|
16514
|
-
|
16515
|
-
UNUSED(ne02);
|
16516
|
-
UNUSED(ne10);
|
16517
|
-
UNUSED(ne11);
|
16518
|
-
|
16519
|
-
size_t cur = 0;
|
16520
|
-
|
16521
|
-
if (node->src[0]->type == GGML_TYPE_F16 &&
|
16522
|
-
node->src[1]->type == GGML_TYPE_F32) {
|
16523
|
-
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
|
16524
|
-
} else if (node->src[0]->type == GGML_TYPE_F32 &&
|
16525
|
-
node->src[1]->type == GGML_TYPE_F32) {
|
16526
|
-
cur = sizeof(float)*(ne0*ne1*ew0);
|
16527
|
-
} else {
|
16528
|
-
GGML_ASSERT(false);
|
16529
|
-
}
|
16530
|
-
|
16531
|
-
work_size = MAX(work_size, cur);
|
16532
|
-
} break;
|
16533
|
-
case GGML_OP_CONV_1D_STAGE_0:
|
16534
|
-
{
|
16535
|
-
n_tasks = n_threads;
|
16536
|
-
} break;
|
16537
|
-
case GGML_OP_CONV_1D_STAGE_1:
|
16538
|
-
{
|
16539
|
-
n_tasks = n_threads;
|
16540
15847
|
} break;
|
16541
15848
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
16542
15849
|
{
|
16543
|
-
n_tasks = n_threads;
|
16544
|
-
|
16545
15850
|
GGML_ASSERT(node->src[0]->ne[3] == 1);
|
16546
15851
|
GGML_ASSERT(node->src[1]->ne[2] == 1);
|
16547
15852
|
GGML_ASSERT(node->src[1]->ne[3] == 1);
|
@@ -16553,7 +15858,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16553
15858
|
const int64_t ne10 = node->src[1]->ne[0]; // L
|
16554
15859
|
const int64_t ne11 = node->src[1]->ne[1]; // Cin
|
16555
15860
|
|
16556
|
-
size_t cur = 0;
|
16557
15861
|
if (node->src[0]->type == GGML_TYPE_F16 &&
|
16558
15862
|
node->src[1]->type == GGML_TYPE_F32) {
|
16559
15863
|
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
|
@@ -16565,59 +15869,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16565
15869
|
} else {
|
16566
15870
|
GGML_ASSERT(false);
|
16567
15871
|
}
|
16568
|
-
|
16569
|
-
work_size = MAX(work_size, cur);
|
16570
|
-
} break;
|
16571
|
-
case GGML_OP_CONV_2D:
|
16572
|
-
{
|
16573
|
-
n_tasks = n_threads;
|
16574
|
-
|
16575
|
-
const int64_t ne00 = node->src[0]->ne[0]; // W
|
16576
|
-
const int64_t ne01 = node->src[0]->ne[1]; // H
|
16577
|
-
const int64_t ne02 = node->src[0]->ne[2]; // C
|
16578
|
-
const int64_t ne03 = node->src[0]->ne[3]; // N
|
16579
|
-
|
16580
|
-
const int64_t ne10 = node->src[1]->ne[0]; // W
|
16581
|
-
const int64_t ne11 = node->src[1]->ne[1]; // H
|
16582
|
-
const int64_t ne12 = node->src[1]->ne[2]; // C
|
16583
|
-
|
16584
|
-
const int64_t ne0 = node->ne[0];
|
16585
|
-
const int64_t ne1 = node->ne[1];
|
16586
|
-
const int64_t ne2 = node->ne[2];
|
16587
|
-
const int64_t ne3 = node->ne[3];
|
16588
|
-
const int64_t nk = ne00*ne01;
|
16589
|
-
const int64_t ew0 = nk * ne02;
|
16590
|
-
|
16591
|
-
UNUSED(ne03);
|
16592
|
-
UNUSED(ne2);
|
16593
|
-
|
16594
|
-
size_t cur = 0;
|
16595
|
-
|
16596
|
-
if (node->src[0]->type == GGML_TYPE_F16 &&
|
16597
|
-
node->src[1]->type == GGML_TYPE_F32) {
|
16598
|
-
// im2col: [N*OH*OW, IC*KH*KW]
|
16599
|
-
cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
|
16600
|
-
} else if (node->src[0]->type == GGML_TYPE_F32 &&
|
16601
|
-
node->src[1]->type == GGML_TYPE_F32) {
|
16602
|
-
cur = sizeof(float)* (ne10*ne11*ne12);
|
16603
|
-
} else {
|
16604
|
-
GGML_ASSERT(false);
|
16605
|
-
}
|
16606
|
-
|
16607
|
-
work_size = MAX(work_size, cur);
|
16608
|
-
} break;
|
16609
|
-
case GGML_OP_CONV_2D_STAGE_0:
|
16610
|
-
{
|
16611
|
-
n_tasks = n_threads;
|
16612
15872
|
} break;
|
16613
|
-
case GGML_OP_CONV_2D_STAGE_1:
|
15873
|
+
case GGML_OP_IM2COL:
|
16614
15874
|
{
|
16615
15875
|
n_tasks = n_threads;
|
16616
15876
|
} break;
|
16617
15877
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
16618
15878
|
{
|
16619
|
-
n_tasks = n_threads;
|
16620
|
-
|
16621
15879
|
const int64_t ne00 = node->src[0]->ne[0]; // W
|
16622
15880
|
const int64_t ne01 = node->src[0]->ne[1]; // H
|
16623
15881
|
const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
|
@@ -16627,141 +15885,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16627
15885
|
const int64_t ne11 = node->src[1]->ne[1]; // H
|
16628
15886
|
const int64_t ne12 = node->src[1]->ne[2]; // Channels In
|
16629
15887
|
|
16630
|
-
size_t cur = 0;
|
16631
15888
|
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
|
16632
15889
|
cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
|
16633
|
-
|
16634
|
-
work_size = MAX(work_size, cur);
|
16635
|
-
} break;
|
16636
|
-
case GGML_OP_POOL_1D:
|
16637
|
-
case GGML_OP_POOL_2D:
|
16638
|
-
{
|
16639
|
-
n_tasks = 1;
|
16640
|
-
} break;
|
16641
|
-
case GGML_OP_UPSCALE:
|
16642
|
-
{
|
16643
|
-
n_tasks = n_threads;
|
16644
15890
|
} break;
|
16645
15891
|
case GGML_OP_FLASH_ATTN:
|
16646
15892
|
{
|
16647
15893
|
n_tasks = n_threads;
|
16648
15894
|
|
16649
|
-
size_t cur = 0;
|
16650
|
-
|
16651
15895
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
16652
15896
|
|
16653
15897
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
16654
15898
|
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
16655
15899
|
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
16656
|
-
}
|
16657
|
-
|
16658
|
-
if (node->src[1]->type == GGML_TYPE_F16) {
|
15900
|
+
} else if (node->src[1]->type == GGML_TYPE_F16) {
|
16659
15901
|
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
16660
15902
|
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
16661
15903
|
}
|
16662
|
-
|
16663
|
-
work_size = MAX(work_size, cur);
|
16664
15904
|
} break;
|
16665
15905
|
case GGML_OP_FLASH_FF:
|
16666
15906
|
{
|
16667
15907
|
n_tasks = n_threads;
|
16668
15908
|
|
16669
|
-
size_t cur = 0;
|
16670
|
-
|
16671
15909
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
16672
15910
|
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
16673
15911
|
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
16674
|
-
}
|
16675
|
-
|
16676
|
-
if (node->src[1]->type == GGML_TYPE_F16) {
|
15912
|
+
} else if (node->src[1]->type == GGML_TYPE_F16) {
|
16677
15913
|
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
16678
15914
|
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
16679
15915
|
}
|
16680
|
-
|
16681
|
-
work_size = MAX(work_size, cur);
|
16682
15916
|
} break;
|
16683
15917
|
case GGML_OP_FLASH_ATTN_BACK:
|
16684
15918
|
{
|
16685
15919
|
n_tasks = n_threads;
|
16686
15920
|
|
16687
|
-
size_t cur = 0;
|
16688
|
-
|
16689
15921
|
const int64_t D = node->src[0]->ne[0];
|
16690
15922
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
16691
15923
|
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
16692
15924
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
16693
15925
|
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
|
16694
15926
|
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
|
16695
|
-
}
|
16696
|
-
|
16697
|
-
if (node->src[1]->type == GGML_TYPE_F16) {
|
15927
|
+
} else if (node->src[1]->type == GGML_TYPE_F16) {
|
16698
15928
|
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
|
16699
15929
|
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
|
16700
15930
|
}
|
16701
|
-
|
16702
|
-
work_size = MAX(work_size, cur);
|
16703
|
-
} break;
|
16704
|
-
case GGML_OP_WIN_PART:
|
16705
|
-
case GGML_OP_WIN_UNPART:
|
16706
|
-
case GGML_OP_GET_REL_POS:
|
16707
|
-
case GGML_OP_MAP_UNARY:
|
16708
|
-
case GGML_OP_MAP_BINARY:
|
16709
|
-
case GGML_OP_MAP_CUSTOM1_F32:
|
16710
|
-
case GGML_OP_MAP_CUSTOM2_F32:
|
16711
|
-
case GGML_OP_MAP_CUSTOM3_F32:
|
16712
|
-
{
|
16713
|
-
n_tasks = 1;
|
16714
|
-
} break;
|
16715
|
-
case GGML_OP_MAP_CUSTOM1:
|
16716
|
-
{
|
16717
|
-
struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
|
16718
|
-
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
16719
|
-
n_tasks = n_threads;
|
16720
|
-
} else {
|
16721
|
-
n_tasks = MIN(p->n_tasks, n_threads);
|
16722
|
-
}
|
16723
|
-
} break;
|
16724
|
-
case GGML_OP_MAP_CUSTOM2:
|
16725
|
-
{
|
16726
|
-
struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
|
16727
|
-
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
16728
|
-
n_tasks = n_threads;
|
16729
|
-
} else {
|
16730
|
-
n_tasks = MIN(p->n_tasks, n_threads);
|
16731
|
-
}
|
16732
|
-
} break;
|
16733
|
-
case GGML_OP_MAP_CUSTOM3:
|
16734
|
-
{
|
16735
|
-
struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
|
16736
|
-
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
16737
|
-
n_tasks = n_threads;
|
16738
|
-
} else {
|
16739
|
-
n_tasks = MIN(p->n_tasks, n_threads);
|
16740
|
-
}
|
16741
15931
|
} break;
|
15932
|
+
|
16742
15933
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
16743
15934
|
{
|
16744
15935
|
n_tasks = n_threads;
|
16745
15936
|
|
16746
|
-
|
16747
|
-
|
16748
|
-
work_size = MAX(work_size, cur);
|
16749
|
-
} break;
|
16750
|
-
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
16751
|
-
{
|
16752
|
-
n_tasks = n_threads;
|
16753
|
-
} break;
|
16754
|
-
case GGML_OP_NONE:
|
16755
|
-
{
|
16756
|
-
n_tasks = 1;
|
15937
|
+
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
|
16757
15938
|
} break;
|
16758
15939
|
case GGML_OP_COUNT:
|
16759
15940
|
{
|
16760
15941
|
GGML_ASSERT(false);
|
16761
15942
|
} break;
|
15943
|
+
default:
|
15944
|
+
break;
|
16762
15945
|
}
|
16763
15946
|
|
16764
|
-
|
15947
|
+
work_size = MAX(work_size, cur);
|
16765
15948
|
}
|
16766
15949
|
|
16767
15950
|
if (work_size > 0) {
|
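Note: in the rewritten ggml_graph_plan() the per-node switch above only estimates the scratch requirement: `cur` is declared once per node, each op either fills it in or leaves it at 0, and the per-case `work_size = MAX(work_size, cur);` copies are replaced by a single accumulation after the switch. A condensed sketch of the resulting control flow, assuming the MIN/MAX macros defined earlier in this file (not the full op list):

    static size_t plan_work_size_sketch(const struct ggml_cgraph * cgraph, int n_threads) {
        size_t work_size = 0;
        for (int i = 0; i < cgraph->n_nodes; i++) {
            const struct ggml_tensor * node = cgraph->nodes[i];
            int    n_tasks = 1;
            size_t cur     = 0;   // scratch needed by this node; 0 for most ops
            switch (node->op) {
                case GGML_OP_ADD:
                case GGML_OP_ADD1:
                    n_tasks = n_threads;
                    if (ggml_is_quantized(node->src[0]->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                    }
                    break;
                // ... each remaining op estimates its own `cur` ...
                default:
                    break;
            }
            work_size = MAX(work_size, cur);   // single accumulation point now
        }
        return work_size;
    }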
@@ -16783,12 +15966,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
16783
15966
|
if (cplan->work_size > 0) {
|
16784
15967
|
GGML_ASSERT(cplan->work_data);
|
16785
15968
|
}
|
16786
|
-
|
16787
|
-
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
16788
|
-
if (cgraph->nodes[i]->op != GGML_OP_NONE) {
|
16789
|
-
GGML_ASSERT(cplan->n_tasks[i] > 0);
|
16790
|
-
}
|
16791
|
-
}
|
16792
15969
|
}
|
16793
15970
|
|
16794
15971
|
const int n_threads = cplan->n_threads;
|
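Note: ggml_graph_compute() no longer checks a per-node n_tasks array (the removed assertion loop above); the plan it receives only has to carry a work buffer when work_size is non-zero. A minimal caller-side sketch, assuming a finished graph `gf`:

    #include <stdlib.h>
    #include "ggml.h"

    // run a graph on n_threads, supplying the scratch buffer the plan asks for
    static void compute_graph(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (cplan.work_size > 0) {
            work = malloc(cplan.work_size);
            cplan.work_data = work;
        }

        ggml_graph_compute(gf, &cplan);

        free(work);
    }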
@@ -16861,16 +16038,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
16861
16038
|
return compute_status;
|
16862
16039
|
}
|
16863
16040
|
|
16864
|
-
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
|
16865
|
-
for (int i = 0; i < cgraph->n_nodes; i++) {
|
16866
|
-
struct ggml_tensor * grad = cgraph->grads[i];
|
16867
|
-
|
16868
|
-
if (grad) {
|
16869
|
-
ggml_set_zero(grad);
|
16870
|
-
}
|
16871
|
-
}
|
16872
|
-
}
|
16873
|
-
|
16874
16041
|
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
|
16875
16042
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
|
16876
16043
|
|
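Note: the removed lines drop the gradient-zeroing ggml_graph_reset() from this spot in the file, while ggml_graph_compute_with_ctx() remains the convenience path that builds the plan and takes its work buffer from the supplied context. Usage sketch, assuming a tensor `result_tensor` already built in `ctx`:

    // let ggml size and allocate the work buffer inside `ctx`
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, result_tensor);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);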
@@ -16997,12 +16164,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
16997
16164
|
const uint32_t magic = GGML_FILE_MAGIC;
|
16998
16165
|
const uint32_t version = GGML_FILE_VERSION;
|
16999
16166
|
const uint32_t n_leafs = cgraph->n_leafs;
|
17000
|
-
const uint32_t
|
16167
|
+
const uint32_t n_nodes = cgraph->n_nodes;
|
17001
16168
|
|
17002
16169
|
fwrite(&magic, sizeof(uint32_t), 1, fout);
|
17003
16170
|
fwrite(&version, sizeof(uint32_t), 1, fout);
|
17004
16171
|
fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
|
17005
|
-
fwrite(&
|
16172
|
+
fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
|
17006
16173
|
fwrite(&size_eval, sizeof(uint64_t), 1, fout);
|
17007
16174
|
}
|
17008
16175
|
|
@@ -17090,7 +16257,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
17090
16257
|
if (idx == -1) {
|
17091
16258
|
for (int k = 0; k < cgraph->n_nodes; ++k) {
|
17092
16259
|
if (args[j] == cgraph->nodes[k]) {
|
17093
|
-
idx =
|
16260
|
+
idx = cgraph->n_leafs + k;
|
17094
16261
|
break;
|
17095
16262
|
}
|
17096
16263
|
}
|
@@ -17117,11 +16284,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
17117
16284
|
}
|
17118
16285
|
}
|
17119
16286
|
|
17120
|
-
struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
|
16287
|
+
struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
|
17121
16288
|
assert(*ctx_data == NULL);
|
17122
16289
|
assert(*ctx_eval == NULL);
|
17123
16290
|
|
17124
|
-
struct ggml_cgraph result =
|
16291
|
+
struct ggml_cgraph * result = NULL;
|
17125
16292
|
|
17126
16293
|
struct ggml_tensor * data = NULL;
|
17127
16294
|
|
@@ -17193,13 +16360,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
17193
16360
|
const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
|
17194
16361
|
const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
|
17195
16362
|
const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
|
17196
|
-
|
17197
|
-
result.n_leafs = n_leafs;
|
17198
|
-
result.n_nodes = n_nodes;
|
16363
|
+
const int graph_size = MAX(n_leafs, n_nodes);
|
17199
16364
|
|
17200
16365
|
// create the data context
|
17201
16366
|
{
|
17202
|
-
const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
|
16367
|
+
const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
|
17203
16368
|
|
17204
16369
|
struct ggml_init_params params = {
|
17205
16370
|
.mem_size = size_eval + overhead,
|
@@ -17215,6 +16380,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
17215
16380
|
}
|
17216
16381
|
}
|
17217
16382
|
|
16383
|
+
result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
|
16384
|
+
|
16385
|
+
result->n_leafs = n_leafs;
|
16386
|
+
result->n_nodes = n_nodes;
|
16387
|
+
|
16388
|
+
|
17218
16389
|
// leafs
|
17219
16390
|
{
|
17220
16391
|
uint32_t type;
|
@@ -17253,7 +16424,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
17253
16424
|
tensor->nb[j] = nb[j];
|
17254
16425
|
}
|
17255
16426
|
|
17256
|
-
result.leafs[i] = tensor;
|
16427
|
+
result->leafs[i] = tensor;
|
17257
16428
|
|
17258
16429
|
ptr += ggml_nbytes(tensor);
|
17259
16430
|
|
@@ -17305,10 +16476,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
17305
16476
|
continue;
|
17306
16477
|
}
|
17307
16478
|
|
17308
|
-
if (arg_idx <
|
17309
|
-
args[j] = result
|
16479
|
+
if (arg_idx < result->n_leafs) {
|
16480
|
+
args[j] = result->leafs[arg_idx];
|
17310
16481
|
} else {
|
17311
|
-
args[j] = result
|
16482
|
+
args[j] = result->nodes[arg_idx - result->n_leafs];
|
17312
16483
|
}
|
17313
16484
|
}
|
17314
16485
|
|
@@ -17360,7 +16531,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
17360
16531
|
tensor->src[j] = args[j];
|
17361
16532
|
}
|
17362
16533
|
|
17363
|
-
result.nodes[i] = tensor;
|
16534
|
+
result->nodes[i] = tensor;
|
17364
16535
|
|
17365
16536
|
fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
|
17366
16537
|
}
|
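Note: ggml_graph_import() now returns a graph pointer allocated with ggml_new_graph_custom() inside ctx_eval, instead of a struct ggml_cgraph by value with fixed-capacity leaf/node arrays. Caller-side sketch; "graph.ggml" is just a placeholder filename:

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    // both contexts are created by the call; the returned graph lives in ctx_eval
    struct ggml_cgraph * gf = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);

    // ... evaluate or inspect gf ...

    ggml_free(ctx_eval);
    ggml_free(ctx_data);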
@@ -18265,10 +17436,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
|
|
18265
17436
|
case GGML_OPT_ADAM:
|
18266
17437
|
{
|
18267
17438
|
result = (struct ggml_opt_params) {
|
18268
|
-
.type
|
18269
|
-
.
|
18270
|
-
.
|
18271
|
-
.
|
17439
|
+
.type = GGML_OPT_ADAM,
|
17440
|
+
.graph_size = GGML_DEFAULT_GRAPH_SIZE,
|
17441
|
+
.n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
|
17442
|
+
.past = 0,
|
17443
|
+
.delta = 1e-5f,
|
18272
17444
|
|
18273
17445
|
.max_no_improvement = 100,
|
18274
17446
|
|
@@ -18295,10 +17467,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
|
|
18295
17467
|
case GGML_OPT_LBFGS:
|
18296
17468
|
{
|
18297
17469
|
result = (struct ggml_opt_params) {
|
18298
|
-
.type
|
18299
|
-
.
|
18300
|
-
.
|
18301
|
-
.
|
17470
|
+
.type = GGML_OPT_LBFGS,
|
17471
|
+
.graph_size = GGML_DEFAULT_GRAPH_SIZE,
|
17472
|
+
.n_threads = 1,
|
17473
|
+
.past = 0,
|
17474
|
+
.delta = 1e-5f,
|
18302
17475
|
|
18303
17476
|
.max_no_improvement = 0,
|
18304
17477
|
|
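Note: ggml_opt_params gains a graph_size field (defaulting to GGML_DEFAULT_GRAPH_SIZE) because the optimizer now allocates its forward/backward graphs dynamically rather than relying on fixed-capacity graphs. A sketch of adjusting it; the thread count and the `loss` tensor are illustrative:

    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);

    params.n_threads  = 4;
    params.graph_size = 2*GGML_DEFAULT_GRAPH_SIZE;  // only needed for very large graphs

    enum ggml_opt_result res = ggml_opt(ctx, params, loss);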
@@ -18440,14 +17613,11 @@ enum ggml_opt_result ggml_opt_resume(
|
|
18440
17613
|
struct ggml_tensor * f) {
|
18441
17614
|
|
18442
17615
|
// build forward + backward compute graphs
|
18443
|
-
struct
|
18444
|
-
|
18445
|
-
|
18446
|
-
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
|
18447
|
-
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
|
17616
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
|
17617
|
+
ggml_build_forward_expand(gf, f);
|
18448
17618
|
|
18449
|
-
*
|
18450
|
-
|
17619
|
+
struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
|
17620
|
+
ggml_build_backward_expand(ctx, gf, gb, true);
|
18451
17621
|
|
18452
17622
|
return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
|
18453
17623
|
}
|
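Note: ggml_opt_resume() no longer carves cgraph objects out of raw I32 tensor buffers; it builds them with the new graph API, as the added lines show. The same pattern can be used directly wherever a forward/backward pair is needed (sketch, assuming a loss tensor `f` in context `ctx`):

    // forward graph with room for gradients
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads=*/true);
    ggml_build_forward_expand(gf, f);

    // duplicate and extend with the backward pass; `true` keeps the original gradients
    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
    ggml_build_backward_expand(ctx, gf, gb, true);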
@@ -18903,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18903
18073
|
{
|
18904
18074
|
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
18905
18075
|
|
18906
|
-
for (
|
18076
|
+
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
18907
18077
|
struct gguf_kv * kv = &ctx->kv[i];
|
18908
18078
|
|
18909
18079
|
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
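Note: the GGUF loader now iterates kv pairs and tensor infos with uint64_t indices, matching the 64-bit counts in the file header and avoiding overflow on huge or malformed files. Nothing changes for callers; a minimal read sketch, with "model.gguf" as a placeholder path:

    #include <stdio.h>
    #include "ggml.h"

    struct gguf_init_params ip = {
        /*.no_alloc =*/ false,
        /*.ctx      =*/ NULL,
    };

    struct gguf_context * gctx = gguf_init_from_file("model.gguf", ip);
    if (gctx) {
        printf("kv pairs: %d, tensors: %d\n", gguf_get_n_kv(gctx), gguf_get_n_tensors(gctx));
        gguf_free(gctx);
    }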
@@ -18950,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18950
18120
|
case GGUF_TYPE_STRING:
|
18951
18121
|
{
|
18952
18122
|
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
18953
|
-
for (
|
18123
|
+
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
18954
18124
|
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
18955
18125
|
}
|
18956
18126
|
} break;
|
@@ -18978,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18978
18148
|
{
|
18979
18149
|
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
18980
18150
|
|
18981
|
-
for (
|
18151
|
+
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
18982
18152
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
18983
18153
|
|
18984
18154
|
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
@@ -19025,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19025
18195
|
// compute the total size of the data section, taking into account the alignment
|
19026
18196
|
{
|
19027
18197
|
ctx->size = 0;
|
19028
|
-
for (
|
18198
|
+
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19029
18199
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
19030
18200
|
|
19031
18201
|
const int64_t ne =
|
@@ -19094,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19094
18264
|
ggml_set_no_alloc(ctx_data, true);
|
19095
18265
|
|
19096
18266
|
// create the tensors
|
19097
|
-
for (
|
18267
|
+
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19098
18268
|
const int64_t ne[GGML_MAX_DIMS] = {
|
19099
18269
|
ctx->infos[i].ne[0],
|
19100
18270
|
ctx->infos[i].ne[1],
|