llama_cpp 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
 #include <hbwmalloc.h>
 #endif
 
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
+    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
+
+#include <sys/wait.h>
+
+void ggml_print_backtrace(void) {
+    /*
+    #include <execinfo.h>
+    #include <dlfcn.h>
+
+    void * trace[100];
+
+    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
+
+    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
+    */
+
+    // backtrack_symbols does not show line numbers, use gdb instead
+    char attach[32];
+    snprintf(attach, sizeof(attach), "attach %d", getpid());
+    int pid = fork();
+    if (pid == 0) {
+        execlp("gdb", "gdb", "--batch",
+            "-ex", "set style enabled on",
+            "-ex", attach,
+            "-ex", "bt -frame-info source-and-location",
+            "-ex", "detach",
+            "-ex", "quit",
+            NULL);
+    } else {
+        waitpid(pid, NULL, 0);
+    }
+}
+#else
+void ggml_print_backtrace(void) {
+    // platform not supported
+}
+#endif
+
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
 // floating point type used to accumulate sums
 typedef double ggml_float;
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 //
 // global data
 //
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
+#if defined(__ARM_NEON)
+#if !defined(__aarch64__)
+
+// 64-bit compatibility
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+#endif
+#endif
+
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
 
 static const float GELU_COEF_A    = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ROPE_BACK",
     "ALIBI",
     "CLAMP",
-    "CONV_1D",
-    "CONV_1D_STAGE_0",
-    "CONV_1D_STAGE_1",
     "CONV_TRANSPOSE_1D",
-    "CONV_2D",
-    "CONV_2D_STAGE_0",
-    "CONV_2D_STAGE_1",
+    "IM2COL",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope_back(x)",
     "alibi(x)",
     "clamp(x)",
-    "conv_1d(x)",
-    "conv_1d_stage_0(x)",
-    "conv_1d_stage_1(x)",
     "conv_transpose_1d(x)",
-    "conv_2d(x)",
-    "conv_2d_stage_0(x)",
-    "conv_2d_stage_1(x)",
+    "im2col(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
         p[GGML_OP_GET_ROWS_BACK        ] = true;
         p[GGML_OP_DIAG_MASK_INF        ] = true;
         p[GGML_OP_DIAG_MASK_ZERO       ] = true;
-        p[GGML_OP_CONV_1D              ] = true;
-        p[GGML_OP_CONV_1D_STAGE_0      ] = true;
-        p[GGML_OP_CONV_1D_STAGE_1      ] = true;
         p[GGML_OP_CONV_TRANSPOSE_1D    ] = true;
-        p[GGML_OP_CONV_2D              ] = true;
-        p[GGML_OP_CONV_2D_STAGE_0      ] = true;
-        p[GGML_OP_CONV_2D_STAGE_1      ] = true;
         p[GGML_OP_CONV_TRANSPOSE_2D    ] = true;
         p[GGML_OP_FLASH_ATTN_BACK      ] = true;
         p[GGML_OP_CROSS_ENTROPY_LOSS   ] = true;
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }
 
+// ggml_leaky
+
+struct ggml_tensor * ggml_leaky(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
+}
+
 // ggml_gelu
 
 struct ggml_tensor * ggml_gelu(
@@ -4970,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
+        int                   n_orig_ctx,
         float                 freq_base,
         float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
         float                 xpos_base,
         bool                  xpos_down) {
     GGML_ASSERT(ggml_is_vector(b));
@@ -4988,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
-    memcpy(params + 5, &freq_scale, sizeof(float));
-    memcpy(params + 6, &xpos_base,  sizeof(float));
-    memcpy(params + 7, &xpos_down,  sizeof(bool));
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    memcpy(params +  5, &freq_base,   sizeof(float));
+    memcpy(params +  6, &freq_scale,  sizeof(float));
+    memcpy(params +  7, &ext_factor,  sizeof(float));
+    memcpy(params +  8, &attn_factor, sizeof(float));
+    memcpy(params +  9, &beta_fast,   sizeof(float));
+    memcpy(params + 10, &beta_slow,   sizeof(float));
+    memcpy(params + 11, &xpos_base,   sizeof(float));
+    memcpy(params + 12, &xpos_down,   sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE_BACK;
@@ -5067,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
     return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
 }
 
-// im2col: [N, IC, IL] => [N, OL, IC*K]
-// a: [OC,IC, K]
-// b: [N, IC, IL]
-// result: [N, OL, IC*K]
-static struct ggml_tensor * ggml_conv_1d_stage_0(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b,
-    int                   s0,
-    int                   p0,
-    int                   d0) {
-    GGML_ASSERT(a->ne[1] == b->ne[1]);
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
-
-    const int64_t ne[4] = {
-        a->ne[1] * a->ne[0],
-        OL,
-        b->ne[2],
-        1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
-
-    int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_CONV_1D_STAGE_0;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_conv_1d_stage_1
-
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// a: [OC, IC, K]
-// b: [N, OL, IC * K]
-// result: [N, OC, OL]
-static struct ggml_tensor * ggml_conv_1d_stage_1(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b) {
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        b->ne[1],
-        a->ne[2],
-        b->ne[2],
-        1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op = GGML_OP_CONV_1D_STAGE_1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_conv_1d
-
 GGML_API struct ggml_tensor * ggml_conv_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -5150,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         int                   s0,
         int                   p0,
         int                   d0) {
-    struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
-    result = ggml_conv_1d_stage_1(ctx, a, result);
-    return result;
-}
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
 
-
-
-//
-//
-//    int                   s0,
-//    int                   p0,
-//    int                   d0) {
-//    GGML_ASSERT(ggml_is_matrix(b));
-//    GGML_ASSERT(a->ne[1] == b->ne[1]);
-//    bool is_node = false;
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
+                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC,IC, K] => [OC, IC * K]
 
-
-//        GGML_ASSERT(false); // TODO: implement backward
-//        is_node = true;
-//    }
+    result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
 
-
-
-//        a->ne[2], 1, 1,
-//    };
-//    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
-
-//    int32_t params[] = { s0, p0, d0 };
-//    ggml_set_op_params(result, params, sizeof(params));
-
-//    result->op = GGML_OP_CONV_1D;
-//    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-//    result->src[0] = a;
-//    result->src[1] = b;
-
-//    return result;
-//}
+    return result;
+}
 
 // ggml_conv_1d_ph
 
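Note: the hunk above drops the fused CONV_1D path in favor of ggml_im2col followed by a plain ggml_mul_mat plus reshapes. The following is a minimal standalone sketch of that im2col + GEMM decomposition in plain C; the helper names and toy sizes are ours, not ggml's:

#include <stdio.h>

/* Toy 1D im2col + GEMM, mirroring the decomposition used in the hunk above:
 * conv1d(kernel[OC][IC][K], signal[IC][IL]) becomes a matrix product between
 * kernel flattened to [OC][IC*K] and cols[OL][IC*K]. Illustrative only. */
#define IC 2              /* input channels  */
#define OC 1              /* output channels */
#define K  3              /* kernel width    */
#define IL 5              /* input length    */
#define OL (IL - K + 1)   /* output length: stride 1, no padding/dilation */

int main(void) {
    const float kernel[OC][IC][K] = {{{1, 0, -1}, {0.5f, 0.5f, 0.5f}}};
    const float signal[IC][IL]    = {{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}};

    /* im2col: unroll each receptive field into one row of cols[OL][IC*K] */
    float cols[OL][IC * K];
    for (int ol = 0; ol < OL; ol++)
        for (int ic = 0; ic < IC; ic++)
            for (int k = 0; k < K; k++)
                cols[ol][ic*K + k] = signal[ic][ol + k];

    /* GEMM: out[oc][ol] = dot(flattened kernel row, cols row) */
    float out[OC][OL];
    for (int oc = 0; oc < OC; oc++) {
        const float * krow = &kernel[oc][0][0]; /* [IC*K], same layout as a cols row */
        for (int ol = 0; ol < OL; ol++) {
            float acc = 0.0f;
            for (int i = 0; i < IC * K; i++) acc += krow[i] * cols[ol][i];
            out[oc][ol] = acc;
        }
    }

    for (int ol = 0; ol < OL; ol++) printf("%g ", out[0][ol]);
    printf("\n");
    return 0;
}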
@@ -5249,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
 // a: [OC,IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OH, OW, IC*KH*KW]
-static struct ggml_tensor * ggml_conv_2d_stage_0(
+struct ggml_tensor * ggml_im2col(
     struct ggml_context * ctx,
     struct ggml_tensor  * a,
     struct ggml_tensor  * b,
@@ -5258,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
     int                   p0,
     int                   p1,
     int                   d0,
-    int                   d1) {
+    int                   d1,
+    bool                  is_2D) {
 
-    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    if(is_2D) {
+        GGML_ASSERT(a->ne[2] == b->ne[2]);
+    } else {
+        GGML_ASSERT(a->ne[1] == b->ne[1]);
+    }
     bool is_node = false;
 
     if (a->grad || b->grad) {
@@ -5268,81 +5234,51 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
         is_node = true;
     }
 
-    const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
-    const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
+    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
 
     const int64_t ne[4] = {
-        a->ne[2] * a->ne[1] * a->ne[0],
+        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
         OW,
-        OH,
-        b->ne[3],
+        is_2D ? OH : b->ne[2],
+        is_2D ? b->ne[3] : 1,
     };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
 
-    int32_t params[] = { s0, s1, p0, p1, d0, d1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op = GGML_OP_CONV_2D_STAGE_0;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-
-}
-
-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// a: [OC, IC, KH, KW]
-// b: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static struct ggml_tensor * ggml_conv_2d_stage_1(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b) {
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        b->ne[1],
-        b->ne[2],
-        a->ne[3],
-        b->ne[3],
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op = GGML_OP_CONV_2D_STAGE_1;
+    result->op = GGML_OP_IM2COL;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
 
     return result;
-
 }
 
 // a: [OC,IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OC, OH, OW]
 struct ggml_tensor * ggml_conv_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                  s0,
-        int                  s1,
-        int                  p0,
-        int                  p1,
-        int                  d0,
-        int                  d1) {
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                  s0,
+        int                  s1,
+        int                  p0,
+        int                  p1,
+        int                  d0,
+        int                  d1) {
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
 
-    struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
-    result = ggml_conv_2d_stage_1(ctx, a, result);
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
+                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3]));                       // [OC,IC, KH, KW] => [OC, IC * KH * KW]
 
-    return result;
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
 
+    return result;
 }
 
 // ggml_conv_2d_sk_p0
@@ -5402,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
 
 // ggml_pool_*
 
-static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
+static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
     return (ins + 2 * p - ks) / s + 1;
 }
 
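Note: ggml_calc_conv_output_size (used by the im2col hunks above) and ggml_calc_pool_output_size implement the standard output-length formulas; the hunk above only widens the pool padding from int to float, the formulas are unchanged. With input length $L$, kernel size $k$, stride $s$, padding $p$ and dilation $d$:

$$L_{\mathrm{conv}} = \left\lfloor \frac{L + 2p - d(k-1) - 1}{s} \right\rfloor + 1, \qquad L_{\mathrm{pool}} = \left\lfloor \frac{L + 2p - k}{s} \right\rfloor + 1$$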
@@ -5449,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
         int                   k1,
         int                   s0,
         int                   s1,
-        int                   p0,
-        int                   p1) {
+        float                 p0,
+        float                 p1) {
 
     bool is_node = false;
 
@@ -8912,6 +8848,48 @@ static void ggml_compute_forward_silu(
     }
 }
 
+// ggml_compute_forward_leaky
+
+static void ggml_compute_forward_leaky_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_leaky_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_leaky(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_leaky_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_silu_back
 
 static void ggml_compute_forward_silu_back_f32(
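Note: the new LEAKY unary op computes y = x for x > 0 and y = 0.1*x otherwise; the 0.1 negative slope is hard-coded in ggml_vec_leaky_f32 (see the earlier hunk). A minimal standalone check of the same element-wise rule, outside any ggml context:

#include <stdio.h>

/* Element-wise leaky ReLU with the 0.1 slope hard-coded, matching the
 * ggml_vec_leaky_f32 definition added in this diff. */
static void vec_leaky_f32(int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i];
}

int main(void) {
    const float x[4] = {-2.0f, -0.5f, 0.0f, 3.0f};
    float y[4];
    vec_leaky_f32(4, y, x);
    for (int i = 0; i < 4; ++i) printf("%g ", y[i]); /* -0.2 -0.05 0 3 */
    printf("\n");
    return 0;
}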
@@ -9395,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
+        src0->type == GGML_TYPE_F32 &&
+        src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9433,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -10974,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
@@ -11033,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
     const int32_t * pos = (const int32_t *) src1->data;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11049,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
             float block_theta = MAX(p - (n_ctx - 2), 0);
             for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                 const float cos_theta = cosf(theta_base);
-                const float sin_theta = sinf(theta_base);
+                const float sin_theta = sinf(theta_base) * sin_sign;
                 const float cos_block_theta = cosf(block_theta);
-                const float sin_block_theta = sinf(block_theta);
+                const float sin_block_theta = sinf(block_theta) * sin_sign;
 
                 theta_base *= theta_scale;
                 block_theta *= theta_scale;
@@ -11075,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
                 rope_yarn(
                     theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                 );
+                sin_theta *= sin_sign;
 
                 // zeta scaling for xPos only:
                 float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11105,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
                         theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                         &cos_theta, &sin_theta
                     );
+                    sin_theta *= sin_sign;
 
                     theta_base *= theta_scale;
 
@@ -11130,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
@@ -11182,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
     const int32_t * pos = (const int32_t *) src1->data;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11198,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
             float block_theta = MAX(p - (n_ctx - 2), 0);
             for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                 const float cos_theta = cosf(theta_base);
-                const float sin_theta = sinf(theta_base);
+                const float sin_theta = sinf(theta_base) * sin_sign;
                 const float cos_block_theta = cosf(block_theta);
-                const float sin_block_theta = sinf(block_theta);
+                const float sin_block_theta = sinf(block_theta) * sin_sign;
 
                 theta_base *= theta_scale;
                 block_theta *= theta_scale;
@@ -11224,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
                 rope_yarn(
                     theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                 );
+                sin_theta *= sin_sign;
 
                 theta_base *= theta_scale;
 
@@ -11250,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
                         theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                         &cos_theta, &sin_theta
                     );
+                    sin_theta *= sin_sign;
 
                     theta_base *= theta_scale;
 
@@ -11279,11 +11275,11 @@ static void ggml_compute_forward_rope(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
             } break;
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
             } break;
         default:
             {
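Note: the sin_sign trick in the rope hunks above works because a 2D rotation matrix is orthogonal, so its inverse is its transpose, which equals the rotation by the negated angle; only the sign of the sine changes:

$$R(\theta) = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}, \qquad R(\theta)^{-1} = R(\theta)^{\top} = R(-\theta) = \begin{pmatrix} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{pmatrix}$$

This is why ggml_compute_forward_rope_back (next hunk) can reuse the forward kernels with forward = false.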
@@ -11294,693 +11290,73 @@ static void ggml_compute_forward_rope(
 
 // ggml_compute_forward_rope_back
 
-static void ggml_compute_forward_rope_back_f32(
+static void ggml_compute_forward_rope_back(
     const struct ggml_compute_params * params,
     const struct ggml_tensor * src0,
     const struct ggml_tensor * src1,
     struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
     }
+}
 
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    float freq_base;
-    float freq_scale;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool  xpos_down;
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
-    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
+// ggml_compute_forward_conv_transpose_1d
 
-    GGML_TENSOR_UNARY_OP_LOCALS
+static void ggml_compute_forward_conv_transpose_1d_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
 
-    assert(nb0 == sizeof(float));
+    GGML_TENSOR_BINARY_OP_LOCALS
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int nr = ggml_nrows(dst);
+    const int nk = ne00*ne01*ne02;
 
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
 
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
+    if (params->type == GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
 
-    // row index used to determine which thread to use
-    int ir = 0;
+        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
+                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        dst_data[i00*ne02 + i02] = src[i00];
+                    }
+                }
+            }
+        }
 
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = freq_scale * (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        theta_base *= theta_scale;
-
-                        const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                        float       *       dx = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = dy[0];
-                        const float dy1 = dy[1];
-
-                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
-                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            float       *       dx = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = dy[0];
-                            const float dy1 = dy[n_dims/2];
-
-                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
-                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back_f16(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        theta_base *= theta_scale;
-
-                        const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                        ggml_fp16_t       *       dx = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
-
-                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            ggml_fp16_t       *       dx = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
-
-                            dx[0]        = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_1d
-
-static void ggml_compute_forward_conv_1d_f16_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-
-    // size of the convolution row - the kernel size unrolled across all input channels
-    const int ew0 = nk*ne01;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-        for (int64_t i11 = 0; i11 < ne11; i11++) {
-            const float * const src = (float *)((char *) src1->data + i11*nb11);
-            ggml_fp16_t * dst_data = wdata;
-
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                for (int64_t ik = 0; ik < nk; ik++) {
-                    const int idx0 = i0*s0 + ik*d0 - p0;
-
-                    if(!(idx0 < 0 || idx0 >= ne10)) {
-                        dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne2;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-    for (int i2 = 0; i2 < ne2; i2++) {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
-
-            for (int i0 = 0; i0 < ne0; i0++) {
-                ggml_vec_dot_f16(ew0, dst_data + i0,
-                    (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
-                    (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_1d_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-
-    const int ew0 = nk*ne01;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        float * const wdata = (float *) params->wdata + 0;
-
-        for (int64_t i11 = 0; i11 < ne11; i11++) {
-            const float * const src = (float *)((char *) src1->data + i11*nb11);
-            float * dst_data = wdata;
-
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                for (int64_t ik = 0; ik < nk; ik++) {
-                    const int idx0 = i0*s0 + ik*d0 - p0;
-
-                    if(!(idx0 < 0 || idx0 >= ne10)) {
-                        dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne02;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * const wdata = (float *) params->wdata + 0;
-
-    for (int i2 = 0; i2 < ne2; i2++) {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
-
-            for (int i0 = 0; i0 < ne0; i0++) {
-                ggml_vec_dot_f32(ew0, dst_data + i0,
-                    (float *) ((char *) src0->data + i1*nb02),
-                    (float *) wdata + i2*nb2 + i0*ew0);
-            }
-        }
-    }
-}
-
-// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
-static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
-                             ggml_fp16_t * A,
-                             ggml_fp16_t * B,
-                             float * C,
-                             const int ith, const int nth) {
-    // does not seem to make a difference
-    int64_t m0, m1, n0, n1;
-    // patches per thread
-    if (m > n) {
-        n0 = 0;
-        n1 = n;
-
-        // total patches in dst
-        const int np = m;
-
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-
-        // patch range for this thread
-        m0 = dp*ith;
-        m1 = MIN(m0 + dp, np);
-    } else {
-        m0 = 0;
-        m1 = m;
-
-        // total patches in dst
-        const int np = n;
-
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-
-        // patch range for this thread
-        n0 = dp*ith;
-        n1 = MIN(n0 + dp, np);
-    }
-
-    // block-tiling attempt
-    int64_t blck_n = 16;
-    int64_t blck_m = 16;
-
-    // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
-    // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
-    // if (blck_size > 0) {
-    //     blck_0 = 4;
-    //     blck_1 = blck_size / blck_0;
-    //     if (blck_1 < 0) {
-    //         blck_1 = 1;
-    //     }
-    //     // blck_0 = (int64_t)sqrt(blck_size);
-    //     // blck_1 = blck_0;
-    // }
-    // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
-
-    for (int j = n0; j < n1; j+=blck_n) {
-        for (int i = m0; i < m1; i+=blck_m) {
-            // printf("i j k => %d %d %d\n", i, j, K);
-            for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
-                for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
-                    ggml_vec_dot_f16(k,
-                        C + ii*n + jj,
-                        A + ii * k,
-                        B + jj * k);
-                }
-            }
-        }
-    }
-}
-
-// src0: kernel [OC, IC, K]
-// src1: signal [N, IC, IL]
-// dst:  result [N, OL, IC*K]
-static void ggml_compute_forward_conv_1d_stage_0_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int64_t N  = ne12;
-    const int64_t IC = ne11;
-    const int64_t IL = ne10;
-
-    const int64_t K = ne00;
-
-    const int64_t OL = ne1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // im2col: [N, IC, IL] => [N, OL, IC*K]
-    {
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t iol = 0; iol < OL; iol++) {
-                for (int64_t iic = ith; iic < IC; iic+=nth) {
-
-                    // micro kernel
-                    ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
-                    const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
-
-                    for (int64_t ik = 0; ik < K; ik++) {
-                        const int64_t iil = iol*s0 + ik*d0 - p0;
-
-                        if (!(iil < 0 || iil >= IL)) {
-                            dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// src0: [OC, IC, K]
-// src1: [N, OL, IC * K]
-// result: [N, OC, OL]
-static void ggml_compute_forward_conv_1d_stage_1_f16(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb0  == sizeof(float));
-
-    const int N  = ne12;
-    const int OL = ne11;
-
-    const int OC = ne02;
-    const int IC = ne01;
-    const int K  = ne00;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    int64_t m = OC;
-    int64_t n = OL;
-    int64_t k = IC * K;
-
-    // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-    for (int i = 0; i < N; i++) {
-        ggml_fp16_t * A = (ggml_fp16_t *)src0->data;             // [m, k]
-        ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
-        float       * C = (float *)dst->data + i * m * n;        // [m, n]
-
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-
-static void ggml_compute_forward_conv_1d(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    switch(src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_conv_1d_stage_0(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    switch(src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_conv_1d_stage_1(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    switch(src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_transpose_1d
-
-static void ggml_compute_forward_conv_transpose_1d_f16_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (L x Cin) to (Cin x L)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            ggml_fp16_t * dst_data = wdata;
+        // permute source data (src1) from (L x Cin) to (Cin x L)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            ggml_fp16_t * dst_data = wdata;
 
             for (int64_t i11 = 0; i11 < ne11; i11++) {
                 const float * const src = (float *)((char *) src1->data + i11*nb11);
@@ -12146,12 +11522,10 @@ static void ggml_compute_forward_conv_transpose_1d(
     }
 }
 
-// ggml_compute_forward_conv_2d
-
 // src0: kernel [OC, IC, KH, KW]
 // src1: image  [N, IC, IH, IW]
 // dst:  result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_conv_2d_stage_0_f32(
+static void ggml_compute_forward_im2col_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12165,34 +11539,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
 
     GGML_TENSOR_BINARY_OP_LOCALS;
 
-    const int64_t N  = ne13;
-    const int64_t IC = ne12;
-    const int64_t IH = ne11;
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
     const int64_t IW = ne10;
 
-
-    // const int64_t IC = ne02;
-    const int64_t KH = ne01;
+    const int64_t KH = is_2D ? ne01 : 1;
     const int64_t KW = ne00;
 
-    const int64_t OH = ne2;
+    const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-
-
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
 
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
         return;
     }
 
@@ -12205,20 +11580,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
         ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
 
         for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
                 for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic+=nth) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
 
                         // micro kernel
                         ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
 
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {
+                        for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
                             for (int64_t ikw = 0; ikw < KW; ikw++) {
                                 const int64_t iiw = iow*s0 + ikw*d0 - p0;
                                 const int64_t iih = ioh*s1 + ikh*d1 - p1;
 
-                                if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
                                     dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
                                 }
                             }
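What the new im2col path computes, in isolation: for every output pixel it gathers the KH x KW input window across channels into one contiguous row, writing zeros for taps that fall outside the image, so the convolution itself reduces to a single GEMM over those rows. A minimal single-channel sketch in plain C (float instead of ggml_fp16_t; the names here are illustrative, not part of the ggml API):

#include <stdio.h>

// Single-channel im2col sketch: gather a KH x KW window per output pixel,
// zero-filling out-of-bounds taps -- the same iiw/iih arithmetic as above.
static void im2col_1ch(const float *src, int IH, int IW,
                       float *dst, int OH, int OW, int KH, int KW,
                       int s0, int s1, int p0, int p1, int d0, int d1) {
    for (int ioh = 0; ioh < OH; ioh++) {
        for (int iow = 0; iow < OW; iow++) {
            float *row = dst + (ioh*OW + iow)*KH*KW; // one patch row per output pixel
            for (int ikh = 0; ikh < KH; ikh++) {
                for (int ikw = 0; ikw < KW; ikw++) {
                    const int iiw = iow*s0 + ikw*d0 - p0;
                    const int iih = ioh*s1 + ikh*d1 - p1;
                    row[ikh*KW + ikw] =
                        (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)
                            ? 0.0f : src[iih*IW + iiw];
                }
            }
        }
    }
}

int main(void) {
    const float src[3*3] = {1,2,3, 4,5,6, 7,8,9};
    float dst[3*3*3*3]; // 3x3 output, 3x3 patches
    im2col_1ch(src, 3, 3, dst, 3, 3, 3, 3, /*s*/1, 1, /*p*/1, 1, /*d*/1, 1);
    printf("patch (0,0) starts with %.0f %.0f %.0f\n", dst[0], dst[1], dst[2]); // 0 0 0: padding row
    return 0;
}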
@@ -12230,223 +11607,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
     }
 }
 
-
-// src0: [OC, IC, KH, KW]
-// src1: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static void ggml_compute_forward_conv_2d_stage_1_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb0  == sizeof(float));
-
-    const int N = ne13;
-    const int OH = ne12;
-    const int OW = ne11;
-
-    const int OC = ne03;
-    const int IC = ne02;
-    const int KH = ne01;
-    const int KW = ne00;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    int64_t m = OC;
-    int64_t n = OH * OW;
-    int64_t k = IC * KH * KW;
-
-    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-    for (int i = 0; i < N; i++) {
-        ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
-        ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m, n]
-
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-
-static void ggml_compute_forward_conv_2d_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    // src1: image [N, IC, IH, IW]
-    // src0: kernel [OC, IC, KH, KW]
-    // dst: result [N, OC, OH, OW]
-    // ne12: IC
-    // ne0: OW
-    // ne1: OH
-    // nk0: KW
-    // nk1: KH
-    // ne13: N
-
-    const int N = ne13;
-    const int IC = ne12;
-    const int IH = ne11;
-    const int IW = ne10;
-
-    const int OC = ne03;
-    // const int IC = ne02;
-    const int KH = ne01;
-    const int KW = ne00;
-
-    const int OH = ne1;
-    const int OW = ne0;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // const int nk0 = ne00;
-    // const int nk1 = ne01;
-
-    // size of the convolution row - the kernel size unrolled across all channels
-    // const int ew0 = nk0*nk1*ne02;
-    // ew0: IC*KH*KW
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare source data (src1)
-        // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
-
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int in = 0; in < N; in++) {
-                for (int iic = 0; iic < IC; iic++) {
-                    for (int ioh = 0; ioh < OH; ioh++) {
-                        for (int iow = 0; iow < OW; iow++) {
-
-                            // micro kernel
-                            ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                            const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
-
-                            for (int ikh = 0; ikh < KH; ikh++) {
-                                for (int ikw = 0; ikw < KW; ikw++) {
-                                    const int iiw = iow*s0 + ikw*d0 - p0;
-                                    const int iih = ioh*s1 + ikh*d1 - p1;
-
-                                    if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
-                                        dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-    // wdata: [N*OH*OW, IC*KH*KW]
-    // dst: result [N, OC, OH, OW]
-    // src0: kernel [OC, IC, KH, KW]
-
-    int64_t m = OC;
-    int64_t n = OH * OW;
-    int64_t k = IC * KH * KW;
-
-    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-    for (int i = 0; i < N; i++) {
-        ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
-        ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m * k]
-
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-
-static void ggml_compute_forward_conv_2d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
-                GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_conv_2d_stage_0(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_conv_2d_stage_1(
+static void ggml_compute_forward_im2col(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12454,7 +11615,7 @@ static void ggml_compute_forward_conv_2d_stage_1(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
+                ggml_compute_forward_im2col_f16(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
@@ -12639,14 +11800,11 @@ static void ggml_compute_forward_pool_1d(
     ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
 }
 
-// ggml_compute_forward_pool_2d_sk_p0
+// ggml_compute_forward_pool_2d
 
-static void ggml_compute_forward_pool_2d_sk_p0(
+static void ggml_compute_forward_pool_2d(
         const struct ggml_compute_params * params,
-        const enum ggml_op_pool op,
         const struct ggml_tensor * src,
-        const int k0,
-        const int k1,
         struct ggml_tensor * dst) {
     assert(src->type == GGML_TYPE_F32);
     assert(params->ith == 0);
@@ -12655,6 +11813,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
         return;
     }
 
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
     const char * cdata = (const char*)src->data;
     const char * const data_end = cdata + ggml_nbytes(src);
 
@@ -12665,6 +11831,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
     float * dplane = (float *)dst->data;
 
     const int ka = k0 * k1;
+    const int offset0 = -p0;
+    const int offset1 = -p1;
 
     while (cdata < data_end) {
         for (int oy = 0; oy < py; ++oy) {
@@ -12677,13 +11845,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
                     case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
                 }
 
-                const int ix = ox * k0;
-                const int iy = oy * k1;
+                const int ix = offset0 + ox * s0;
+                const int iy = offset1 + oy * s1;
 
                 for (int ky = 0; ky < k1; ++ky) {
+                    if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
                     const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
                     for (int kx = 0; kx < k0; ++kx) {
                         int j = ix + kx;
+                        if (j < 0 || j >= src->ne[0]) continue;
                         switch (op) {
                             case GGML_OP_POOL_AVG: *out += srow[j]; break;
                             case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12700,31 +11870,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
         }
 
         cdata += src->nb[2];
-        dplane += pa;
-    }
-}
-
-// ggml_compute_forward_pool_2d
-
-static void ggml_compute_forward_pool_2d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-    GGML_ASSERT(p0 == 0);
-    GGML_ASSERT(p1 == 0); // padding not supported
-    GGML_ASSERT(k0 == s0);
-    GGML_ASSERT(k1 == s1); // only s = k supported
-
-    ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
+        dplane += pa;
+    }
 }
 
 // ggml_compute_forward_upscale
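The pooling rewrite above folds the old sk_p0 special case into one general kernel: the window origin is shifted by -p0/-p1, and out-of-range taps are skipped instead of being ruled out by asserts. A minimal 1-D sketch of the same index-and-guard pattern (illustrative names, average pooling only; like the 2-D kernel above, the divisor stays the kernel size even for clipped border windows):

#include <stdio.h>

// Average-pool one row with kernel k0, stride s0, padding p0, skipping
// out-of-range taps exactly like the ix/iy guards in the diff above.
static void avg_pool_1d(const float *src, int n, float *dst, int on,
                        int k0, int s0, int p0) {
    const int offset0 = -p0;
    for (int ox = 0; ox < on; ox++) {
        const int ix = offset0 + ox*s0;
        float acc = 0.0f;
        for (int kx = 0; kx < k0; kx++) {
            const int j = ix + kx;
            if (j < 0 || j >= n) continue; // padded tap contributes zero
            acc += src[j];
        }
        dst[ox] = acc / k0; // divisor is always k0 (count-include-pad behaviour)
    }
}

int main(void) {
    const float src[4] = {1, 2, 3, 4};
    float dst[3];
    avg_pool_1d(src, 4, dst, 3, /*k0*/2, /*s0*/2, /*p0*/1);
    printf("%.1f %.1f %.1f\n", dst[0], dst[1], dst[2]); // 0.5 2.5 2.0
    return 0;
}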
@@ -13928,6 +13075,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_silu(params, src0, dst);
             } break;
+        case GGML_UNARY_OP_LEAKY:
+            {
+                ggml_compute_forward_leaky(params, src0, dst);
+            } break;
         default:
             {
                 GGML_ASSERT(false);
@@ -14681,33 +13832,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_clamp(params, tensor->src[0], tensor);
             } break;
-        case GGML_OP_CONV_1D:
-            {
-                ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_1D_STAGE_0:
-            {
-                ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_1D_STAGE_1:
-            {
-                ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
            {
                 ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
-        case GGML_OP_CONV_2D:
-            {
-                ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_2D_STAGE_0:
+        case GGML_OP_IM2COL:
             {
-                ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_2D_STAGE_1:
-            {
-                ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
@@ -14836,62 +13967,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 ////////////////////////////////////////////////////////////////////////////////
 
-
+static size_t ggml_hash_size(size_t min_sz) {
+    // next primes after powers of two
+    static const size_t primes[] = {
+        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
+        2053, 4099, 8209, 16411, 32771, 65537, 131101,
+        262147, 524309, 1048583, 2097169, 4194319, 8388617,
+        16777259, 33554467, 67108879, 134217757, 268435459,
+        536870923, 1073741827, 2147483659
+    };
+    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
+
+    // find the smallest prime that is larger or equal to min_sz
+    size_t l = 0;
+    size_t r = n_primes;
+    while (l < r) {
+        size_t m = (l + r)/2;
+        if (primes[m] < min_sz) {
+            l = m + 1;
+        } else {
+            r = m;
+        }
+    }
+    size_t sz = l < n_primes ? primes[l] : min_sz | 1;
+    return sz;
+}
 
-static size_t hash(void * p) {
-    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+static size_t ggml_hash(const void * p) {
+    return (size_t)p;
 }
 
-static size_t hash_find(void * hash_table[], void * p) {
-    size_t h = hash(p);
+size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set.size;
 
     // linear probing
     size_t i = h;
-    while (hash_table[i] != NULL && hash_table[i] != p) {
-        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+    while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
+        i = (i + 1) % hash_set.size;
         if (i == h) {
             // visited all hash table entries -> not found
-            return GGML_GRAPH_HASHTABLE_SIZE;
+            return GGML_HASHTABLE_FULL;
         }
     }
     return i;
 }
 
-static bool hash_insert(void * hash_table[], void * p) {
-    size_t i = hash_find(hash_table, p);
+bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
+    return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
+}
+
+size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
 
-    GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
+    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
 
-    if (hash_table[i] == p) {
-        return true;
+    if (hash_set.keys[i] == key) {
+        return GGML_HASHTABLE_ALREADY_EXISTS;
     }
 
     // insert
-    GGML_ASSERT(hash_table[i] == NULL);
-    hash_table[i] = p;
-    return false;
+    GGML_ASSERT(hash_set.keys[i] == NULL);
+    hash_set.keys[i] = key;
+    return i;
+}
+
+size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
+
+    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
+
+    hash_set.keys[i] = key;
+    return i;
+}
+
+static struct ggml_hash_set ggml_hash_set_new(size_t size) {
+    size = ggml_hash_size(size);
+    struct ggml_hash_set result;
+    result.size = size;
+    result.keys = malloc(sizeof(struct ggml_tensor *) * size);
+    memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
+    return result;
 }
 
-static bool hash_contains(void * hash_table[], void * p) {
-    size_t i = hash_find(hash_table, p);
-    return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
+static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
+    free(hash_set.keys);
 }
 
 struct hash_map {
-    void * keys[GGML_GRAPH_HASHTABLE_SIZE];
-    void * vals[GGML_GRAPH_HASHTABLE_SIZE];
+    struct ggml_hash_set set;
+    struct ggml_tensor ** vals;
 };
 
-static struct hash_map * new_hash_map(void) {
+static struct hash_map * ggml_new_hash_map(size_t size) {
     struct hash_map * result = malloc(sizeof(struct hash_map));
-    for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
-        result->keys[i] = NULL;
-        result->vals[i] = NULL;
-    }
+    result->set = ggml_hash_set_new(size);
+    result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
+    memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
     return result;
 }
 
-static void free_hash_map(struct hash_map * map) {
+static void ggml_hash_map_free(struct hash_map * map) {
+    ggml_hash_set_free(map->set);
+    free(map->vals);
     free(map);
 }
 
@@ -14911,7 +14089,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return node;
     }
 
-    if (!hash_contains(graph->visited_hash_table, node)) {
+    if (!ggml_hash_contains(graph->visited_hash_table, node)) {
         return node;
     }
 
@@ -14926,17 +14104,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return node;
     }
 
-    size_t i = hash_find(replacements->keys, node);
-    GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
-    if (replacements->keys[i] == node) {
-        return (struct ggml_tensor *) replacements->vals[i];
+    size_t i = ggml_hash_find(replacements->set, node);
+    GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
+    if (replacements->set.keys[i] == node) {
+        return replacements->vals[i];
     }
 
     struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
 
     // insert clone into replacements
-    GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
-    replacements->keys[i] = node;
+    GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
+    replacements->set.keys[i] = node;
     replacements->vals[i] = clone;
 
     clone->op = node->op;
@@ -14973,26 +14151,26 @@ void ggml_build_backward_gradient_checkpointing(
         struct ggml_cgraph * gb_tmp,
         struct ggml_tensor * * checkpoints,
         int n_checkpoints) {
-    *gb_tmp = *gf;
+    ggml_graph_cpy(gf, gb_tmp);
     ggml_build_backward_expand(ctx, gf, gb_tmp, true);
 
     if (n_checkpoints <= 0) {
-        *gb = *gb_tmp;
+        ggml_graph_cpy(gb_tmp, gb);
         return;
     }
 
-    struct hash_map * replacements = new_hash_map();
+    struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
 
     // insert checkpoints in replacements
     for (int i = 0; i < n_checkpoints; ++i) {
-        size_t k = hash_find(replacements->keys, checkpoints[i]);
-        GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
-        GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
-        replacements->keys[k] = checkpoints[i];
-        replacements->vals[k] = checkpoints[i];
+        size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
+        GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
+        GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
+        replacements->set.keys[k] = checkpoints[i];
+        replacements->vals[k] = checkpoints[i];
     }
 
-    *gb = *gf;
+    ggml_graph_cpy(gf, gb);
     // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
     // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
     // by recomputing them from checkpoints
@@ -15009,21 +14187,21 @@ void ggml_build_backward_gradient_checkpointing(
         ggml_build_forward_expand(gb, node);
     }
 
-    free_hash_map(replacements);
+    ggml_hash_map_free(replacements);
 }
 
 // functions to change gradients considering the case that input a might be initial gradient with zero value
 
-static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         return b;
     } else {
         return ggml_add_impl(ctx, a, b, false);
     }
 }
 
-static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
     } else {
@@ -15031,23 +14209,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
     }
 }
 
-static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         return ggml_repeat(ctx, b, a);
     } else {
         return ggml_add1_impl(ctx, a, b, false);
     }
 }
 
-static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
-    if (hash_contains(zero_table, a)) {
+static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+    if (ggml_hash_contains(zero_table, a)) {
         return ggml_neg(ctx, b);
     } else {
         return ggml_sub_impl(ctx, a, b, false);
     }
 }
 
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
+static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
     struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
 
@@ -15559,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // necessary for llama
                 if (src0->grad) {
                     //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
@@ -15579,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 n_dims,
                                 mode,
                                 n_ctx,
+                                n_orig_ctx,
                                 freq_base,
                                 freq_scale,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                 xpos_base,
                                 xpos_down),
                             zero_table);
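Both ROPE backward paths now recover the full set of YaRN parameters from the packed op_params words. op_params is an int32_t array, so float values are bit-copied in and out with memcpy, which keeps the code strict-aliasing safe. A tiny sketch of that pack/unpack convention (standalone buffer, not the ggml API):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void) {
    int32_t op_params[16] = {0};

    // pack: ints are stored directly, floats are bit-copied into a word
    op_params[1] = 128;             // e.g. n_dims
    const float freq_base = 10000.0f;
    memcpy(op_params + 5, &freq_base, sizeof(float));

    // unpack: the reverse memcpy, as in the backward pass above
    const int n_dims = op_params[1];
    float fb;
    memcpy(&fb, op_params + 5, sizeof(float));

    printf("n_dims = %d, freq_base = %.1f\n", n_dims, fb);
    return 0;
}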
@@ -15590,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 if (src0->grad) {
                     //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
@@ -15609,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 src1,
                                 n_dims,
                                 mode,
-                                0,
                                 n_ctx,
+                                n_orig_ctx,
                                 freq_base,
                                 freq_scale,
-                                0.0f,
-                                1.0f,
-                                0.0f,
-                                0.0f,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                 xpos_base,
                                 xpos_down,
                                 false),
@@ -15631,31 +14820,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_CONV_1D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_1D_STAGE_0:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_1D_STAGE_1:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_CONV_2D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_2D_STAGE_0:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_2D_STAGE_1:
+        case GGML_OP_IM2COL:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
@@ -15869,7 +15038,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
     }
 
     // check if already visited
-    if (hash_insert(cgraph->visited_hash_table, node)) {
+    if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
         return;
     }
 
@@ -15885,7 +15054,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
 
     if (node->op == GGML_OP_NONE && node->grad == NULL) {
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
-        GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
+        GGML_ASSERT(cgraph->n_leafs < cgraph->size);
 
         if (strlen(node->name) == 0) {
             ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15894,22 +15063,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
-        GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
+        GGML_ASSERT(cgraph->n_nodes < cgraph->size);
 
         if (strlen(node->name) == 0) {
             ggml_format_name(node, "node_%d", cgraph->n_nodes);
         }
 
         cgraph->nodes[cgraph->n_nodes] = node;
-        cgraph->grads[cgraph->n_nodes] = node->grad;
+        if (cgraph->grads) {
+            cgraph->grads[cgraph->n_nodes] = node->grad;
+        }
         cgraph->n_nodes++;
     }
 }
 
 static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
     if (!expand) {
-        cgraph->n_nodes = 0;
-        cgraph->n_leafs = 0;
+        // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
+        ggml_graph_clear(cgraph);
     }
 
     const int n0 = cgraph->n_nodes;
@@ -15930,25 +15101,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
     ggml_build_forward_impl(cgraph, tensor, true);
 }
 
-struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
-    struct ggml_cgraph result = {
-        /*.n_nodes      =*/ 0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ { NULL },
-        /*.grads        =*/ { NULL },
-        /*.leafs        =*/ { NULL },
-        /*.hash_table   =*/ { NULL },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-    };
-
-    ggml_build_forward_impl(&result, tensor, false);
-
-    return result;
-}
-
 void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
     GGML_ASSERT(gf->n_nodes > 0);
 
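With ggml_build_forward removed, callers allocate the graph object explicitly and expand tensors into it. A hedged migration sketch (assumes an initialized ggml_context ctx and a result tensor f; illustrative only):

// before (0.9.1-era API):
//   struct ggml_cgraph gf = ggml_build_forward(f);
//
// after (this release):
//   struct ggml_cgraph * gf = ggml_new_graph(ctx); // GGML_DEFAULT_GRAPH_SIZE nodes, no grads
//   ggml_build_forward_expand(gf, f);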
@@ -15965,11 +15117,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
     }
 
     // remember original gradients which start with zero values
-    void * zero_table[GGML_GRAPH_HASHTABLE_SIZE];
-    memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
+    struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
     for (int i = 0; i < gf->n_nodes; i++) {
         if (gf->grads[i]) {
-            hash_insert(zero_table, gf->grads[i]);
+            ggml_hash_insert(zero_table, gf->grads[i]);
         }
     }
 
@@ -15992,26 +15143,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
         }
     }
 
-
+    ggml_hash_set_free(zero_table);
 }
 
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
-    struct ggml_cgraph result = *gf;
-    ggml_build_backward_expand(ctx, gf, &result, keep);
-    return result;
+static size_t ggml_graph_nbytes(size_t size, bool grads) {
+    size_t nbytes = sizeof(struct ggml_cgraph);
+    nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
+    if (grads) {
+        nbytes += size * sizeof(struct ggml_tensor *); // grads
+    }
+    nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
+    return nbytes;
 }
 
-struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
+size_t ggml_graph_overhead_custom(size_t size, bool grads) {
+    return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
+}
+
+size_t ggml_graph_overhead(void) {
+    return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
+}
+
+struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
+    const size_t obj_size = ggml_graph_nbytes(size, grads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
     struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
 
+    struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
+
+    size_t hash_size = ggml_hash_size(size * 2);
+    struct ggml_tensor ** nodes_ptr = data_start;
+    struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
+    struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
+    struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
+
+    // check that we allocated the correct amount of memory
+    assert(obj_size == (size_t) (
+        (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
+
+    memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
+
     *cgraph = (struct ggml_cgraph) {
+        /*.size         =*/ size,
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ { NULL },
-        /*.grads        =*/ { NULL },
-        /*.leafs        =*/ { NULL },
-        /*.hash_table   =*/ { NULL },
+        /*.nodes        =*/ nodes_ptr,
+        /*.grads        =*/ grads_ptr,
+        /*.leafs        =*/ leafs_ptr,
+        /*.hash_table   =*/ { hash_size, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
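ggml_new_graph_custom above carves a single allocation into the cgraph header followed by the nodes, leafs, hash-key and optional grads arrays; ggml_graph_nbytes is exactly that sum. A back-of-the-envelope recomputation (illustrative only: assumes 8-byte pointers, a placeholder header size, and ignores the object header and alignment padding):

#include <stdio.h>
#include <stddef.h>

// Illustrative re-computation of the layout sized by ggml_graph_nbytes:
// header + nodes + leafs (+ grads) + hash keys, with the hash table
// over-provisioned to >= 2*size slots (rounded up to a prime in ggml).
int main(void) {
    const size_t size      = 2048;            // assumed default graph size in this release
    const size_t ptr       = sizeof(void *);  // 8 on typical 64-bit targets
    const size_t hash_size = 4099;            // first prime >= 2*2048 in the table above
    const size_t header    = 80;              // placeholder for sizeof(struct ggml_cgraph)

    size_t nbytes = header;
    nbytes += size * ptr * 2;    // nodes + leafs
    nbytes += hash_size * ptr;   // visited hash set keys
    printf("no grads:   ~%zu bytes\n", nbytes);
    printf("with grads: ~%zu bytes\n", nbytes + size * ptr);
    return 0;
}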
@@ -16021,14 +15200,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return cgraph;
 }
 
-struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
-    struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
-    ggml_build_forward_impl(cgraph, tensor, false);
+struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
+    return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
+}
+
+struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
+    const size_t obj_size = sizeof(struct ggml_cgraph);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
+    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
+
+    *cgraph = (struct ggml_cgraph) {
+        /*.size         =*/ 0,
+        /*.n_nodes      =*/ i1 - i0,
+        /*.n_leafs      =*/ 0,
+        /*.nodes        =*/ cgraph0->nodes + i0,
+        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
+        /*.leafs        =*/ NULL,
+        /*.hash_table   =*/ { 0, NULL },
+        /*.order        =*/ cgraph0->order,
+        /*.perf_runs    =*/ 0,
+        /*.perf_cycles  =*/ 0,
+        /*.perf_time_us =*/ 0,
+    };
+
     return cgraph;
 }
 
-size_t ggml_graph_overhead(void) {
-    return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
+void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
+    GGML_ASSERT(dst->size >= src->n_leafs);
+    GGML_ASSERT(dst->size >= src->n_nodes);
+    GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
+
+    dst->n_leafs = src->n_leafs;
+    dst->n_nodes = src->n_nodes;
+    dst->order   = src->order;
+
+    for (int i = 0; i < src->n_leafs; ++i) {
+        dst->leafs[i] = src->leafs[i];
+    }
+
+    for (int i = 0; i < src->n_nodes; ++i) {
+        dst->nodes[i] = src->nodes[i];
+    }
+
+    if (src->grads) {
+        GGML_ASSERT(dst->grads != NULL);
+        for (int i = 0; i < src->n_nodes; ++i) {
+            dst->grads[i] = src->grads[i];
+        }
+    }
+
+    for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
+        if (src->visited_hash_table.keys[i]) {
+            ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
+        }
+    }
+}
+
+struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
+    ggml_graph_cpy(cgraph, result);
+    return result;
+}
+
+void ggml_graph_reset(struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(cgraph->grads != NULL);
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * grad = cgraph->grads[i];
+
+        if (grad) {
+            ggml_set_zero(grad);
+        }
+    }
+}
+
+void ggml_graph_clear(struct ggml_cgraph * cgraph) {
+    cgraph->n_leafs = 0;
+    cgraph->n_nodes = 0;
+    memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
 }
 
 //
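ggml_graph_view above returns a non-owning window: it aliases cgraph0's node (and grad) arrays and carries no leafs and an empty hash table, so a view can be computed but not extended. A hedged usage sketch (assumes a built graph gf and context ctx; illustrative only):

// split a built graph in half, e.g. to schedule the parts separately:
//   struct ggml_cgraph * g0 = ggml_graph_view(ctx, gf, 0, gf->n_nodes/2);
//   struct ggml_cgraph * g1 = ggml_graph_view(ctx, gf, gf->n_nodes/2, gf->n_nodes);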
@@ -16140,45 +15390,266 @@ static void clear_numa_thread_affinity(void) {
             strerror(rv));
     }
 
-    CPU_FREE(cpus);
-}
-#else
-// TODO: Windows etc.
-// (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
-static void clear_numa_thread_affinity(void) {}
-#endif
-
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
-
-    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-};
-
-static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
-    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
-    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
+    CPU_FREE(cpus);
+}
+#else
+// TODO: Windows etc.
+// (the linux implementation may also work on BSD, someone should test)
+static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+static void clear_numa_thread_affinity(void) {}
+#endif
+
+struct ggml_compute_state_shared {
+    const struct ggml_cgraph * cgraph;
+    const struct ggml_cplan * cplan;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;
+
+    const int n_threads;
+
+    // synchronization primitives
+    atomic_int n_active; // num active threads
+    atomic_int node_n;   // active graph node
+
+    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+    void * abort_callback_data;
+};
+
+struct ggml_compute_state {
+    ggml_thread_t thrd;
+    int ith;
+    struct ggml_compute_state_shared * shared;
+};
+
+static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
+    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
+    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
+
+    node->perf_runs++;
+    node->perf_cycles  += cycles_cur;
+    node->perf_time_us += time_us_cur;
+}
+
+static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+    int n_tasks = 0;
+
+    switch (node->op) {
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_ACC:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_SUB:
+        case GGML_OP_DIV:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_SUM:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
+        case GGML_OP_ARGMAX:
+        case GGML_OP_REPEAT:
+        case GGML_OP_REPEAT_BACK:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(node)) {
+                case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_SGN:
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_ELU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_LEAKY:
+                    {
+                        n_tasks = 1;
+                    } break;
+
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_SILU:
+                    {
+                        n_tasks = n_threads;
+                    } break;
+            }
+            break;
+        case GGML_OP_SILU_BACK:
+        case GGML_OP_MUL:
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_RMS_NORM_BACK:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_CONCAT:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                n_tasks = n_threads;
+
+                // TODO: use different scheduling for different matrix sizes
+                //const int nr0 = ggml_nrows(node->src[0]);
+                //const int nr1 = ggml_nrows(node->src[1]);
+
+                //n_tasks = MIN(n_threads, MAX(1, nr0/128));
+                //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
+
+#if defined(GGML_USE_CUBLAS)
+                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 // the threads are still spinning
+                }
+#elif defined(GGML_USE_CLBLAST)
+                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 // the threads are still spinning
+                }
+#endif
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 // the threads are still spinning
+                }
+#endif
+            } break;
+        case GGML_OP_OUT_PROD:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_SCALE:
+        case GGML_OP_SET:
+        case GGML_OP_CONT:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_GET_ROWS_BACK:
+        case GGML_OP_DIAG:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_SOFT_MAX_BACK:
+        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
+        case GGML_OP_ADD_REL_POS:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_ALIBI:
+            {
+                n_tasks = 1; //TODO
+            } break;
+        case GGML_OP_CLAMP:
+            {
+                n_tasks = 1; //TODO
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_IM2COL:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_POOL_1D:
+        case GGML_OP_POOL_2D:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_UPSCALE:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_FLASH_ATTN:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_FLASH_FF:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_FLASH_ATTN_BACK:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
+        case GGML_OP_GET_REL_POS:
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                if (p->n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p->n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                if (p->n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p->n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                if (p->n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p->n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_NONE:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+        default:
+            {
+                printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
+                GGML_ASSERT(false);
+            } break;
+    }
+
+    assert(n_tasks > 0);
 
-    node->perf_runs++;
-    node->perf_cycles  += cycles_cur;
-    node->perf_time_us += time_us_cur;
+    return n_tasks;
 }
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -16187,7 +15658,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
     const struct ggml_cplan  * cplan  = state->shared->cplan;
 
-    const int * n_tasks_arr = cplan->n_tasks;
     const int   n_threads   = state->shared->n_threads;
 
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16212,9 +15682,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (node_n != -1) {
             /* FINALIZE */
-            struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
+            struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
-                params.nth = n_tasks_arr[node_n];
+                params.nth = ggml_get_n_tasks(node, n_threads);
                 ggml_compute_forward(&params, node);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16225,7 +15695,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
 
             struct ggml_tensor * node = cgraph->nodes[node_n];
-            const int n_tasks = n_tasks_arr[node_n];
+            const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
             state->shared->perf_node_start_cycles  = ggml_perf_cycles();
             state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -16283,7 +15753,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         /* COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
-        const int n_tasks = n_tasks_arr[node_n];
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
             /*.type  =*/ GGML_TASK_COMPUTE,
@@ -16317,121 +15787,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        size_t cur = 0;
+
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = 0;
                     if (ggml_is_quantized(node->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
-
-                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = 0;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
-
-                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_ACC:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = 0;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_SUB:
-            case GGML_OP_DIV:
-            case GGML_OP_SQR:
-            case GGML_OP_SQRT:
-            case GGML_OP_LOG:
-            case GGML_OP_SUM:
-            case GGML_OP_SUM_ROWS:
-            case GGML_OP_MEAN:
-            case GGML_OP_ARGMAX:
-            case GGML_OP_REPEAT:
-            case GGML_OP_REPEAT_BACK:
-                {
-                    n_tasks = 1;
-                } break;
-
-            case GGML_OP_UNARY:
-                {
-                    switch (ggml_get_unary_op(node)) {
-                        case GGML_UNARY_OP_ABS:
-                        case GGML_UNARY_OP_SGN:
-                        case GGML_UNARY_OP_NEG:
-                        case GGML_UNARY_OP_STEP:
-                        case GGML_UNARY_OP_TANH:
-                        case GGML_UNARY_OP_ELU:
-                        case GGML_UNARY_OP_RELU:
-                            {
-                                n_tasks = 1;
-                            } break;
-
-                        case GGML_UNARY_OP_GELU:
-                        case GGML_UNARY_OP_GELU_QUICK:
-                        case GGML_UNARY_OP_SILU:
-                            {
-                                n_tasks = n_threads;
-                            } break;
-                    }
                 } break;
-            case GGML_OP_SILU_BACK:
-            case GGML_OP_MUL:
-            case GGML_OP_NORM:
-            case GGML_OP_RMS_NORM:
-            case GGML_OP_RMS_NORM_BACK:
-            case GGML_OP_GROUP_NORM:
-                {
-                    n_tasks = n_threads;
-                } break;
-            case GGML_OP_CONCAT:
             case GGML_OP_MUL_MAT:
                 {
-                    n_tasks = n_threads;
-
-                    // TODO: use different scheduling for different matrix sizes
-                    //const int nr0 = ggml_nrows(node->src[0]);
-                    //const int nr1 = ggml_nrows(node->src[1]);
-
-                    //n_tasks = MIN(n_threads, MAX(1, nr0/128));
-                    //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-                    size_t cur = 0;
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
-#if defined(GGML_USE_CUBLAS)
-                    if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                        n_tasks = 1; // TODO: this actually is doing nothing
-                                     // the threads are still spinning
-                    } else
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                     if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                        n_tasks = 1; // TODO: this actually is doing nothing
-                                     // the threads are still spinning
                         cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
                     } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                        n_tasks = 1; // TODO: this actually is doing nothing
-                                     // the threads are still spinning
                         if (node->src[0]->type != GGML_TYPE_F32) {
                             // here we need memory just for single 2D matrix from src0
                             cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -16440,108 +15835,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 #endif
                     if (node->src[1]->type != vec_dot_type) {
                         cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
-                    } else {
-                        cur = 0;
                     }
-
-                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_OUT_PROD:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = 0;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_SCALE:
-                {
-                    n_tasks = 1;
-                } break;
-            case GGML_OP_SET:
-            case GGML_OP_CONT:
-            case GGML_OP_RESHAPE:
-            case GGML_OP_VIEW:
-            case GGML_OP_PERMUTE:
-            case GGML_OP_TRANSPOSE:
-            case GGML_OP_GET_ROWS:
-            case GGML_OP_GET_ROWS_BACK:
-            case GGML_OP_DIAG:
-                {
-                    n_tasks = 1;
-                } break;
-            case GGML_OP_DIAG_MASK_ZERO:
-            case GGML_OP_DIAG_MASK_INF:
-            case GGML_OP_SOFT_MAX:
-            case GGML_OP_SOFT_MAX_BACK:
-            case GGML_OP_ROPE:
-            case GGML_OP_ROPE_BACK:
-            case GGML_OP_ADD_REL_POS:
-                {
-                    n_tasks = n_threads;
-                } break;
-            case GGML_OP_ALIBI:
-                {
-                    n_tasks = 1; //TODO
-                } break;
-            case GGML_OP_CLAMP:
-                {
-                    n_tasks = 1; //TODO
-                } break;
-            case GGML_OP_CONV_1D:
-                {
-                    n_tasks = n_threads;
-
-                    GGML_ASSERT(node->src[0]->ne[3] == 1);
-                    GGML_ASSERT(node->src[1]->ne[2] == 1);
-                    GGML_ASSERT(node->src[1]->ne[3] == 1);
-
-                    const int64_t ne00 = node->src[0]->ne[0];
-                    const int64_t ne01 = node->src[0]->ne[1];
-                    const int64_t ne02 = node->src[0]->ne[2];
-
-                    const int64_t ne10 = node->src[1]->ne[0];
-                    const int64_t ne11 = node->src[1]->ne[1];
-
-                    const int64_t ne0 = node->ne[0];
-                    const int64_t ne1 = node->ne[1];
-                    const int64_t nk  = ne00;
-                    const int64_t ew0 = nk * ne01;
-
-                    UNUSED(ne02);
-                    UNUSED(ne10);
-                    UNUSED(ne11);
-
-                    size_t cur = 0;
-
-                    if (node->src[0]->type == GGML_TYPE_F16 &&
-                        node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
-                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                               node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(float)*(ne0*ne1*ew0);
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_CONV_1D_STAGE_0:
-                {
-                    n_tasks = n_threads;
-                } break;
-            case GGML_OP_CONV_1D_STAGE_1:
-                {
-                    n_tasks = n_threads;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
                 {
-                    n_tasks = n_threads;
-
                     GGML_ASSERT(node->src[0]->ne[3] == 1);
                     GGML_ASSERT(node->src[1]->ne[2] == 1);
                     GGML_ASSERT(node->src[1]->ne[3] == 1);
@@ -16553,7 +15858,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     const int64_t ne10 = node->src[1]->ne[0]; // L
                     const int64_t ne11 = node->src[1]->ne[1]; // Cin
 
-                    size_t cur = 0;
                     if (node->src[0]->type == GGML_TYPE_F16 &&
                         node->src[1]->type == GGML_TYPE_F32) {
                         cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
@@ -16565,59 +15869,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     } else {
                         GGML_ASSERT(false);
                     }
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_CONV_2D:
-                {
-                    n_tasks = n_threads;
-
-                    const int64_t ne00 = node->src[0]->ne[0]; // W
-                    const int64_t ne01 = node->src[0]->ne[1]; // H
-                    const int64_t ne02 = node->src[0]->ne[2]; // C
-                    const int64_t ne03 = node->src[0]->ne[3]; // N
-
-                    const int64_t ne10 = node->src[1]->ne[0]; // W
-                    const int64_t ne11 = node->src[1]->ne[1]; // H
-                    const int64_t ne12 = node->src[1]->ne[2]; // C
-
-                    const int64_t ne0 = node->ne[0];
-                    const int64_t ne1 = node->ne[1];
-                    const int64_t ne2 = node->ne[2];
-                    const int64_t ne3 = node->ne[3];
-                    const int64_t nk = ne00*ne01;
-                    const int64_t ew0 = nk * ne02;
-
-                    UNUSED(ne03);
-                    UNUSED(ne2);
-
-                    size_t cur = 0;
-
-                    if (node->src[0]->type == GGML_TYPE_F16 &&
-                        node->src[1]->type == GGML_TYPE_F32) {
-                        // im2col: [N*OH*OW, IC*KH*KW]
-                        cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
-                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                               node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(float)* (ne10*ne11*ne12);
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_CONV_2D_STAGE_0:
-                {
-                    n_tasks = n_threads;
                 } break;
-            case GGML_OP_CONV_2D_STAGE_1:
+            case GGML_OP_IM2COL:
                 {
                     n_tasks = n_threads;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t ne00 = node->src[0]->ne[0]; // W
                     const int64_t ne01 = node->src[0]->ne[1]; // H
                     const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
@@ -16627,141 +15885,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     const int64_t ne11 = node->src[1]->ne[1]; // H
                     const int64_t ne12 = node->src[1]->ne[2]; // Channels In
 
-                    size_t cur = 0;
                     cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                     cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_POOL_1D:
-            case GGML_OP_POOL_2D:
-                {
-                    n_tasks = 1;
-                } break;
-            case GGML_OP_UPSCALE:
-                {
-                    n_tasks = n_threads;
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = 0;
-
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    }
-
-                    if (node->src[1]->type == GGML_TYPE_F16) {
+                    } else if (node->src[1]->type == GGML_TYPE_F16) {
                         cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                     }
-
-                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_FLASH_FF:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = 0;
-
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    }
-
-                    if (node->src[1]->type == GGML_TYPE_F16) {
+                    } else if (node->src[1]->type == GGML_TYPE_F16) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
                     }
-
-                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = 0;
-
                     const int64_t    D = node->src[0]->ne[0];
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    }
-
-                    if (node->src[1]->type == GGML_TYPE_F16) {
+                    } else if (node->src[1]->type == GGML_TYPE_F16) {
                         cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
                     }
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_WIN_PART:
-            case GGML_OP_WIN_UNPART:
-            case GGML_OP_GET_REL_POS:
-            case GGML_OP_MAP_UNARY:
-            case GGML_OP_MAP_BINARY:
-            case GGML_OP_MAP_CUSTOM1_F32:
-            case GGML_OP_MAP_CUSTOM2_F32:
-            case GGML_OP_MAP_CUSTOM3_F32:
-                {
-                    n_tasks = 1;
-                } break;
-            case GGML_OP_MAP_CUSTOM1:
-                {
-                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
-                    if (p->n_tasks == GGML_N_TASKS_MAX) {
-                        n_tasks = n_threads;
-                    } else {
-                        n_tasks = MIN(p->n_tasks, n_threads);
-                    }
-                } break;
-            case GGML_OP_MAP_CUSTOM2:
-                {
-                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
-                    if (p->n_tasks == GGML_N_TASKS_MAX) {
-                        n_tasks = n_threads;
-                    } else {
-                        n_tasks = MIN(p->n_tasks, n_threads);
-                    }
-                } break;
-            case GGML_OP_MAP_CUSTOM3:
-                {
-                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
-                    if (p->n_tasks == GGML_N_TASKS_MAX) {
-                        n_tasks = n_threads;
-                    } else {
-                        n_tasks = MIN(p->n_tasks, n_threads);
-                    }
                 } break;
+
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
                     n_tasks = n_threads;
 
-                    size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-                {
-                    n_tasks = n_threads;
-                } break;
-            case GGML_OP_NONE:
-                {
-                    n_tasks = 1;
+                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case GGML_OP_COUNT:
                 {
                     GGML_ASSERT(false);
                 } break;
+            default:
+                break;
         }
 
-
+        work_size = MAX(work_size, cur);
     }
 
     if (work_size > 0) {
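The net effect of this hunk group on ggml_graph_plan: the scratch estimate cur is declared once per node, each case only assigns it, unhandled ops fall through a default that leaves it at zero, and the work_size fold happens exactly once per iteration instead of being repeated inside every case. A condensed sketch of the resulting loop shape (one representative op, not the full list):

    int    n_tasks   = 0;
    size_t work_size = 0;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];

        size_t cur = 0; // scratch bytes for this node; stays 0 unless a case sets it

        switch (node->op) {
            case GGML_OP_ADD:
                n_tasks = n_threads;
                if (ggml_is_quantized(node->src[0]->type)) {
                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                }
                break;
            default:
                break; // ops with no scratch requirement
        }

        work_size = MAX(work_size, cur); // folded exactly once per node
    }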
@@ -16783,12 +15966,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         if (cplan->work_size > 0) {
             GGML_ASSERT(cplan->work_data);
         }
-
-        for (int i = 0; i < cgraph->n_nodes; ++i) {
-            if (cgraph->nodes[i]->op != GGML_OP_NONE) {
-                GGML_ASSERT(cplan->n_tasks[i] > 0);
-            }
-        }
     }
 
     const int n_threads = cplan->n_threads;
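With the per-node task counts gone from struct ggml_cplan, the only precondition left to assert is that a work buffer was supplied when one is needed. A typical caller sequence against this version's API (sketch; error handling elided):

    struct ggml_cplan cplan = ggml_graph_plan(graph, n_threads);
    if (cplan.work_size > 0) {
        cplan.work_data = malloc(cplan.work_size); // caller owns the scratch buffer
    }
    ggml_graph_compute(graph, &cplan);
    free(cplan.work_data);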
@@ -16861,16 +16038,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     return compute_status;
 }
 
-void ggml_graph_reset(struct ggml_cgraph * cgraph) {
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * grad = cgraph->grads[i];
-
-        if (grad) {
-            ggml_set_zero(grad);
-        }
-    }
-}
-
 void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
 
@@ -16997,12 +16164,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
         const uint32_t magic     = GGML_FILE_MAGIC;
         const uint32_t version   = GGML_FILE_VERSION;
         const uint32_t n_leafs   = cgraph->n_leafs;
-        const uint32_t nodes     = cgraph->n_nodes;
+        const uint32_t n_nodes   = cgraph->n_nodes;
 
         fwrite(&magic,     sizeof(uint32_t), 1, fout);
         fwrite(&version,   sizeof(uint32_t), 1, fout);
         fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
-        fwrite(&nodes,     sizeof(uint32_t), 1, fout);
+        fwrite(&n_nodes,   sizeof(uint32_t), 1, fout);
         fwrite(&size_eval, sizeof(uint64_t), 1, fout);
     }
 
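The exported header is five fixed-width fields written back to back, and the rename from nodes to n_nodes does not change the layout. A reader consumes them in the same order (sketch mirroring the fwrite sequence above; fin is an assumed open FILE *):

    uint32_t magic, version, n_leafs, n_nodes;
    uint64_t size_eval;

    fread(&magic,     sizeof(uint32_t), 1, fin);
    fread(&version,   sizeof(uint32_t), 1, fin);
    fread(&n_leafs,   sizeof(uint32_t), 1, fin);
    fread(&n_nodes,   sizeof(uint32_t), 1, fin);
    fread(&size_eval, sizeof(uint64_t), 1, fin);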
@@ -17090,7 +16257,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                     if (idx == -1) {
                         for (int k = 0; k < cgraph->n_nodes; ++k) {
                             if (args[j] == cgraph->nodes[k]) {
-                                idx = GGML_MAX_NODES + k;
+                                idx = cgraph->n_leafs + k;
                                 break;
                             }
                         }
@@ -17117,11 +16284,11 @@
     }
 }
 
-struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
     assert(*ctx_data == NULL);
     assert(*ctx_eval == NULL);
 
-    struct ggml_cgraph result = { 0 };
+    struct ggml_cgraph * result = NULL;
 
     struct ggml_tensor * data = NULL;
 
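ggml_graph_import now hands back a pointer to a graph allocated inside *ctx_eval instead of a ggml_cgraph by value, a source-level API break for downstream callers. Adapting looks like this (sketch; the contexts and file name are assumptions):

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    // before: struct ggml_cgraph graph = ggml_graph_import(fname, &ctx_data, &ctx_eval);
    struct ggml_cgraph * graph = ggml_graph_import(fname, &ctx_data, &ctx_eval);

    ggml_graph_print(graph); // members are now reached through the pointer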
@@ -17193,13 +16360,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
     const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
     const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
     const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
-
-    result.n_leafs = n_leafs;
-    result.n_nodes = n_nodes;
+    const int graph_size = MAX(n_leafs, n_nodes);
 
     // create the data context
     {
-        const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+        const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
 
         struct ggml_init_params params = {
             .mem_size   = size_eval + overhead,
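Because the imported graph now lives inside the evaluation context rather than on the caller's stack, the context must be sized for the graph object as well as the tensor metadata. The sizing arithmetic from the lines above, consolidated into one place (sketch):

    const int    graph_size = MAX(n_leafs, n_nodes);
    const size_t overhead   = (n_leafs + n_nodes)*ggml_tensor_overhead()
                            + ggml_graph_overhead_custom(graph_size, /*grads=*/false);
    const size_t mem_size   = size_eval + overhead; // becomes params.mem_size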
@@ -17215,6 +16380,12 @@
         }
     }
 
+    result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
+
+    result->n_leafs = n_leafs;
+    result->n_nodes = n_nodes;
+
+
     // leafs
     {
         uint32_t type;
@@ -17253,7 +16424,7 @@
                 tensor->nb[j] = nb[j];
             }
 
-            result.leafs[i] = tensor;
+            result->leafs[i] = tensor;
 
             ptr += ggml_nbytes(tensor);
 
@@ -17305,10 +16476,10 @@
                     continue;
                 }
 
-                if (arg_idx < GGML_MAX_NODES) {
-                    args[j] = result.leafs[arg_idx];
+                if (arg_idx < result->n_leafs) {
+                    args[j] = result->leafs[arg_idx];
                 } else {
-                    args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                    args[j] = result->nodes[arg_idx - result->n_leafs];
                 }
             }
 
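Serialized graph arguments are encoded as a single index: values below n_leafs address the leaf table, and anything above it addresses the node table after subtracting n_leafs. The branch above folded into a helper, for clarity only (not part of the source):

    static struct ggml_tensor * decode_arg(const struct ggml_cgraph * g, uint32_t arg_idx) {
        return arg_idx < (uint32_t) g->n_leafs
            ? g->leafs[arg_idx]
            : g->nodes[arg_idx - g->n_leafs];
    }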
@@ -17360,7 +16531,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
                 tensor->src[j] = args[j];
             }
 
-            result.nodes[i] = tensor;
+            result->nodes[i] = tensor;
 
             fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
         }
@@ -18265,10 +17436,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
         case GGML_OPT_ADAM:
             {
                 result = (struct ggml_opt_params) {
-                    .type      = GGML_OPT_ADAM,
-                    .n_threads = 1,
-                    .past      = 0,
-                    .delta     = 1e-5f,
+                    .type       = GGML_OPT_ADAM,
+                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
+                    .n_threads  = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
+                    .past       = 0,
+                    .delta      = 1e-5f,
 
                     .max_no_improvement = 100,
 
@@ -18295,10 +17467,11 @@
         case GGML_OPT_LBFGS:
             {
                 result = (struct ggml_opt_params) {
-                    .type      = GGML_OPT_LBFGS,
-                    .n_threads = 1,
-                    .past      = 0,
-                    .delta     = 1e-5f,
+                    .type       = GGML_OPT_LBFGS,
+                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
+                    .n_threads  = 1,
+                    .past       = 0,
+                    .delta      = 1e-5f,
 
                     .max_no_improvement = 0,
 
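Both optimizer presets gain a graph_size field, defaulting to GGML_DEFAULT_GRAPH_SIZE, so the optimizer's working graphs are no longer bound to a compile-time capacity. Callers that train larger graphs can raise it after fetching the defaults (usage sketch; 4096 is an arbitrary example value):

    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
    params.graph_size = 4096; // must cover the forward + backward graph
    params.n_threads  = 4;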
@@ -18440,14 +17613,11 @@ enum ggml_opt_result ggml_opt_resume(
         struct ggml_tensor * f) {
 
     // build forward + backward compute graphs
-    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
-    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
-
-    struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
-    struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
+    ggml_build_forward_expand(gf, f);
 
-    *gf = ggml_build_forward (f);
-    *gb = ggml_build_backward(ctx, gf, true);
+    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
+    ggml_build_backward_expand(ctx, gf, gb, true);
 
     return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
 }
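The removed lines carved a ggml_cgraph out of the data buffer of a throwaway I32 tensor, a sizing trick that cannot survive dynamically sized graphs. The replacement sequence, shown here as one unit, builds both graphs through the graph API proper:

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, /*grads=*/true);
    ggml_build_forward_expand(gf, f);

    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
    ggml_build_backward_expand(ctx, gf, gb, /*keep=*/true);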
@@ -18903,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
 
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18950,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                         case GGUF_TYPE_STRING:
                             {
                                 kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                     ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                 }
                             } break;
@@ -18978,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
 
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -19025,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // compute the total size of the data section, taking into account the alignment
     {
         ctx->size = 0;
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];
 
            const int64_t ne =
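Every gguf_init_from_file loop over a header count switches its induction variable from 32-bit to 64-bit. The counts being compared against (n_kv, n_tensors, array lengths) are 64-bit fields, so a 32-bit counter can wrap before ever reaching a huge or corrupt count and spin forever. A minimal illustration of the hazard, with a hypothetical value:

    #include <stdint.h>

    uint64_t n = (uint64_t) UINT32_MAX + 2;     // e.g. a corrupt n_tensors field
    for (uint32_t i = 0; i < n; ++i) { /*..*/ } // i wraps to 0 before reaching n: never terminates
    for (uint64_t i = 0; i < n; ++i) { /*..*/ } // 64-bit counter terminates correctly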
@@ -19094,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ggml_set_no_alloc(ctx_data, true);
 
         // create the tensors
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             const int64_t ne[GGML_MAX_DIMS] = {
                 ctx->infos[i].ne[0],
                 ctx->infos[i].ne[1],