llama_cpp 0.2.2 → 0.3.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +305 -133
- data/ext/llama_cpp/src/ggml-cuda.cu +367 -69
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +262 -291
- data/ext/llama_cpp/src/llama.h +49 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +14 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/ggml.c
CHANGED
```diff
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _GNU_SOURCE
+#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #include "ggml.h"
 
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <float.h>
 #include <limits.h>
+#include <stdarg.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -90,6 +91,11 @@ static int sched_yield (void) {
 #include <stdatomic.h>
 
 typedef void* thread_ret_t;
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -118,6 +124,30 @@ typedef void* thread_ret_t;
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL 2
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
@@ -130,6 +160,34 @@ typedef void* thread_ret_t;
 #define GGML_MEM_ALIGN 16
 #endif
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+//
+// end of logging block
+//
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
```
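The hunks above move the compile-time logging macros near the top of the file (the block is visibly vendored twice in this sync; identical macro redefinitions are legal in C). A minimal standalone sketch of how the `GGML_DEBUG` gating behaves — the default value of 0 is an assumption here, ggml sets its own default:

```c
// Minimal sketch (not part of the diff): GGML_PRINT_DEBUG_5 compiles to
// nothing unless the translation unit is built with a high enough level.
#include <stdio.h>

#ifndef GGML_DEBUG
#define GGML_DEBUG 0   // assumed default for this sketch
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

int main(void) {
    // emitted only when compiled with e.g. `cc -DGGML_DEBUG=5 demo.c`
    GGML_PRINT_DEBUG_5("processing node %d of %d\n", 3, 64);
    return 0;
}
```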
```diff
@@ -143,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
     if (result != 0) {
         // Handle allocation failure
+        const char *error_desc = "unknown allocation error";
+        switch (result) {
+            case EINVAL:
+                error_desc = "invalid alignment value";
+                break;
+            case ENOMEM:
+                error_desc = "insufficient memory";
+                break;
+        }
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
+            __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
```
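The new error path reports why the aligned allocation failed; unlike `malloc`, `posix_memalign` returns the error code directly instead of setting `errno`, which is why the code switches on `result`. A standalone sketch of the same pattern (the function name is illustrative, not from ggml):

```c
// Illustrative sketch of the pattern added above.
#define _POSIX_C_SOURCE 200112L
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static void * checked_aligned_alloc(size_t alignment, size_t size) {
    void * mem = NULL;
    const int result = posix_memalign(&mem, alignment, size);
    if (result != 0) {
        const char * desc = "unknown allocation error";
        switch (result) {
            case EINVAL: desc = "invalid alignment value"; break; // not a power-of-two multiple of sizeof(void*)
            case ENOMEM: desc = "insufficient memory";     break;
        }
        fprintf(stderr, "%s: %s (attempted to allocate %6.2f MB)\n",
                __func__, desc, size/(1024.0*1024.0));
        return NULL;
    }
    return mem;
}
```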
```diff
@@ -419,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
     }
 }
 
-
 //
 // timing
 //
@@ -482,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif
 
+
 //
 // cache line
 //
@@ -3529,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
     *s = 1.f/(*s);
 }
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
 //
 // data types
 //
@@ -3712,11 +3757,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "MAP_UNARY",
     "MAP_BINARY",
 
+    "MAP_CUSTOM1",
+    "MAP_CUSTOM2",
+    "MAP_CUSTOM3",
+
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3784,11 +3833,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x)",
     "f(x,y)",
 
+    "custom(x)",
+    "custom(x,y)",
+    "custom(x,y,z)",
+
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3819,12 +3872,31 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
+//
+// NUMA support
+//
+
+#define GGML_NUMA_MAX_NODES 8
+#define GGML_NUMA_MAX_CPUS 512
+
+struct ggml_numa_node {
+    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
+    uint32_t n_cpus;
+};
+
+struct ggml_numa_nodes {
+    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
+    uint32_t n_nodes;
+    uint32_t total_cpus; // hardware threads on system
+};
+
 //
 // ggml state
 //
 
 struct ggml_state {
     struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
+    struct ggml_numa_nodes numa;
 };
 
 // global state
@@ -3849,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
+#ifdef __linux__
+    struct stat st;
+    char path[256];
+    int rv;
+
+    // enumerate nodes
+    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.n_nodes;
+    }
+
+    // enumerate CPUs
+    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.total_cpus;
+    }
+
+    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+        g_state.numa.n_nodes = 0;
+        return;
+    }
+
+    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
+        struct ggml_numa_node * node = &g_state.numa.nodes[n];
+        GGML_PRINT_DEBUG("CPUs on node %u:", n);
+        node->n_cpus = 0;
+        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
+            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
+            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+            if (stat(path, &st) == 0) {
+                node->cpus[node->n_cpus++] = c;
+                GGML_PRINT_DEBUG(" %u", c);
+            }
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+
+    if (ggml_is_numa()) {
+        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
+        if (fptr != NULL) {
+            char buf[42];
+            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
+                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
+            }
+            fclose(fptr);
+        }
+    }
+#else
+    // TODO
+#endif
+}
+
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
```
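The NUMA probe walks `/sys/devices/system/node/node*` and `/sys/devices/system/cpu/cpu*` on Linux and only reports a NUMA system when more than one node is found; it also warns when the kernel's automatic NUMA balancing is on. A hypothetical caller sketch — the prototypes are assumed to come from this release's `ggml.h`, which the changeset also touches:

```c
// Hypothetical startup sequence: probe the topology once, then decide
// whether per-node thread pinning is worth enabling.
#include <stdbool.h>
#include <stdio.h>

void ggml_numa_init(void);   // assumed declared in ggml.h
bool ggml_is_numa(void);     // assumed declared in ggml.h

int main(void) {
    ggml_numa_init(); // idempotent: warns and returns if already initialized
    if (ggml_is_numa()) {
        printf("NUMA system detected: worker threads will be pinned per node\n");
    } else {
        printf("single node (or non-Linux): affinity calls become no-ops\n");
    }
    return 0;
}
```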
```diff
@@ -4105,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
         g_state = (struct ggml_state) {
             /*.contexts =*/ { { 0 } },
+            /*.numa =*/ {
+                .n_nodes = 0,
+                .total_cpus = 0,
+            },
         };
 
         for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4734,10 +4879,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
     return tensor;
 }
 
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+    va_end(args);
+    return tensor;
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    ggml_format_name(result, "%s (view)", src->name);
 
     result->nb[0] = src->nb[0];
     result->nb[1] = src->nb[1];
```
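`ggml_format_name` gives tensors printf-style names, and the view/copy/reshape builders in the following hunks use it so derived tensors inherit a readable lineage (a view of `token_embd` becomes `token_embd (view)`). The safety comes from `vsnprintf` writing into the fixed-size `name` buffer; a standalone sketch of the same logic:

```c
// Standalone sketch of the naming helper: vsnprintf never overruns the
// fixed buffer and always NUL-terminates, so long lineages truncate silently.
#include <stdarg.h>
#include <stdio.h>

struct toy_tensor { char name[32]; };  // illustrative; not ggml's layout

static struct toy_tensor * toy_format_name(struct toy_tensor * t, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vsnprintf(t->name, sizeof(t->name), fmt, args);
    va_end(args);
    return t;
}

int main(void) {
    struct toy_tensor t;
    toy_format_name(&t, "%s (view)", "token_embd");
    printf("%s\n", t.name); // -> "token_embd (view)"
    return 0;
}
```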
```diff
@@ -5899,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
 
     // make a view of the destination
     struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    if (strlen(b->name) > 0) {
+        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+    } else {
+        ggml_format_name(result, "%s (copy)", a->name);
+    }
 
     result->op = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5935,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_format_name(result, "%s (cont)", a->name);
 
     result->op = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5978,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6002,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
 
     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6027,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
 
     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6053,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
 
     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6081,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
 
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6105,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6141,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6183,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6227,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6276,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (permuted)", a->name);
 
     int ne[GGML_MAX_DIMS];
     int nb[GGML_MAX_DIMS];
@@ -6335,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (transposed)", a->name);
 
     result->ne[0] = a->ne[1];
     result->ne[1] = a->ne[0];
@@ -6603,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
         int n_past,
         int n_dims,
         int mode,
+        int n_ctx,
         bool inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6615,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;
 
     ggml_scratch_load(ctx);
 
@@ -6636,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
+        int mode,
+        int n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6645,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
+        int mode,
+        int n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
 }
 
 // ggml_rope_back
```
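This is a breaking signature change: `ggml_rope` and `ggml_rope_inplace` now take a fifth `n_ctx` argument, packed into the int32 parameter tensor alongside `n_past`, `n_dims`, and `mode`. A hedged call-site sketch (the tensor and variable names are illustrative, not from the gem):

```c
// Call-site impact sketch; Qcur, n_rot and n_ctx are placeholder names.
//
//   before: Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 0);
//   after:  Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 0, n_ctx);
//
// Judging by the compute kernels further down, n_ctx is only consulted when
// the GLM bit (mode & 4) is set; other modes carry it along unused.
```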
```diff
@@ -7063,9 +7242,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_UNARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7105,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_BINARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7134,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
+// ggml_map_custom1
+
+struct ggml_tensor * ggml_map_custom1_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_tensor * ggml_map_custom2_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_tensor * ggml_map_custom3_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+    result->opt[1] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
+}
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
```
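These builders only record a function pointer inside an extra `opt[0]` tensor (allocated in scratch space so it does not pollute the graph's main buffer); the forward pass later dispatches on `GGML_OP_MAP_CUSTOM*`. A sketch of registering a unary custom op, assuming the matching typedef from this version's `ggml.h` — the callback receives the destination first, then the source, and owns the whole computation:

```c
// Sketch, under the assumption that ggml.h declares:
//   typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
#include <math.h>
#include "ggml.h"   // assumed: provides the typedef and ggml_nelements()

static void my_softplus(struct ggml_tensor * dst, const struct ggml_tensor * src) {
    // the callback runs as a single task (n_tasks == 1), F32 only
    const float * x = (const float *) src->data;
    float       * y = (float *)       dst->data;
    for (int64_t i = 0; i < ggml_nelements(dst); ++i) {
        y[i] = log1pf(expf(x[i]));   // arbitrary example op
    }
}

// graph construction (illustrative):
//   struct ggml_tensor * out = ggml_map_custom1_f32(ctx, inp, my_softplus);
```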
```diff
@@ -12111,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12120,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12164,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12174,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
 
                 float theta = (float)p;
 
-                if (!is_neox) {
+                if (is_glm) {
+                    theta = MIN(p, n_ctx - 2);
+                    float block_theta = MAX(p - (n_ctx - 2), 0);
+                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
+                        const float cos_block_theta = cosf(block_theta);
+                        const float sin_block_theta = sinf(block_theta);
+
+                        theta *= theta_scale;
+                        block_theta *= theta_scale;
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        const float x0 = src[0];
+                        const float x1 = src[n_dims/2];
+                        const float x2 = src[n_dims];
+                        const float x3 = src[n_dims/2*3];
+
+                        dst_data[0]          = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims/2]   = x0*sin_theta + x1*cos_theta;
+                        dst_data[n_dims]     = x2*cos_block_theta - x3*sin_block_theta;
+                        dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
+                    }
+                } else if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
```
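The GLM branch (`mode & 4`) splits each group of four values into two independent 2-D rotations: the pair `(x0, x1)` rotates by a position angle capped at `n_ctx - 2`, while `(x2, x3)` rotates by a "block" angle that takes over once the position passes that cap; both angles decay by `theta_scale` per `i0` step. Written out:

```latex
\theta   = \min(p,\; n_{\mathrm{ctx}}-2), \qquad
\theta_b = \max(p-(n_{\mathrm{ctx}}-2),\; 0)

\begin{pmatrix} y_0 \\ y_1 \end{pmatrix} =
\begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \phantom{-}\cos\theta \end{pmatrix}
\begin{pmatrix} x_0 \\ x_1 \end{pmatrix},
\qquad
\begin{pmatrix} y_2 \\ y_3 \end{pmatrix} =
\begin{pmatrix} \cos\theta_b & -\sin\theta_b \\ \sin\theta_b & \phantom{-}\cos\theta_b \end{pmatrix}
\begin{pmatrix} x_2 \\ x_3 \end{pmatrix}
```

The f16 variant in the next hunks repeats this logic through the FP16/FP32 conversion macros; note that it chains with `} if (!is_neox) {` rather than `} else if`, so as written the non-neox f16 path also executes after the GLM block.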
```diff
@@ -12224,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12233,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12277,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12287,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
 
                 float theta = (float)p;
 
-                if (!is_neox) {
+                if (is_glm) {
+                    theta = MIN(p, n_ctx - 2);
+                    float block_theta = MAX(p - (n_ctx - 2), 0);
+                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
+                        const float cos_block_theta = cosf(block_theta);
+                        const float sin_block_theta = sinf(block_theta);
+
+                        theta *= theta_scale;
+                        block_theta *= theta_scale;
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+                        const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
+                        const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
+
+                        dst_data[0]          = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims/2]   = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
+                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
+                    }
+                } if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
@@ -13179,8 +13566,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     const int nk1 = ne01;
 
     // size of the convolution row - the kernel size unrolled across all channels
-
-    const int ew0 = ggml_up32(nk0*nk1*ne02);
+    const int ew0 = nk0*nk1*ne02;
 
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
@@ -14590,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
     }
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a);
+}
+
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b);
+}
+
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
+
+static void ggml_compute_forward_map_custom3(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_cross_entropy_loss
 
 static void ggml_compute_forward_cross_entropy_loss_f32(
@@ -14880,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         if (skip_cpu) {
             return;
         }
-        GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+        GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
         GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
@@ -15127,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
             }
             break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
+            }
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -15735,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 3);
+                    assert(ggml_nelements(src1) == 4);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode   = ((int32_t *) src1->data)[2];
+                    const int n_ctx  = ((int32_t *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode),
+                                mode,
+                                n_ctx),
                             inplace);
                 }
                 if (src1->grad) {
@@ -15933,6 +16447,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
        case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM3:
             {
                 GGML_ASSERT(false); // not supported
             } break;
@@ -16004,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
         }
 
         cgraph->leafs[cgraph->n_leafs] = node;
@@ -16013,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+            ggml_format_name(node, "node_%d", cgraph->n_nodes);
         }
 
         cgraph->nodes[cgraph->n_nodes] = node;
@@ -16167,68 +16684,173 @@ typedef pthread_t ggml_thread_t;
 
 #endif
 
+// Android's libc implementation "bionic" does not support setting affinity
+#if defined(__linux__) && !defined(__BIONIC__)
+void set_numa_thread_affinity(int thread_n, int n_threads) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    // run thread on node_num thread_n / (threads per node)
+    const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (size_t i = 0; i < node->n_cpus; ++i) {
+        CPU_SET_S(node->cpus[i], setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
+                strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+
+void clear_numa_thread_affinity(void) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
+        CPU_SET_S(i, setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
+                strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+#else
+// TODO: Windows etc.
+// (the linux implementation may also work on BSD, someone should test)
+void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+void clear_numa_thread_affinity(void) {}
+#endif
+
 struct ggml_compute_state_shared {
-    ggml_lock_t spin;
+    struct ggml_cgraph * cgraph;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;
 
     int n_threads;
 
     // synchronization primitives
-    atomic_int  n_ready;
-    atomic_bool has_work;
-    atomic_bool stop; // stop all threads
+    atomic_int n_active; // num active threads
+    atomic_int node_n;   // active graph node
 };
 
 struct ggml_compute_state {
     ggml_thread_t thrd;
-
-    struct ggml_compute_params params;
-    struct ggml_tensor * node;
-
+    int ith;
    struct ggml_compute_state_shared * shared;
 };
 
+static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
+    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
+    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
+
+    node->perf_runs++;
+    node->perf_cycles  += cycles_cur;
+    node->perf_time_us += time_us_cur;
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_cgraph * cgraph = state->shared->cgraph;
 
     const int n_threads = state->shared->n_threads;
+    set_numa_thread_affinity(state->ith, n_threads);
+
+    int node_n = -1;
 
     while (true) {
-        if (
-
-
-
-
-
-
-
-
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            // all other threads are finished and spinning
+            // do finalize and init here so we don't have synchronize again
+            struct ggml_compute_params params = {
+                /*.type  =*/ GGML_TASK_FINALIZE,
+                /*.ith   =*/ 0,
+                /*.nth   =*/ 0,
+                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            };
+
+            if (node_n != -1) {
+                /* FINALIZE */
+                struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
+                params.nth = node->n_tasks;
+                ggml_compute_forward(&params, node);
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
-        }
 
-
+            // distribute new work or execute it direct if 1T
+            while (++node_n < cgraph->n_nodes) {
+                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
+
+                struct ggml_tensor * node = cgraph->nodes[node_n];
+
+                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
+                state->shared->perf_node_start_time_us = ggml_perf_time_us();
 
-
-
-
-
+                /* INIT */
+                params.type = GGML_TASK_INIT;
+                params.nth  = node->n_tasks;
+                ggml_compute_forward(&params, node);
+
+                if (node->n_tasks == 1) {
+                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
+                    // they do something more efficient than spinning (?)
+                    params.type = GGML_TASK_COMPUTE;
+                    ggml_compute_forward(&params, node);
+
+                    params.type = GGML_TASK_FINALIZE;
+                    ggml_compute_forward(&params, node);
+                    ggml_graph_compute_perf_stats_node(node, state->shared);
+                } else {
+                    break;
+                }
             }
-
-
+
+            atomic_store(&state->shared->n_active, n_threads);
+            atomic_store(&state->shared->node_n,   node_n);
+        } else {
+            // wait for other threads to finish
+            const int last = node_n;
+            do {
+                sched_yield();
+                node_n = atomic_load(&state->shared->node_n);
+            } while (node_n == last);
         }
 
         // check if we should stop
-        if (
-            break;
-        }
+        if (node_n >= cgraph->n_nodes) break;
 
-
-
-            ggml_compute_forward(&state->params, state->node);
-        }
+        /* COMPUTE */
+        struct ggml_tensor * node = cgraph->nodes[node_n];
 
-
-
-
+        struct ggml_compute_params params = {
+            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.ith   =*/ state->ith,
+            /*.nth   =*/ node->n_tasks,
+            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+        };
+
+        if (state->ith < node->n_tasks) {
+            ggml_compute_forward(&params, node);
         }
     }
 
```
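The rewrite replaces the old per-node fork/join protocol (the `n_ready`/`has_work`/`stop` flags plus a spin lock) with a single long-lived pool: whichever thread decrements `n_active` to zero finalizes the current node, initializes the next, and publishes it through `node_n`, while the others spin on `node_n` with `sched_yield()`. A condensed, self-contained sketch of that handoff idiom (not the full scheduler):

```c
// Condensed illustration of the n_active/node_n handoff used above.
#include <sched.h>
#include <stdatomic.h>

struct shared {
    atomic_int n_active; // threads still working on the current node
    atomic_int node_n;   // index of the node currently being computed
};

// Called by every worker when it finishes its slice of the current node.
static void advance(struct shared * sh, int n_threads) {
    const int last = atomic_load(&sh->node_n);
    if (atomic_fetch_sub(&sh->n_active, 1) == 1) {
        // last thread out becomes the coordinator:
        // (finalize the current node and init the next one here)
        atomic_store(&sh->n_active, n_threads);
        atomic_store(&sh->node_n, last + 1);  // publish the new node
    } else {
        while (atomic_load(&sh->node_n) == last) {
            sched_yield();  // spin until the coordinator publishes
        }
    }
}
```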
```diff
@@ -16239,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
-        /*.spin      =*/ GGML_LOCK_INITIALIZER,
-        /*.n_threads =*/ n_threads,
-        /*.n_ready   =*/ 0,
-        /*.has_work  =*/ false,
-        /*.stop      =*/ false,
+        /*.cgraph                  =*/ cgraph,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
     };
-    struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
-
-    // create thread pool
-    if (n_threads > 1) {
-        ggml_lock_init(&state_shared.spin);
-
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd = 0,
-                .params = {
-                    .type = GGML_TASK_COMPUTE,
-                    .ith = j + 1,
-                    .nth = n_threads,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                },
-                .node = NULL,
-                .shared = &state_shared,
-            };
-
-            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
     // initialize tasks + work buffer
     {
```
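Design note on the hunk above: the worker array now covers all `n_threads` slots (stack-allocated with `alloca`, so it lives exactly as long as `ggml_graph_compute`), and thread creation moves to later in the function. Slot 0 is claimed by the calling thread itself, which joins the pool via `ggml_graph_compute_thread(&workers[0])` further down instead of coordinating the pool from outside.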
```diff
@@ -16415,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_SCALE:
                 {
-                    node->n_tasks = n_threads;
+                    node->n_tasks = 1;
                 } break;
             case GGML_OP_SET:
             case GGML_OP_CONT:
@@ -16574,6 +17171,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1:
+            case GGML_OP_MAP_CUSTOM2:
+            case GGML_OP_MAP_CUSTOM3:
                 {
                     node->n_tasks = 1;
                 } break;
@@ -16616,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             }
         }
 
-
-
-
-
-
-
-
-
-        // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
-        //if (node->grad == NULL && node->perf_runs > 0) {
-        //    continue;
-        //}
-
-        const int64_t perf_node_start_cycles  = ggml_perf_cycles();
-        const int64_t perf_node_start_time_us = ggml_perf_time_us();
-
-        // INIT
-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_INIT,
-            /*.ith   =*/ 0,
-            /*.nth   =*/ node->n_tasks,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
-        };
-
-        ggml_compute_forward(&params, node);
-
-        // COMPUTE
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
-        }
-
-        params.type = GGML_TASK_COMPUTE;
-        ggml_compute_forward(&params, node);
-
-        // wait for thread pool
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-        }
-
-        // FINALIZE
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_FINALIZE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            workers[j] = (struct ggml_compute_state) {
+                .thrd   = 0,
+                .ith    = j,
+                .shared = &state_shared,
+            };
 
-
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
         }
+    }
+    workers[0].ith = 0;
+    workers[0].shared = &state_shared;
 
-
-
-
-        // wait for thread pool
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-        }
+    const int64_t perf_start_cycles  = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();
 
-
-
-        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_node_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
 
-
-
-        node->perf_time_us += perf_time_us_cur;
-        }
-    }
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
 
     // join thread pool
     if (n_threads > 1) {
-
-
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            int rc = ggml_thread_join(workers[j].thrd, NULL);
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
             GGML_ASSERT(rc == 0);
-            UNUSED(rc);
         }
-
-        ggml_lock_destroy(&state_shared.spin);
     }
 
     // performance stats (graph)
@@ -17397,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
     return NULL;
 }
 
+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
+    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+            gparent0 ? (void *) gparent0 : (void *) parent,
+            gparent0 ? "g" : "x",
+            gparent ? (void *) gparent : (void *) node,
+            gparent ? "g" : "x",
+            gparent ? "empty" : "vee",
+            gparent ? "dashed" : "solid",
+            label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+            (void *) parent, "x",
+            (void *) node, "x",
+            label);
+}
+
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];
 
```
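The two helpers deduplicate the edge-printing code, and the loops further down use them to add labeled `opt %d` edges for the new custom-op operand slots. Typical use of the dump (a sketch; the filenames are examples and `dot` is standard Graphviz):

```c
// Sketch: write the graph to DOT, then render it offline.
//   ggml_graph_dump_dot(&gf, NULL, "ggml.dot");
// shell:
//   dot -Tpng ggml.dot -o ggml.png
```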
```diff
@@ -17432,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             (void *) node, color);
 
     if (strlen(node->name) > 0) {
-        fprintf(fp, "%s |", node->name);
+        fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+    } else {
+        fprintf(fp, "(%s)|", ggml_type_name(node->type));
     }
 
     if (node->n_dims == 2) {
@@ -17441,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
     }
 
-
     if (node->grad) {
         fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
     } else {
@@ -17460,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             (void *) node, color);
 
     if (strlen(node->name) > 0) {
-
+        fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+    } else {
+        fprintf(fp, "(%s)|", ggml_type_name(node->type));
     }
-
-
-
-
-
-
+
+    fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+    if (ggml_nelements(node) < 5) {
+        fprintf(fp, " | (");
+        for (int j = 0; j < ggml_nelements(node); j++) {
+            if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+            }
+            else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+                fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+            }
+            else {
+                fprintf(fp, "#");
+            }
+            if (j < ggml_nelements(node) - 1) {
+                fprintf(fp, ", ");
+            }
         }
-
-        else {
-            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+        fprintf(fp, ")");
     }
     fprintf(fp, "\"; ]\n");
 }
@@ -17479,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
 
-        struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
-
         if (node->src0) {
-            struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
-                    parent0 ? (void *) parent0 : (void *) node->src0,
-                    parent0 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
         }
 
         if (node->src1) {
-            struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
-                    parent1 ? (void *) parent1 : (void *) node->src1,
-                    parent1 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
+            }
         }
     }
 
@@ -17510,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         struct ggml_tensor * node = gb->leafs[i];
 
         if (node->src0) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
-                    (void *) node->src0, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
        }
 
         if (node->src1) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
-                    (void *) node->src1, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
+            }
         }
     }
 
```