llama_cpp 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +319 -52
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +138 -72
- data/ext/llama_cpp/src/llama.h +33 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
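The bundled ggml.c/ggml.h in this release add user-defined map operators (GGML_OP_MAP_CUSTOM1/2/3). As a rough sketch of how the single-input variant might be called from C: the callback signature is inferred from the `fun(dst, a)` call visible in the diff below (the typedef itself lives in ggml.h, which is not shown here), and the graph setup uses the usual ggml helpers rather than anything specific to this gem.

```c
#include <stdio.h>
#include "ggml.h"

// illustrative callback: write src squared into dst (both assumed F32, same shape)
static void square_f32(struct ggml_tensor * dst, const struct ggml_tensor * src) {
    const float * x = (const float *) src->data;
    float       * y = (float *) dst->data;
    for (int64_t i = 0; i < ggml_nelements(dst); ++i) {
        y[i] = x[i] * x[i];
    }
}

int main(void) {
    // small scratch context for the demo graph
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 3.0f);

    // new in the ggml bundled with 0.3.0: run a custom element-wise op as a graph node
    struct ggml_tensor * b = ggml_map_custom1_f32(ctx, a, square_f32);

    struct ggml_cgraph gf = ggml_build_forward(b);
    ggml_graph_compute(ctx, &gf);

    printf("%f\n", ggml_get_f32_1d(b, 0)); // expected: 9.0
    ggml_free(ctx);
    return 0;
}
```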
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define
+#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #include "ggml.h"
 
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <float.h>
 #include <limits.h>
+#include <stdarg.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -90,6 +91,11 @@ static int sched_yield (void) {
 #include <stdatomic.h>
 
 typedef void* thread_ret_t;
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -118,6 +124,30 @@ typedef void* thread_ret_t;
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
@@ -130,6 +160,34 @@ typedef void* thread_ret_t;
 #define GGML_MEM_ALIGN 16
 #endif
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+//
+// end of logging block
+//
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
@@ -143,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
     if (result != 0) {
         // Handle allocation failure
+        const char *error_desc = "unknown allocation error";
+        switch (result) {
+            case EINVAL:
+                error_desc = "invalid alignment value";
+                break;
+            case ENOMEM:
+                error_desc = "insufficient memory";
+                break;
+        }
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
+            __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
@@ -419,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
     }
 }
 
-
 //
 // timing
 //
@@ -482,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif
 
+
 //
 // cache line
 //
@@ -3529,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
     *s = 1.f/(*s);
 }
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
 //
 // data types
 //
@@ -3712,11 +3757,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "MAP_UNARY",
     "MAP_BINARY",
 
+    "MAP_CUSTOM1",
+    "MAP_CUSTOM2",
+    "MAP_CUSTOM3",
+
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3784,11 +3833,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x)",
     "f(x,y)",
 
+    "custom(x)",
+    "custom(x,y)",
+    "custom(x,y,z)",
+
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3819,12 +3872,31 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
+//
+// NUMA support
+//
+
+#define GGML_NUMA_MAX_NODES 8
+#define GGML_NUMA_MAX_CPUS 512
+
+struct ggml_numa_node {
+    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
+    uint32_t n_cpus;
+};
+
+struct ggml_numa_nodes {
+    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
+    uint32_t n_nodes;
+    uint32_t total_cpus; // hardware threads on system
+};
+
 //
 // ggml state
 //
 
 struct ggml_state {
     struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
+    struct ggml_numa_nodes numa;
 };
 
 // global state
@@ -3849,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
+#ifdef __linux__
+    struct stat st;
+    char path[256];
+    int rv;
+
+    // enumerate nodes
+    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.n_nodes;
+    }
+
+    // enumerate CPUs
+    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.total_cpus;
+    }
+
+    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+        g_state.numa.n_nodes = 0;
+        return;
+    }
+
+    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
+        struct ggml_numa_node * node = &g_state.numa.nodes[n];
+        GGML_PRINT_DEBUG("CPUs on node %u:", n);
+        node->n_cpus = 0;
+        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
+            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
+            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+            if (stat(path, &st) == 0) {
+                node->cpus[node->n_cpus++] = c;
+                GGML_PRINT_DEBUG(" %u", c);
+            }
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+
+    if (ggml_is_numa()) {
+        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
+        if (fptr != NULL) {
+            char buf[42];
+            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
+                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
+            }
+            fclose(fptr);
+        }
+    }
+#else
+    // TODO
+#endif
+}
+
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
@@ -4105,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
         g_state = (struct ggml_state) {
             /*.contexts =*/ { { 0 } },
+            /*.numa =*/ {
+                .n_nodes = 0,
+                .total_cpus = 0,
+            },
         };
 
         for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4734,10 +4879,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
     return tensor;
 }
 
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+    va_end(args);
+    return tensor;
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    ggml_format_name(result, "%s (view)", src->name);
 
     result->nb[0] = src->nb[0];
     result->nb[1] = src->nb[1];
@@ -5899,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
 
     // make a view of the destination
     struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    if (strlen(b->name) > 0) {
+        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+    } else {
+        ggml_format_name(result, "%s (copy)", a->name);
+    }
 
     result->op = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5935,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_format_name(result, "%s (cont)", a->name);
 
     result->op = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5978,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6002,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6027,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6053,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6081,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6105,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6141,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6183,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6227,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6276,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (permuted)", a->name);
 
     int ne[GGML_MAX_DIMS];
     int nb[GGML_MAX_DIMS];
@@ -6335,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (transposed)", a->name);
 
     result->ne[0] = a->ne[1];
     result->ne[1] = a->ne[0];
@@ -6603,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
         int                   n_past,
         int                   n_dims,
         int                   mode,
+        int                   n_ctx,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6615,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;
 
     ggml_scratch_load(ctx);
 
@@ -6636,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
         struct ggml_tensor  * a,
         int                   n_past,
         int                   n_dims,
-        int                   mode
-
+        int                   mode,
+        int                   n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6645,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
         struct ggml_tensor  * a,
         int                   n_past,
         int                   n_dims,
-        int                   mode
-
+        int                   mode,
+        int                   n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
 }
 
 // ggml_rope_back
@@ -7063,9 +7242,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_UNARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7105,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_BINARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7134,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
+// ggml_map_custom1
+
+struct ggml_tensor * ggml_map_custom1_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_tensor * ggml_map_custom2_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_tensor * ggml_map_custom3_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+    result->opt[1] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
+}
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
@@ -12111,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) ==
+    GGML_ASSERT(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12120,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12164,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12174,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
 
                 float theta = (float)p;
 
-                if (
+                if (is_glm) {
+                    theta = MIN(p, n_ctx - 2);
+                    float block_theta = MAX(p - (n_ctx - 2), 0);
+                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
+                        const float cos_block_theta = cosf(block_theta);
+                        const float sin_block_theta = sinf(block_theta);
+
+                        theta *= theta_scale;
+                        block_theta *= theta_scale;
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        const float x0 = src[0];
+                        const float x1 = src[n_dims/2];
+                        const float x2 = src[n_dims];
+                        const float x3 = src[n_dims/2*3];
+
+                        dst_data[0]          = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims/2]   = x0*sin_theta + x1*cos_theta;
+                        dst_data[n_dims]     = x2*cos_block_theta - x3*sin_block_theta;
+                        dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
+                    }
+                } else if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
@@ -12224,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) ==
+    GGML_ASSERT(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12233,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12277,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12287,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
 
                 float theta = (float)p;
 
-                if (
+                if (is_glm) {
+                    theta = MIN(p, n_ctx - 2);
+                    float block_theta = MAX(p - (n_ctx - 2), 0);
+                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
+                        const float cos_block_theta = cosf(block_theta);
+                        const float sin_block_theta = sinf(block_theta);
+
+                        theta *= theta_scale;
+                        block_theta *= theta_scale;
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+                        const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
+                        const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
+
+                        dst_data[0]          = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims/2]   = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
+                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
+                    }
+                } if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
@@ -13179,8 +13566,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     const int nk1 = ne01;
 
     // size of the convolution row - the kernel size unrolled across all channels
-
-    const int ew0 = ggml_up32(nk0*nk1*ne02);
+    const int ew0 = nk0*nk1*ne02;
 
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
@@ -14590,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
     }
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a);
+}
+
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b);
+}
+
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
+
+static void ggml_compute_forward_map_custom3(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_cross_entropy_loss
 
 static void ggml_compute_forward_cross_entropy_loss_f32(
@@ -14880,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     if (skip_cpu) {
         return;
     }
-    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
@@ -15127,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
             }
             break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
+            }
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -15735,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) ==
+                    assert(ggml_nelements(src1) == 4);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode   = ((int32_t *) src1->data)[2];
+                    const int n_ctx  = ((int32_t *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode
+                                mode,
+                                n_ctx),
                             inplace);
                 }
                 if (src1->grad) {
@@ -15933,6 +16447,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM3:
             {
                 GGML_ASSERT(false); // not supported
             } break;
@@ -16004,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-
+            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
         }
 
         cgraph->leafs[cgraph->n_leafs] = node;
@@ -16013,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-
+            ggml_format_name(node, "node_%d", cgraph->n_nodes);
         }
 
         cgraph->nodes[cgraph->n_nodes] = node;
@@ -16167,68 +16684,173 @@ typedef pthread_t ggml_thread_t;
 
 #endif
 
+// Android's libc implementation "bionic" does not support setting affinity
+#if defined(__linux__) && !defined(__BIONIC__)
+void set_numa_thread_affinity(int thread_n, int n_threads) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    // run thread on node_num thread_n / (threads per node)
+    const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (size_t i = 0; i < node->n_cpus; ++i) {
+        CPU_SET_S(node->cpus[i], setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
+                    strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+
+void clear_numa_thread_affinity(void) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
+        CPU_SET_S(i, setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
+            strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+#else
+// TODO: Windows etc.
+// (the linux implementation may also work on BSD, someone should test)
+void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+void clear_numa_thread_affinity(void) {}
+#endif
+
 struct ggml_compute_state_shared {
-
+    struct ggml_cgraph * cgraph;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;
 
     int n_threads;
 
     // synchronization primitives
-    atomic_int
-
-    atomic_bool stop; // stop all threads
+    atomic_int n_active; // num active threads
+    atomic_int node_n;   // active graph node
 };
 
 struct ggml_compute_state {
     ggml_thread_t thrd;
-
-    struct ggml_compute_params params;
-    struct ggml_tensor * node;
-
+    int ith;
     struct ggml_compute_state_shared * shared;
 };
 
+static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
+    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
+    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
+
+    node->perf_runs++;
+    node->perf_cycles  += cycles_cur;
+    node->perf_time_us += time_us_cur;
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_cgraph * cgraph = state->shared->cgraph;
 
     const int n_threads = state->shared->n_threads;
+    set_numa_thread_affinity(state->ith, n_threads);
+
+    int node_n = -1;
 
     while (true) {
-        if (
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            // all other threads are finished and spinning
+            // do finalize and init here so we don't have synchronize again
+            struct ggml_compute_params params = {
+                /*.type  =*/ GGML_TASK_FINALIZE,
+                /*.ith   =*/ 0,
+                /*.nth   =*/ 0,
+                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            };
+
+            if (node_n != -1) {
+                /* FINALIZE */
+                struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
+                params.nth = node->n_tasks;
+                ggml_compute_forward(&params, node);
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
 
+            // distribute new work or execute it direct if 1T
+            while (++node_n < cgraph->n_nodes) {
+                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
+
+                struct ggml_tensor * node = cgraph->nodes[node_n];
+
+                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
+                state->shared->perf_node_start_time_us = ggml_perf_time_us();
 
+                /* INIT */
+                params.type = GGML_TASK_INIT;
+                params.nth  = node->n_tasks;
+                ggml_compute_forward(&params, node);
+
+                if (node->n_tasks == 1) {
+                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
+                    // they do something more efficient than spinning (?)
+                    params.type = GGML_TASK_COMPUTE;
+                    ggml_compute_forward(&params, node);
+
+                    params.type = GGML_TASK_FINALIZE;
+                    ggml_compute_forward(&params, node);
+                    ggml_graph_compute_perf_stats_node(node, state->shared);
+                } else {
+                    break;
+                }
             }
 
+            atomic_store(&state->shared->n_active, n_threads);
+            atomic_store(&state->shared->node_n,   node_n);
+        } else {
+            // wait for other threads to finish
+            const int last = node_n;
+            do {
+                sched_yield();
+                node_n = atomic_load(&state->shared->node_n);
+            } while (node_n == last);
         }
 
         // check if we should stop
-        if (
-            break;
-        }
+        if (node_n >= cgraph->n_nodes) break;
 
-        ggml_compute_forward(&state->params, state->node);
-    }
+        /* COMPUTE */
+        struct ggml_tensor * node = cgraph->nodes[node_n];
 
+        struct ggml_compute_params params = {
+            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.ith   =*/ state->ith,
+            /*.nth   =*/ node->n_tasks,
+            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+        };
+
+        if (state->ith < node->n_tasks) {
+            ggml_compute_forward(&params, node);
         }
     }
 
@@ -16239,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
-        /*.
-        /*.
-        /*.
-        /*.
-        /*.
+        /*.cgraph                  =*/ cgraph,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
     };
-    struct ggml_compute_state * workers =
-
-    // create thread pool
-    if (n_threads > 1) {
-        ggml_lock_init(&state_shared.spin);
-
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd = 0,
-                .params = {
-                    .type = GGML_TASK_COMPUTE,
-                    .ith = j + 1,
-                    .nth = n_threads,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                },
-                .node = NULL,
-                .shared = &state_shared,
-            };
-
-            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
     // initialize tasks + work buffer
     {
@@ -16415,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_SCALE:
                 {
-                    node->n_tasks =
+                    node->n_tasks = 1;
                 } break;
             case GGML_OP_SET:
             case GGML_OP_CONT:
@@ -16574,6 +17171,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1:
+            case GGML_OP_MAP_CUSTOM2:
+            case GGML_OP_MAP_CUSTOM3:
                 {
                     node->n_tasks = 1;
                 } break;
@@ -16616,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     }
 }
 
-    // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
-    //if (node->grad == NULL && node->perf_runs > 0) {
-    //    continue;
-    //}
-
-    const int64_t perf_node_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_node_start_time_us = ggml_perf_time_us();
-
-    // INIT
-    struct ggml_compute_params params = {
-        /*.type  =*/ GGML_TASK_INIT,
-        /*.ith   =*/ 0,
-        /*.nth   =*/ node->n_tasks,
-        /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-        /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
-    };
-
-    ggml_compute_forward(&params, node);
-
-    // COMPUTE
-    if (node->n_tasks > 1) {
-        if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-            atomic_store(&state_shared.has_work, false);
-        }
-
-        while (atomic_load(&state_shared.has_work)) {
-            ggml_lock_lock  (&state_shared.spin);
-            ggml_lock_unlock(&state_shared.spin);
-        }
-
-        // launch thread pool
-        for (int j = 0; j < n_threads - 1; j++) {
-            workers[j].params = (struct ggml_compute_params) {
-                .type  = GGML_TASK_COMPUTE,
-                .ith   = j + 1,
-                .nth   = node->n_tasks,
-                .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                .wdata = cgraph->work ? cgraph->work->data : NULL,
-            };
-            workers[j].node = node;
-        }
-
-        atomic_fetch_sub(&state_shared.n_ready, 1);
-
-        while (atomic_load(&state_shared.n_ready) > 0) {
-            ggml_lock_lock  (&state_shared.spin);
-            ggml_lock_unlock(&state_shared.spin);
-        }
-
-        atomic_store(&state_shared.has_work, true);
-    }
-
-    params.type = GGML_TASK_COMPUTE;
-    ggml_compute_forward(&params, node);
-
-    // wait for thread pool
 ...
-    // FINALIZE
-    if (node->n_tasks > 1) {
 ...
-    }
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            workers[j] = (struct ggml_compute_state) {
+                .thrd   = 0,
+                .ith = j,
+                .shared = &state_shared,
+            };
 
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
         }
+    }
+    workers[0].ith = 0;
+    workers[0].shared = &state_shared;
 
+    const int64_t perf_start_cycles  = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();
 
-        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_node_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
 
-        node->perf_time_us += perf_time_us_cur;
-    }
-}
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
 
     // join thread pool
     if (n_threads > 1) {
-        for (int j = 0; j < n_threads - 1; j++) {
-            int rc = ggml_thread_join(workers[j].thrd, NULL);
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
             GGML_ASSERT(rc == 0);
-            UNUSED(rc);
         }
-
-        ggml_lock_destroy(&state_shared.spin);
     }
 
     // performance stats (graph)
@@ -17397,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
     return NULL;
 }
 
+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+            gparent0 ? (void *) gparent0 : (void *) parent,
+            gparent0 ? "g" : "x",
+            gparent ? (void *) gparent : (void *) node,
+            gparent ? "g" : "x",
+            gparent ? "empty" : "vee",
+            gparent ? "dashed" : "solid",
+            label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+            (void *) parent, "x",
+            (void *) node, "x",
+            label);
+}
+
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];
 
@@ -17432,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             (void *) node, color);
 
         if (strlen(node->name) > 0) {
-            fprintf(fp, "%s |", node->name);
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
 
         if (node->n_dims == 2) {
@@ -17441,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
         }
 
-
         if (node->grad) {
             fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
         } else {
@@ -17460,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             (void *) node, color);
 
         if (strlen(node->name) > 0) {
-
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
-
-
+
+        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+        if (ggml_nelements(node) < 5) {
+            fprintf(fp, " | (");
+            for (int j = 0; j < ggml_nelements(node); j++) {
+                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+                }
+                else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+                    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+                }
+                else {
+                    fprintf(fp, "#");
+                }
+                if (j < ggml_nelements(node) - 1) {
+                    fprintf(fp, ", ");
+                }
             }
-
-        else {
-            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+            fprintf(fp, ")");
         }
         fprintf(fp, "\"; ]\n");
     }
@@ -17479,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
 
-        struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
-
         if (node->src0) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
-                    parent0 ? (void *) parent0 : (void *) node->src0,
-                    parent0 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
         }
 
         if (node->src1) {
-
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
+            }
         }
     }
 
@@ -17510,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         struct ggml_tensor * node = gb->leafs[i];
 
         if (node->src0) {
-                    (void *) node->src0, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
         }
 
         if (node->src1) {
-
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
+            }
         }
     }
 
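The RoPE hunks above add a fourth argument, n_ctx, to ggml_rope/ggml_rope_inplace, plus a GLM-style rotation selected by bit 2 of mode (mode & 4) that uses n_ctx to split the rotation into a position part and a block part. A minimal before/after sketch of a caller; the tensor, the mode value, and the variable names are placeholders, not taken from this gem's code:

```c
// before (0.2.2's bundled ggml): position, rotation dims, mode
// cur = ggml_rope_inplace(ctx, cur, n_past, n_rot, 0);

// after (0.3.0's bundled ggml): a context length is passed as well;
// it is only consulted when the GLM bit (mode & 4) is set
cur = ggml_rope_inplace(ctx, cur, n_past, n_rot, 0, n_ctx);

// GLM-style RoPE would set mode bit 2 (illustrative, not this gem's default):
// cur = ggml_rope_inplace(ctx, cur, n_past, n_rot, 4, n_ctx);
```

The NUMA support added above is opt-in: ggml_numa_init() probes /sys/devices/system/node on Linux, ggml_is_numa() returns true only when more than one node was found, and in that case the compute threads pin themselves round-robin across nodes. A hedged sketch of enabling it before any graph computation:

```c
// enable the new NUMA-aware thread affinity before building/computing graphs
ggml_numa_init();      // probes nodes and CPUs under /sys on Linux; a no-op elsewhere
if (ggml_is_numa()) {
    // worker threads created by ggml_graph_compute() will be pinned per NUMA node
}
```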