whisper.rn 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/cpp/ggml-alloc.c +264 -126
- package/cpp/ggml-backend-impl.h +4 -1
- package/cpp/ggml-backend-reg.cpp +13 -5
- package/cpp/ggml-backend.cpp +207 -17
- package/cpp/ggml-backend.h +17 -1
- package/cpp/ggml-cpu/amx/amx.cpp +4 -2
- package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/cpp/ggml-cpu/arch-fallback.h +0 -4
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
- package/cpp/ggml-cpu/ggml-cpu.c +48 -41
- package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/ggml-cpu/ops.cpp +518 -767
- package/cpp/ggml-cpu/ops.h +2 -0
- package/cpp/ggml-cpu/simd-mappings.h +88 -59
- package/cpp/ggml-cpu/vec.cpp +161 -20
- package/cpp/ggml-cpu/vec.h +400 -51
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +43 -10
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
- package/cpp/ggml-metal/ggml-metal-device.h +226 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +40 -40
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-quants.c +1 -0
- package/cpp/ggml.c +175 -13
- package/cpp/ggml.h +84 -5
- package/cpp/jsi/RNWhisperJSI.cpp +2 -0
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/whisper.cpp +85 -70
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisperVadContext.mm +14 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +1 -1
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6779
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-backend.cpp
CHANGED
@@ -19,9 +19,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <string>
-#include <vector>
 #include <algorithm>
+#include <vector>

 #ifdef __APPLE__
 #include <sys/types.h>
@@ -32,6 +31,7 @@
 // backend buffer type

 const char * wsp_ggml_backend_buft_name(wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(buft);
     return buft->iface.get_name(buft);
 }

@@ -41,14 +41,17 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_bu
         return wsp_ggml_backend_buffer_init(buft, {}, NULL, 0);
     }

+    WSP_GGML_ASSERT(buft);
     return buft->iface.alloc_buffer(buft, size);
 }

 size_t wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(buft);
     return buft->iface.get_alignment(buft);
 }

 size_t wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(buft);
     // get_max_size is optional, defaults to SIZE_MAX
     if (buft->iface.get_max_size) {
         return buft->iface.get_max_size(buft);
@@ -57,6 +60,7 @@ size_t wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_type_t buft) {
 }

 size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_tensor * tensor) {
+    WSP_GGML_ASSERT(buft);
     // get_alloc_size is optional, defaults to wsp_ggml_nbytes
     if (buft->iface.get_alloc_size) {
         size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -67,6 +71,7 @@ size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft,
 }

 bool wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(buft);
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);
     }
@@ -74,6 +79,7 @@ bool wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_type_t buft) {
 }

 wsp_ggml_backend_dev_t wsp_ggml_backend_buft_get_device(wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(buft);
     return buft->device;
 }

@@ -111,10 +117,12 @@ void wsp_ggml_backend_buffer_free(wsp_ggml_backend_buffer_t buffer) {
 }

 size_t wsp_ggml_backend_buffer_get_size(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     return buffer->size;
 }

 void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     // get_base is optional if the buffer is zero-sized
     if (buffer->size == 0) {
         return NULL;
@@ -128,6 +136,7 @@ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
 }

 enum wsp_ggml_status wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
+    WSP_GGML_ASSERT(buffer);
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
         return buffer->iface.init_tensor(buffer, tensor);
@@ -136,6 +145,7 @@ enum wsp_ggml_status wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer
 }

 void wsp_ggml_backend_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+    WSP_GGML_ASSERT(buffer);
     // clear is optional if the buffer is zero-sized
     if (buffer->size == 0) {
         return;
@@ -161,6 +171,7 @@ bool wsp_ggml_backend_buffer_is_host(wsp_ggml_backend_buffer_t buffer) {
 }

 void wsp_ggml_backend_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
+    WSP_GGML_ASSERT(buffer);
     buffer->usage = usage;

     // FIXME: add a generic callback to the buffer interface
@@ -170,14 +181,17 @@ void wsp_ggml_backend_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum ws
 }

 enum wsp_ggml_backend_buffer_usage wsp_ggml_backend_buffer_get_usage(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     return buffer->usage;
 }

 wsp_ggml_backend_buffer_type_t wsp_ggml_backend_buffer_get_type(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     return buffer->buft;
 }

 void wsp_ggml_backend_buffer_reset(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     if (buffer->iface.reset) {
         buffer->iface.reset(buffer);
     }
@@ -216,6 +230,7 @@ void wsp_ggml_backend_free(wsp_ggml_backend_t backend) {
 }

 wsp_ggml_backend_buffer_type_t wsp_ggml_backend_get_default_buffer_type(wsp_ggml_backend_t backend) {
+    WSP_GGML_ASSERT(backend);
     return wsp_ggml_backend_dev_buffer_type(backend->device);
 }

@@ -232,6 +247,8 @@ size_t wsp_ggml_backend_get_max_size(wsp_ggml_backend_t backend) {
 }

 void wsp_ggml_backend_tensor_set_async(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    WSP_GGML_ASSERT(backend);
+    WSP_GGML_ASSERT(tensor);
     WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");

@@ -243,6 +260,8 @@ void wsp_ggml_backend_tensor_set_async(wsp_ggml_backend_t backend, struct wsp_gg
 }

 void wsp_ggml_backend_tensor_get_async(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    WSP_GGML_ASSERT(backend);
+    WSP_GGML_ASSERT(tensor);
     WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");

@@ -284,6 +303,7 @@ void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * d
 }

 void wsp_ggml_backend_tensor_memset(struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    WSP_GGML_ASSERT(tensor);
     wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

     if (size == 0) {
@@ -299,6 +319,7 @@ void wsp_ggml_backend_tensor_memset(struct wsp_ggml_tensor * tensor, uint8_t val
 }

 void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
+    WSP_GGML_ASSERT(backend);
     if (backend->iface.synchronize == NULL) {
         return;
     }
@@ -307,18 +328,21 @@ void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
 }

 wsp_ggml_backend_graph_plan_t wsp_ggml_backend_graph_plan_create(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
+    WSP_GGML_ASSERT(backend);
     WSP_GGML_ASSERT(backend->iface.graph_plan_create != NULL);

     return backend->iface.graph_plan_create(backend, cgraph);
 }

 void wsp_ggml_backend_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
+    WSP_GGML_ASSERT(backend);
     WSP_GGML_ASSERT(backend->iface.graph_plan_free != NULL);

     backend->iface.graph_plan_free(backend, plan);
 }

 enum wsp_ggml_status wsp_ggml_backend_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
+    WSP_GGML_ASSERT(backend);
     WSP_GGML_ASSERT(backend->iface.graph_plan_compute != NULL);

     return backend->iface.graph_plan_compute(backend, plan);
@@ -331,22 +355,27 @@ enum wsp_ggml_status wsp_ggml_backend_graph_compute(wsp_ggml_backend_t backend,
 }

 enum wsp_ggml_status wsp_ggml_backend_graph_compute_async(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
+    WSP_GGML_ASSERT(backend);
     return backend->iface.graph_compute(backend, cgraph);
 }

 bool wsp_ggml_backend_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
+    WSP_GGML_ASSERT(backend);
     return wsp_ggml_backend_dev_supports_op(backend->device, op);
 }

 bool wsp_ggml_backend_supports_buft(wsp_ggml_backend_t backend, wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(backend);
     return wsp_ggml_backend_dev_supports_buft(backend->device, buft);
 }

 bool wsp_ggml_backend_offload_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
+    WSP_GGML_ASSERT(backend);
     return wsp_ggml_backend_dev_offload_op(backend->device, op);
 }

 wsp_ggml_backend_dev_t wsp_ggml_backend_get_device(wsp_ggml_backend_t backend) {
+    WSP_GGML_ASSERT(backend);
     return backend->device;
 }

@@ -382,6 +411,7 @@ void wsp_ggml_backend_tensor_copy_async(wsp_ggml_backend_t backend_src, wsp_ggml
         return;
     }

+    WSP_GGML_ASSERT(backend_dst);
     if (backend_dst->iface.cpy_tensor_async != NULL) {
         if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
             return;
@@ -413,38 +443,52 @@ void wsp_ggml_backend_event_free(wsp_ggml_backend_event_t event) {
 }

 void wsp_ggml_backend_event_record(wsp_ggml_backend_event_t event, wsp_ggml_backend_t backend) {
+    WSP_GGML_ASSERT(backend);
     WSP_GGML_ASSERT(backend->iface.event_record != NULL);

     backend->iface.event_record(backend, event);
 }

 void wsp_ggml_backend_event_synchronize(wsp_ggml_backend_event_t event) {
+    WSP_GGML_ASSERT(event);
     WSP_GGML_ASSERT(event->device->iface.event_synchronize);

     event->device->iface.event_synchronize(event->device, event);
 }

 void wsp_ggml_backend_event_wait(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event) {
+    WSP_GGML_ASSERT(backend);
     WSP_GGML_ASSERT(backend->iface.event_wait != NULL);

     backend->iface.event_wait(backend, event);
 }

+static void wsp_ggml_backend_graph_optimize(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
+    WSP_GGML_ASSERT(backend);
+    if (backend->iface.graph_optimize != NULL) {
+        backend->iface.graph_optimize(backend, cgraph);
+    }
+}
+
 // Backend device

 const char * wsp_ggml_backend_dev_name(wsp_ggml_backend_dev_t device) {
+    WSP_GGML_ASSERT(device);
     return device->iface.get_name(device);
 }

 const char * wsp_ggml_backend_dev_description(wsp_ggml_backend_dev_t device) {
+    WSP_GGML_ASSERT(device);
     return device->iface.get_description(device);
 }

 void wsp_ggml_backend_dev_memory(wsp_ggml_backend_dev_t device, size_t * free, size_t * total) {
+    WSP_GGML_ASSERT(device);
     device->iface.get_memory(device, free, total);
 }

 enum wsp_ggml_backend_dev_type wsp_ggml_backend_dev_type(wsp_ggml_backend_dev_t device) {
+    WSP_GGML_ASSERT(device);
     return device->iface.get_type(device);
 }

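Nearly every hunk above makes the same change: each public entry point in ggml-backend.cpp now asserts its handle argument before dereferencing it, so a null `buft`, `buffer`, `backend`, `event`, or `device` aborts with a clear diagnostic at the API boundary instead of crashing somewhere deeper in the call chain. A minimal sketch of the pattern — `MY_ASSERT` and `device_t` are hypothetical stand-ins for `WSP_GGML_ASSERT` and the real backend types:

```cpp
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for WSP_GGML_ASSERT: report the failed condition and
// abort, rather than letting a null pointer be dereferenced later.
#define MY_ASSERT(x)                                                \
    do {                                                            \
        if (!(x)) {                                                 \
            std::fprintf(stderr, "assert failed: %s (%s:%d)\n",     \
                         #x, __FILE__, __LINE__);                   \
            std::abort();                                           \
        }                                                           \
    } while (0)

struct device_t {
    const char * name;
};

// The pattern repeated throughout the diff: validate the handle at the
// public API boundary, then dereference it.
const char * device_name(const device_t * device) {
    MY_ASSERT(device);
    return device->name;
}

int main() {
    device_t dev = { "cpu" };
    std::printf("%s\n", device_name(&dev)); // prints "cpu"
    // device_name(nullptr);                // would abort with a diagnostic
}
```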
@@ -454,18 +498,22 @@ void wsp_ggml_backend_dev_get_props(wsp_ggml_backend_dev_t device, struct wsp_gg
 }

 wsp_ggml_backend_reg_t wsp_ggml_backend_dev_backend_reg(wsp_ggml_backend_dev_t device) {
+    WSP_GGML_ASSERT(device);
     return device->reg;
 }

 wsp_ggml_backend_t wsp_ggml_backend_dev_init(wsp_ggml_backend_dev_t device, const char * params) {
+    WSP_GGML_ASSERT(device);
     return device->iface.init_backend(device, params);
 }

 wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_buffer_type(wsp_ggml_backend_dev_t device) {
+    WSP_GGML_ASSERT(device);
     return device->iface.get_buffer_type(device);
 }

 wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_host_buffer_type(wsp_ggml_backend_dev_t device) {
+    WSP_GGML_ASSERT(device);
     if (device->iface.get_host_buffer_type == NULL) {
         return NULL;
     }
@@ -474,18 +522,22 @@ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_host_buffer_type(wsp_ggml_ba
 }

 wsp_ggml_backend_buffer_t wsp_ggml_backend_dev_buffer_from_host_ptr(wsp_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+    WSP_GGML_ASSERT(device);
     return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
 }

 bool wsp_ggml_backend_dev_supports_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
+    WSP_GGML_ASSERT(device);
     return device->iface.supports_op(device, op);
 }

 bool wsp_ggml_backend_dev_supports_buft(wsp_ggml_backend_dev_t device, wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(device);
     return device->iface.supports_buft(device, buft);
 }

 bool wsp_ggml_backend_dev_offload_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
+    WSP_GGML_ASSERT(device);
     if (device->iface.offload_op != NULL) {
         return device->iface.offload_op(device, op);
     }
@@ -496,18 +548,22 @@ bool wsp_ggml_backend_dev_offload_op(wsp_ggml_backend_dev_t device, const struct
 // Backend (reg)

 const char * wsp_ggml_backend_reg_name(wsp_ggml_backend_reg_t reg) {
+    WSP_GGML_ASSERT(reg);
     return reg->iface.get_name(reg);
 }

 size_t wsp_ggml_backend_reg_dev_count(wsp_ggml_backend_reg_t reg) {
+    WSP_GGML_ASSERT(reg);
     return reg->iface.get_device_count(reg);
 }

 wsp_ggml_backend_dev_t wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_reg_t reg, size_t index) {
+    WSP_GGML_ASSERT(reg);
     return reg->iface.get_device(reg, index);
 }

 void * wsp_ggml_backend_reg_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
+    WSP_GGML_ASSERT(reg);
     if (!reg->iface.get_proc_address) {
         return NULL;
     }
@@ -522,6 +578,7 @@ struct wsp_ggml_backend_multi_buffer_context {
 };

 static void wsp_ggml_backend_multi_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         wsp_ggml_backend_buffer_free(ctx->buffers[i]);
@@ -532,6 +589,7 @@ static void wsp_ggml_backend_multi_buffer_free_buffer(wsp_ggml_backend_buffer_t
 }

 static void wsp_ggml_backend_multi_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+    WSP_GGML_ASSERT(buffer);
     wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         wsp_ggml_backend_buffer_clear(ctx->buffers[i], value);
@@ -567,10 +625,12 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_multi_buffer_alloc_buffer(wsp_ggml_ba
 }

 bool wsp_ggml_backend_buffer_is_multi_buffer(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     return buffer->iface.free_buffer == wsp_ggml_backend_multi_buffer_free_buffer;
 }

 void wsp_ggml_backend_multi_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
+    WSP_GGML_ASSERT(buffer);
     WSP_GGML_ASSERT(wsp_ggml_backend_buffer_is_multi_buffer(buffer));
     wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -598,7 +658,7 @@ static bool wsp_ggml_is_view_op(enum wsp_ggml_op op) {
 #endif

 #ifndef WSP_GGML_SCHED_MAX_SPLIT_INPUTS
-#define WSP_GGML_SCHED_MAX_SPLIT_INPUTS
+#define WSP_GGML_SCHED_MAX_SPLIT_INPUTS 30
 #endif

 #ifndef WSP_GGML_SCHED_MAX_COPIES
@@ -849,7 +909,7 @@ static void wsp_ggml_backend_sched_set_if_supported(wsp_ggml_backend_sched_t sch
 }

 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
+void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
     sched->n_graph_inputs = 0;
@@ -1245,6 +1305,10 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s
         struct wsp_ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = wsp_ggml_graph_view(graph, split->i_start, split->i_end);

+        // Optimize this split of the graph. This needs to happen before we make graph_copy,
+        // so they are in sync.
+        wsp_ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
+
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             assert(graph_copy->size > (graph_copy->n_nodes + 1));
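Two scheduler changes run through the hunks above: `wsp_ggml_backend_sched_split_graph` loses its `static` qualifier and becomes callable from outside this file, and each split's subgraph is now passed through the new `wsp_ggml_backend_graph_optimize` wrapper before the graph copy is built, so the copy and the optimized split stay in sync. The wrapper follows ggml's usual convention for optional backend-interface entries: a nullable function pointer that is invoked only if the backend provides it. A minimal sketch of that convention, with hypothetical `backend_iface`/`cgraph` types:

```cpp
#include <cstdio>

struct cgraph; // opaque compute-graph type for this sketch

// Optional entry in a backend's interface table: nullptr means "not supported".
struct backend_iface {
    void (*graph_optimize)(cgraph * g); // may be nullptr
};

struct backend {
    backend_iface iface;
};

// Mirrors the new wrapper in the diff: call the hook only when the backend
// defines it, otherwise do nothing.
static void graph_optimize(backend * b, cgraph * g) {
    if (b->iface.graph_optimize != nullptr) {
        b->iface.graph_optimize(g);
    }
}

int main() {
    backend cpu = { { nullptr } };                                  // no optimizer hook
    backend gpu = { { [](cgraph *) { std::puts("optimized"); } } }; // provides the hook
    graph_optimize(&cpu, nullptr); // no-op
    graph_optimize(&gpu, nullptr); // prints "optimized"
}
```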
@@ -1350,17 +1414,22 @@ static bool wsp_ggml_backend_sched_alloc_splits(wsp_ggml_backend_sched_t sched)
 }

 static enum wsp_ggml_status wsp_ggml_backend_sched_compute_splits(wsp_ggml_backend_sched_t sched) {
+    WSP_GGML_ASSERT(sched);
     struct wsp_ggml_backend_sched_split * splits = sched->splits;

-    for (int i = 0; i < sched->n_splits; i++) {
-        struct wsp_ggml_backend_sched_split * split = &splits[i];
+    wsp_ggml_tensor * prev_ids_tensor = nullptr;
+    std::vector<int32_t> ids;
+    std::vector<wsp_ggml_bitset_t> used_ids;
+
+    for (int split_id = 0; split_id < sched->n_splits; split_id++) {
+        struct wsp_ggml_backend_sched_split * split = &splits[split_id];
         int split_backend_id = split->backend_id;
         wsp_ggml_backend_t split_backend = sched->backends[split_backend_id];

         // copy the input tensors to the split backend
-        for (int j = 0; j < split->n_inputs; j++) {
-            wsp_ggml_backend_t input_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
-            struct wsp_ggml_tensor * input = split->inputs[j];
+        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
+            wsp_ggml_backend_t input_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
+            struct wsp_ggml_tensor * input = split->inputs[input_id];
             struct wsp_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);

             if (input->flags & WSP_GGML_TENSOR_FLAG_INPUT) {
@@ -1378,16 +1447,104 @@ static enum wsp_ggml_status wsp_ggml_backend_sched_compute_splits(wsp_ggml_backe
                 } else {
                     wsp_ggml_backend_synchronize(split_backend);
                 }
-
-                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
-                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+
+                // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
+                wsp_ggml_tensor * node = split->graph.nodes[0];
+                if (split->graph.n_nodes > 0 &&
+                    wsp_ggml_backend_buffer_get_usage(input->buffer) == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
+                    wsp_ggml_backend_buffer_is_host(input->buffer) && (
+                        (node->src[0] == input_cpy && node->op == WSP_GGML_OP_MUL_MAT_ID)
+                        //|| (node->src[1] == input_cpy && node->op == WSP_GGML_OP_ADD_ID) /* WSP_GGML_OP_ADD_ID weights are small and not worth splitting */
+                    )) {
+
+                    const int64_t n_expert    = node->op == WSP_GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
+                    const size_t  expert_size = node->op == WSP_GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
+
                     wsp_ggml_backend_synchronize(input_backend);
-                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                        wsp_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                    } else {
-                        wsp_ggml_backend_synchronize(split_backend);
+
+                    // get the ids
+                    wsp_ggml_tensor * ids_tensor = node->src[2];
+                    wsp_ggml_backend_t ids_backend = split_backend;
+
+                    // if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
+                    // in that case, we use the original ids tensor
+                    for (int i = input_id + 1; i < split->n_inputs; i++) {
+                        if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
+                            ids_tensor = split->inputs[i];
+                            ids_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
+                            break;
+                        }
+                    }
+
+                    if (ids_tensor != prev_ids_tensor) {
+                        ids.resize(wsp_ggml_nbytes(ids_tensor) / sizeof(int32_t));
+                        wsp_ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, wsp_ggml_nbytes(ids_tensor));
+                        wsp_ggml_backend_synchronize(ids_backend);
+
+                        // find the used experts
+                        used_ids.clear();
+                        used_ids.resize(wsp_ggml_bitset_size(n_expert));
+                        for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
+                            for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
+                                int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
+                                WSP_GGML_ASSERT(id >= 0 && id < n_expert);
+                                wsp_ggml_bitset_set(used_ids.data(), id);
+                            }
+                        }
+
+                        prev_ids_tensor = ids_tensor;
+                    }
+
+                    // group consecutive experts and copy them together
+                    auto copy_experts = [&](int32_t first_id, int32_t last_id) {
+                        const size_t expert_offset = first_id * expert_size;
+                        const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
+                        const size_t padding = std::min<size_t>(expert_size, 512);
+                        const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
+
+                        wsp_ggml_backend_tensor_set_async(split_backend,
+                            input_cpy,
+                            (const uint8_t *)input->data + expert_offset, expert_offset,
+                            // copy a bit extra at the to ensure there are no NaNs in the padding of the last expert
+                            // this is necessary for MMQ in the CUDA backend
+                            expert_size_copy + padding_end);
+                    };
+
+                    int id = 0;
+                    while (!wsp_ggml_bitset_get(used_ids.data(), id)) {
+                        id++;
+                    }
+                    int32_t first_id = id;
+                    int32_t last_id = first_id;
+
+                    for (++id; id < n_expert; ++id) {
+                        if (!wsp_ggml_bitset_get(used_ids.data(), id)) {
+                            continue;
+                        }
+
+                        if (id == last_id + 1) {
+                            last_id = id;
+                            continue;
+                        }
+
+                        copy_experts(first_id, last_id);
+
+                        first_id = id;
+                        last_id = id;
+                    }
+                    copy_experts(first_id, last_id);
+                } else {
+                    // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                    // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                    if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                        wsp_ggml_backend_synchronize(input_backend);
+                        if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                            wsp_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                        } else {
+                            wsp_ggml_backend_synchronize(split_backend);
+                        }
+                        wsp_ggml_backend_tensor_copy(input, input_cpy);
                     }
-                    wsp_ggml_backend_tensor_copy(input, input_cpy);
                 }
             }
         }
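The bulk of the hunk above is the MoE input-copy optimization: when a host-resident expert-weight tensor feeds a `WSP_GGML_OP_MUL_MAT_ID` node, the scheduler reads the ids tensor once, marks every referenced expert in a bitset, and then copies consecutive runs of used experts with a single transfer per run instead of copying the whole tensor. The run-grouping step is the non-obvious part; here is a self-contained sketch of it, using `std::vector<bool>` in place of `wsp_ggml_bitset_t` and a printf in place of the real async copy:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Issue one "copy" per run of consecutive used expert ids, mirroring the
// copy_experts lambda in the diff (here we only print the run).
static void copy_runs(const std::vector<bool> & used, size_t expert_size) {
    const int64_t n_expert = (int64_t) used.size();

    int64_t id = 0;
    while (id < n_expert && !used[id]) {
        id++;
    }
    if (id == n_expert) {
        return; // no experts used (the real code assumes at least one)
    }

    int64_t first_id = id;
    int64_t last_id  = first_id;

    auto copy_experts = [&](int64_t first, int64_t last) {
        std::printf("copy experts [%lld, %lld]: offset=%zu size=%zu\n",
                    (long long) first, (long long) last,
                    (size_t) (first * expert_size),
                    (size_t) ((last - first + 1) * expert_size));
    };

    for (++id; id < n_expert; ++id) {
        if (!used[id]) {
            continue;
        }
        if (id == last_id + 1) {
            last_id = id; // extend the current run
            continue;
        }
        copy_experts(first_id, last_id); // flush the finished run
        first_id = id;
        last_id  = id;
    }
    copy_experts(first_id, last_id); // flush the final run
}

int main() {
    // Experts 1..3 and 6 are used: expect two copies, [1, 3] and [6, 6].
    std::vector<bool> used = { false, true, true, true, false, false, true, false };
    copy_runs(used, /*expert_size=*/4096);
}
```

The real code additionally pads each transfer by up to 512 extra bytes past the run (unless the run ends at the final expert) so that the padding of the last copied expert contains no NaNs, which the diff's comment notes is needed for MMQ in the CUDA backend.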
@@ -1526,6 +1683,7 @@ void wsp_ggml_backend_sched_free(wsp_ggml_backend_sched_t sched) {
 }

 void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
+    WSP_GGML_ASSERT(sched);
     // reset state for the next run
     if (!sched->is_reset) {
         wsp_ggml_hash_set_reset(&sched->hash_set);
@@ -1537,8 +1695,11 @@ void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
 }

 bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph) {
+    WSP_GGML_ASSERT(sched);
     WSP_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

+    wsp_ggml_backend_sched_reset(sched);
+
     wsp_ggml_backend_sched_synchronize(sched);

     wsp_ggml_backend_sched_split_graph(sched, measure_graph);
@@ -1553,6 +1714,7 @@ bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_g
 }

 bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
+    WSP_GGML_ASSERT(sched);
     WSP_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
     WSP_GGML_ASSERT(!sched->is_alloc);

@@ -1577,6 +1739,7 @@ enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched
 }

 enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute_async(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
+    WSP_GGML_ASSERT(sched);
     if (!sched->is_reset && !sched->is_alloc) {
         wsp_ggml_backend_sched_reset(sched);
     }
@@ -1591,6 +1754,7 @@ enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute_async(wsp_ggml_backend
 }

 void wsp_ggml_backend_sched_synchronize(wsp_ggml_backend_sched_t sched) {
+    WSP_GGML_ASSERT(sched);
     for (int i = 0; i < sched->n_backends; i++) {
         wsp_ggml_backend_synchronize(sched->backends[i]);
     }
@@ -1603,28 +1767,42 @@ void wsp_ggml_backend_sched_synchronize(wsp_ggml_backend_sched_t sched) {
 }

 void wsp_ggml_backend_sched_set_eval_callback(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_sched_eval_callback callback, void * user_data) {
+    WSP_GGML_ASSERT(sched);
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
 }

 int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched) {
+    WSP_GGML_ASSERT(sched);
     return sched->n_splits;
 }

 int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched) {
+    WSP_GGML_ASSERT(sched);
     return sched->n_copies;
 }

 int wsp_ggml_backend_sched_get_n_backends(wsp_ggml_backend_sched_t sched) {
+    WSP_GGML_ASSERT(sched);
     return sched->n_backends;
 }

 wsp_ggml_backend_t wsp_ggml_backend_sched_get_backend(wsp_ggml_backend_sched_t sched, int i) {
+    WSP_GGML_ASSERT(sched);
     WSP_GGML_ASSERT(i >= 0 && i < sched->n_backends);
     return sched->backends[i];
 }

+wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
+    WSP_GGML_ASSERT(sched);
+    int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
+    WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    return sched->bufts[backend_index];
+}
+
 size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
+    WSP_GGML_ASSERT(sched);
     int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
     WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

@@ -1632,6 +1810,7 @@ size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, ws
 }

 void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend) {
+    WSP_GGML_ASSERT(sched);
     int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
     WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
@@ -1640,6 +1819,7 @@ void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, s
 }

 wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
+    WSP_GGML_ASSERT(sched);
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
         return NULL;
@@ -1650,6 +1830,7 @@ wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sc
 // utils

 enum wsp_ggml_status wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor) {
+    WSP_GGML_ASSERT(tensor);
     WSP_GGML_ASSERT(tensor->buffer == NULL);
     WSP_GGML_ASSERT(tensor->view_src != NULL);
     WSP_GGML_ASSERT(tensor->view_src->buffer != NULL);
@@ -1661,6 +1842,7 @@ enum wsp_ggml_status wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor)
 }

 enum wsp_ggml_status wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr) {
+    WSP_GGML_ASSERT(tensor);
     WSP_GGML_ASSERT(tensor->buffer == NULL);
     WSP_GGML_ASSERT(tensor->data == NULL);
     WSP_GGML_ASSERT(tensor->view_src == NULL);
@@ -1734,6 +1916,7 @@ static void graph_copy_init_tensor(struct wsp_ggml_hash_set * hash_set, struct w
 }

 struct wsp_ggml_backend_graph_copy wsp_ggml_backend_graph_copy(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * graph) {
+    WSP_GGML_ASSERT(graph);
     struct wsp_ggml_hash_set hash_set = wsp_ggml_hash_set_new(graph->visited_hash_set.size);
     struct wsp_ggml_tensor ** node_copies = (wsp_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
     bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
@@ -1878,6 +2061,7 @@ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggm
 // CPU backend - buffer

 static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     uintptr_t data = (uintptr_t)buffer->context;

     // align the buffer
@@ -1889,28 +2073,33 @@ static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buf
 }

 static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
+    WSP_GGML_ASSERT(buffer);
     wsp_ggml_aligned_free(buffer->context, buffer->size);
 }

 static void wsp_ggml_backend_cpu_buffer_memset_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    WSP_GGML_ASSERT(tensor);
     memset((char *)tensor->data + offset, value, size);

     WSP_GGML_UNUSED(buffer);
 }

 static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    WSP_GGML_ASSERT(tensor);
     memcpy((char *)tensor->data + offset, data, size);

     WSP_GGML_UNUSED(buffer);
 }

 static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    WSP_GGML_ASSERT(tensor);
     memcpy(data, (const char *)tensor->data + offset, size);

     WSP_GGML_UNUSED(buffer);
 }

 static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
+    WSP_GGML_ASSERT(src);
     if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
         return true;
@@ -1921,6 +2110,7 @@ static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buf
 }

 static void wsp_ggml_backend_cpu_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+    WSP_GGML_ASSERT(buffer);
     memset(buffer->context, value, buffer->size);
 }
