whisper.rn 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +264 -126
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +13 -5
  6. package/cpp/ggml-backend.cpp +207 -17
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  9. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  10. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  11. package/cpp/ggml-cpu/common.h +14 -0
  12. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  13. package/cpp/ggml-cpu/ggml-cpu.c +48 -41
  14. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  15. package/cpp/ggml-cpu/ops.cpp +518 -767
  16. package/cpp/ggml-cpu/ops.h +2 -0
  17. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  18. package/cpp/ggml-cpu/vec.cpp +161 -20
  19. package/cpp/ggml-cpu/vec.h +400 -51
  20. package/cpp/ggml-cpu.h +1 -1
  21. package/cpp/ggml-impl.h +43 -10
  22. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  23. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  24. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  25. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  26. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  27. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  28. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  29. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  30. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  31. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  32. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  33. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  34. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  35. package/cpp/ggml-metal-impl.h +40 -40
  36. package/cpp/ggml-metal.h +1 -6
  37. package/cpp/ggml-quants.c +1 -0
  38. package/cpp/ggml.c +175 -13
  39. package/cpp/ggml.h +84 -5
  40. package/cpp/jsi/RNWhisperJSI.cpp +2 -0
  41. package/cpp/jsi/ThreadPool.h +3 -3
  42. package/cpp/whisper.cpp +85 -70
  43. package/cpp/whisper.h +1 -0
  44. package/ios/CMakeLists.txt +6 -1
  45. package/ios/RNWhisperVadContext.mm +14 -13
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  50. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  59. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  84. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  86. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  87. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  92. package/lib/commonjs/version.json +1 -1
  93. package/lib/module/version.json +1 -1
  94. package/package.json +1 -1
  95. package/src/version.json +1 -1
  96. package/whisper-rn.podspec +8 -9
  97. package/cpp/ggml-metal.m +0 -6779
  98. package/cpp/ggml-whisper-sim.metallib +0 -0
  99. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-backend.cpp

@@ -19,9 +19,8 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
- #include <string>
- #include <vector>
  #include <algorithm>
+ #include <vector>

  #ifdef __APPLE__
  #include <sys/types.h>
@@ -32,6 +31,7 @@
  // backend buffer type

  const char * wsp_ggml_backend_buft_name(wsp_ggml_backend_buffer_type_t buft) {
+ WSP_GGML_ASSERT(buft);
  return buft->iface.get_name(buft);
  }

@@ -41,14 +41,17 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_bu
  return wsp_ggml_backend_buffer_init(buft, {}, NULL, 0);
  }

+ WSP_GGML_ASSERT(buft);
  return buft->iface.alloc_buffer(buft, size);
  }

  size_t wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
+ WSP_GGML_ASSERT(buft);
  return buft->iface.get_alignment(buft);
  }

  size_t wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_type_t buft) {
+ WSP_GGML_ASSERT(buft);
  // get_max_size is optional, defaults to SIZE_MAX
  if (buft->iface.get_max_size) {
  return buft->iface.get_max_size(buft);
@@ -57,6 +60,7 @@ size_t wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_type_t buft) {
  }

  size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_tensor * tensor) {
+ WSP_GGML_ASSERT(buft);
  // get_alloc_size is optional, defaults to wsp_ggml_nbytes
  if (buft->iface.get_alloc_size) {
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -67,6 +71,7 @@ size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft,
  }

  bool wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_type_t buft) {
+ WSP_GGML_ASSERT(buft);
  if (buft->iface.is_host) {
  return buft->iface.is_host(buft);
  }
@@ -74,6 +79,7 @@ bool wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_type_t buft) {
  }

  wsp_ggml_backend_dev_t wsp_ggml_backend_buft_get_device(wsp_ggml_backend_buffer_type_t buft) {
+ WSP_GGML_ASSERT(buft);
  return buft->device;
  }

@@ -111,10 +117,12 @@ void wsp_ggml_backend_buffer_free(wsp_ggml_backend_buffer_t buffer) {
  }

  size_t wsp_ggml_backend_buffer_get_size(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  return buffer->size;
  }

  void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  // get_base is optional if the buffer is zero-sized
  if (buffer->size == 0) {
  return NULL;
@@ -128,6 +136,7 @@ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
  }

  enum wsp_ggml_status wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
+ WSP_GGML_ASSERT(buffer);
  // init_tensor is optional
  if (buffer->iface.init_tensor) {
  return buffer->iface.init_tensor(buffer, tensor);
@@ -136,6 +145,7 @@ enum wsp_ggml_status wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer
  }

  void wsp_ggml_backend_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+ WSP_GGML_ASSERT(buffer);
  // clear is optional if the buffer is zero-sized
  if (buffer->size == 0) {
  return;
@@ -161,6 +171,7 @@ bool wsp_ggml_backend_buffer_is_host(wsp_ggml_backend_buffer_t buffer) {
  }

  void wsp_ggml_backend_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
+ WSP_GGML_ASSERT(buffer);
  buffer->usage = usage;

  // FIXME: add a generic callback to the buffer interface
@@ -170,14 +181,17 @@ void wsp_ggml_backend_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum ws
  }

  enum wsp_ggml_backend_buffer_usage wsp_ggml_backend_buffer_get_usage(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  return buffer->usage;
  }

  wsp_ggml_backend_buffer_type_t wsp_ggml_backend_buffer_get_type(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  return buffer->buft;
  }

  void wsp_ggml_backend_buffer_reset(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  if (buffer->iface.reset) {
  buffer->iface.reset(buffer);
  }
@@ -216,6 +230,7 @@ void wsp_ggml_backend_free(wsp_ggml_backend_t backend) {
  }

  wsp_ggml_backend_buffer_type_t wsp_ggml_backend_get_default_buffer_type(wsp_ggml_backend_t backend) {
+ WSP_GGML_ASSERT(backend);
  return wsp_ggml_backend_dev_buffer_type(backend->device);
  }

@@ -232,6 +247,8 @@ size_t wsp_ggml_backend_get_max_size(wsp_ggml_backend_t backend) {
  }

  void wsp_ggml_backend_tensor_set_async(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(backend);
+ WSP_GGML_ASSERT(tensor);
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");

@@ -243,6 +260,8 @@ void wsp_ggml_backend_tensor_set_async(wsp_ggml_backend_t backend, struct wsp_gg
  }

  void wsp_ggml_backend_tensor_get_async(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(backend);
+ WSP_GGML_ASSERT(tensor);
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");

@@ -284,6 +303,7 @@ void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * d
  }

  void wsp_ggml_backend_tensor_memset(struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor);
  wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

  if (size == 0) {
@@ -299,6 +319,7 @@ void wsp_ggml_backend_tensor_memset(struct wsp_ggml_tensor * tensor, uint8_t val
  }

  void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
+ WSP_GGML_ASSERT(backend);
  if (backend->iface.synchronize == NULL) {
  return;
  }
@@ -307,18 +328,21 @@ void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
  }

  wsp_ggml_backend_graph_plan_t wsp_ggml_backend_graph_plan_create(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
+ WSP_GGML_ASSERT(backend);
  WSP_GGML_ASSERT(backend->iface.graph_plan_create != NULL);

  return backend->iface.graph_plan_create(backend, cgraph);
  }

  void wsp_ggml_backend_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
+ WSP_GGML_ASSERT(backend);
  WSP_GGML_ASSERT(backend->iface.graph_plan_free != NULL);

  backend->iface.graph_plan_free(backend, plan);
  }

  enum wsp_ggml_status wsp_ggml_backend_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
+ WSP_GGML_ASSERT(backend);
  WSP_GGML_ASSERT(backend->iface.graph_plan_compute != NULL);

  return backend->iface.graph_plan_compute(backend, plan);
@@ -331,22 +355,27 @@ enum wsp_ggml_status wsp_ggml_backend_graph_compute(wsp_ggml_backend_t backend,
  }

  enum wsp_ggml_status wsp_ggml_backend_graph_compute_async(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
+ WSP_GGML_ASSERT(backend);
  return backend->iface.graph_compute(backend, cgraph);
  }

  bool wsp_ggml_backend_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
+ WSP_GGML_ASSERT(backend);
  return wsp_ggml_backend_dev_supports_op(backend->device, op);
  }

  bool wsp_ggml_backend_supports_buft(wsp_ggml_backend_t backend, wsp_ggml_backend_buffer_type_t buft) {
+ WSP_GGML_ASSERT(backend);
  return wsp_ggml_backend_dev_supports_buft(backend->device, buft);
  }

  bool wsp_ggml_backend_offload_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
+ WSP_GGML_ASSERT(backend);
  return wsp_ggml_backend_dev_offload_op(backend->device, op);
  }

  wsp_ggml_backend_dev_t wsp_ggml_backend_get_device(wsp_ggml_backend_t backend) {
+ WSP_GGML_ASSERT(backend);
  return backend->device;
  }

@@ -382,6 +411,7 @@ void wsp_ggml_backend_tensor_copy_async(wsp_ggml_backend_t backend_src, wsp_ggml
  return;
  }

+ WSP_GGML_ASSERT(backend_dst);
  if (backend_dst->iface.cpy_tensor_async != NULL) {
  if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
  return;
@@ -413,38 +443,52 @@ void wsp_ggml_backend_event_free(wsp_ggml_backend_event_t event) {
  }

  void wsp_ggml_backend_event_record(wsp_ggml_backend_event_t event, wsp_ggml_backend_t backend) {
+ WSP_GGML_ASSERT(backend);
  WSP_GGML_ASSERT(backend->iface.event_record != NULL);

  backend->iface.event_record(backend, event);
  }

  void wsp_ggml_backend_event_synchronize(wsp_ggml_backend_event_t event) {
+ WSP_GGML_ASSERT(event);
  WSP_GGML_ASSERT(event->device->iface.event_synchronize);

  event->device->iface.event_synchronize(event->device, event);
  }

  void wsp_ggml_backend_event_wait(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event) {
+ WSP_GGML_ASSERT(backend);
  WSP_GGML_ASSERT(backend->iface.event_wait != NULL);

  backend->iface.event_wait(backend, event);
  }

+ static void wsp_ggml_backend_graph_optimize(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
+ WSP_GGML_ASSERT(backend);
+ if (backend->iface.graph_optimize != NULL) {
+ backend->iface.graph_optimize(backend, cgraph);
+ }
+ }
+
  // Backend device

  const char * wsp_ggml_backend_dev_name(wsp_ggml_backend_dev_t device) {
+ WSP_GGML_ASSERT(device);
  return device->iface.get_name(device);
  }

  const char * wsp_ggml_backend_dev_description(wsp_ggml_backend_dev_t device) {
+ WSP_GGML_ASSERT(device);
  return device->iface.get_description(device);
  }

  void wsp_ggml_backend_dev_memory(wsp_ggml_backend_dev_t device, size_t * free, size_t * total) {
+ WSP_GGML_ASSERT(device);
  device->iface.get_memory(device, free, total);
  }

  enum wsp_ggml_backend_dev_type wsp_ggml_backend_dev_type(wsp_ggml_backend_dev_t device) {
+ WSP_GGML_ASSERT(device);
  return device->iface.get_type(device);
  }

@@ -454,18 +498,22 @@ void wsp_ggml_backend_dev_get_props(wsp_ggml_backend_dev_t device, struct wsp_gg
  }

  wsp_ggml_backend_reg_t wsp_ggml_backend_dev_backend_reg(wsp_ggml_backend_dev_t device) {
+ WSP_GGML_ASSERT(device);
  return device->reg;
  }

  wsp_ggml_backend_t wsp_ggml_backend_dev_init(wsp_ggml_backend_dev_t device, const char * params) {
+ WSP_GGML_ASSERT(device);
  return device->iface.init_backend(device, params);
  }

  wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_buffer_type(wsp_ggml_backend_dev_t device) {
+ WSP_GGML_ASSERT(device);
  return device->iface.get_buffer_type(device);
  }

  wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_host_buffer_type(wsp_ggml_backend_dev_t device) {
+ WSP_GGML_ASSERT(device);
  if (device->iface.get_host_buffer_type == NULL) {
  return NULL;
  }
@@ -474,18 +522,22 @@ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_host_buffer_type(wsp_ggml_ba
  }

  wsp_ggml_backend_buffer_t wsp_ggml_backend_dev_buffer_from_host_ptr(wsp_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+ WSP_GGML_ASSERT(device);
  return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
  }

  bool wsp_ggml_backend_dev_supports_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
+ WSP_GGML_ASSERT(device);
  return device->iface.supports_op(device, op);
  }

  bool wsp_ggml_backend_dev_supports_buft(wsp_ggml_backend_dev_t device, wsp_ggml_backend_buffer_type_t buft) {
+ WSP_GGML_ASSERT(device);
  return device->iface.supports_buft(device, buft);
  }

  bool wsp_ggml_backend_dev_offload_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
+ WSP_GGML_ASSERT(device);
  if (device->iface.offload_op != NULL) {
  return device->iface.offload_op(device, op);
  }
@@ -496,18 +548,22 @@ bool wsp_ggml_backend_dev_offload_op(wsp_ggml_backend_dev_t device, const struct
  // Backend (reg)

  const char * wsp_ggml_backend_reg_name(wsp_ggml_backend_reg_t reg) {
+ WSP_GGML_ASSERT(reg);
  return reg->iface.get_name(reg);
  }

  size_t wsp_ggml_backend_reg_dev_count(wsp_ggml_backend_reg_t reg) {
+ WSP_GGML_ASSERT(reg);
  return reg->iface.get_device_count(reg);
  }

  wsp_ggml_backend_dev_t wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_reg_t reg, size_t index) {
+ WSP_GGML_ASSERT(reg);
  return reg->iface.get_device(reg, index);
  }

  void * wsp_ggml_backend_reg_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
+ WSP_GGML_ASSERT(reg);
  if (!reg->iface.get_proc_address) {
  return NULL;
  }
@@ -522,6 +578,7 @@ struct wsp_ggml_backend_multi_buffer_context {
  };

  static void wsp_ggml_backend_multi_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
  for (size_t i = 0; i < ctx->n_buffers; i++) {
  wsp_ggml_backend_buffer_free(ctx->buffers[i]);
@@ -532,6 +589,7 @@ static void wsp_ggml_backend_multi_buffer_free_buffer(wsp_ggml_backend_buffer_t
  }

  static void wsp_ggml_backend_multi_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+ WSP_GGML_ASSERT(buffer);
  wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
  for (size_t i = 0; i < ctx->n_buffers; i++) {
  wsp_ggml_backend_buffer_clear(ctx->buffers[i], value);
@@ -567,10 +625,12 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_multi_buffer_alloc_buffer(wsp_ggml_ba
  }

  bool wsp_ggml_backend_buffer_is_multi_buffer(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  return buffer->iface.free_buffer == wsp_ggml_backend_multi_buffer_free_buffer;
  }

  void wsp_ggml_backend_multi_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
+ WSP_GGML_ASSERT(buffer);
  WSP_GGML_ASSERT(wsp_ggml_backend_buffer_is_multi_buffer(buffer));
  wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
  for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -598,7 +658,7 @@ static bool wsp_ggml_is_view_op(enum wsp_ggml_op op) {
  #endif

  #ifndef WSP_GGML_SCHED_MAX_SPLIT_INPUTS
- #define WSP_GGML_SCHED_MAX_SPLIT_INPUTS WSP_GGML_MAX_SRC
+ #define WSP_GGML_SCHED_MAX_SPLIT_INPUTS 30
  #endif

  #ifndef WSP_GGML_SCHED_MAX_COPIES
@@ -849,7 +909,7 @@ static void wsp_ggml_backend_sched_set_if_supported(wsp_ggml_backend_sched_t sch
  }

  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
- static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
+ void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
  // reset splits
  sched->n_splits = 0;
  sched->n_graph_inputs = 0;
@@ -1245,6 +1305,10 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s
  struct wsp_ggml_backend_sched_split * split = &sched->splits[i];
  split->graph = wsp_ggml_graph_view(graph, split->i_start, split->i_end);

+ // Optimize this split of the graph. This needs to happen before we make graph_copy,
+ // so they are in sync.
+ wsp_ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
+
  // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
  for (int j = 0; j < split->n_inputs; j++) {
  assert(graph_copy->size > (graph_copy->n_nodes + 1));
@@ -1350,17 +1414,22 @@ static bool wsp_ggml_backend_sched_alloc_splits(wsp_ggml_backend_sched_t sched)
  }

  static enum wsp_ggml_status wsp_ggml_backend_sched_compute_splits(wsp_ggml_backend_sched_t sched) {
+ WSP_GGML_ASSERT(sched);
  struct wsp_ggml_backend_sched_split * splits = sched->splits;

- for (int i = 0; i < sched->n_splits; i++) {
- struct wsp_ggml_backend_sched_split * split = &splits[i];
+ wsp_ggml_tensor * prev_ids_tensor = nullptr;
+ std::vector<int32_t> ids;
+ std::vector<wsp_ggml_bitset_t> used_ids;
+
+ for (int split_id = 0; split_id < sched->n_splits; split_id++) {
+ struct wsp_ggml_backend_sched_split * split = &splits[split_id];
  int split_backend_id = split->backend_id;
  wsp_ggml_backend_t split_backend = sched->backends[split_backend_id];

  // copy the input tensors to the split backend
- for (int j = 0; j < split->n_inputs; j++) {
- wsp_ggml_backend_t input_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
- struct wsp_ggml_tensor * input = split->inputs[j];
+ for (int input_id = 0; input_id < split->n_inputs; input_id++) {
+ wsp_ggml_backend_t input_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
+ struct wsp_ggml_tensor * input = split->inputs[input_id];
  struct wsp_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);

  if (input->flags & WSP_GGML_TENSOR_FLAG_INPUT) {
@@ -1378,16 +1447,104 @@ static enum wsp_ggml_status wsp_ggml_backend_sched_compute_splits(wsp_ggml_backe
  } else {
  wsp_ggml_backend_synchronize(split_backend);
  }
- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
- if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+
+ // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
+ wsp_ggml_tensor * node = split->graph.nodes[0];
+ if (split->graph.n_nodes > 0 &&
+ wsp_ggml_backend_buffer_get_usage(input->buffer) == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
+ wsp_ggml_backend_buffer_is_host(input->buffer) && (
+ (node->src[0] == input_cpy && node->op == WSP_GGML_OP_MUL_MAT_ID)
+ //|| (node->src[1] == input_cpy && node->op == WSP_GGML_OP_ADD_ID) /* WSP_GGML_OP_ADD_ID weights are small and not worth splitting */
+ )) {
+
+ const int64_t n_expert = node->op == WSP_GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
+ const size_t expert_size = node->op == WSP_GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
+
  wsp_ggml_backend_synchronize(input_backend);
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
- wsp_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
- } else {
- wsp_ggml_backend_synchronize(split_backend);
+
+ // get the ids
+ wsp_ggml_tensor * ids_tensor = node->src[2];
+ wsp_ggml_backend_t ids_backend = split_backend;
+
+ // if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
+ // in that case, we use the original ids tensor
+ for (int i = input_id + 1; i < split->n_inputs; i++) {
+ if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
+ ids_tensor = split->inputs[i];
+ ids_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
+ break;
+ }
+ }
+
+ if (ids_tensor != prev_ids_tensor) {
+ ids.resize(wsp_ggml_nbytes(ids_tensor) / sizeof(int32_t));
+ wsp_ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, wsp_ggml_nbytes(ids_tensor));
+ wsp_ggml_backend_synchronize(ids_backend);
+
+ // find the used experts
+ used_ids.clear();
+ used_ids.resize(wsp_ggml_bitset_size(n_expert));
+ for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
+ for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
+ int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
+ WSP_GGML_ASSERT(id >= 0 && id < n_expert);
+ wsp_ggml_bitset_set(used_ids.data(), id);
+ }
+ }
+
+ prev_ids_tensor = ids_tensor;
+ }
+
+ // group consecutive experts and copy them together
+ auto copy_experts = [&](int32_t first_id, int32_t last_id) {
+ const size_t expert_offset = first_id * expert_size;
+ const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
+ const size_t padding = std::min<size_t>(expert_size, 512);
+ const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
+
+ wsp_ggml_backend_tensor_set_async(split_backend,
+ input_cpy,
+ (const uint8_t *)input->data + expert_offset, expert_offset,
+ // copy a bit extra at the end to ensure there are no NaNs in the padding of the last expert
+ // this is necessary for MMQ in the CUDA backend
+ expert_size_copy + padding_end);
+ };
+
+ int id = 0;
+ while (!wsp_ggml_bitset_get(used_ids.data(), id)) {
+ id++;
+ }
+ int32_t first_id = id;
+ int32_t last_id = first_id;
+
+ for (++id; id < n_expert; ++id) {
+ if (!wsp_ggml_bitset_get(used_ids.data(), id)) {
+ continue;
+ }
+
+ if (id == last_id + 1) {
+ last_id = id;
+ continue;
+ }
+
+ copy_experts(first_id, last_id);
+
+ first_id = id;
+ last_id = id;
+ }
+ copy_experts(first_id, last_id);
+ } else {
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+ wsp_ggml_backend_synchronize(input_backend);
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ wsp_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ wsp_ggml_backend_synchronize(split_backend);
+ }
+ wsp_ggml_backend_tensor_copy(input, input_cpy);
  }
- wsp_ggml_backend_tensor_copy(input, input_cpy);
  }
  }
  }
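
Note: the MUL_MAT_ID branch in the hunk above avoids copying an entire MoE weight tensor to the split backend; it reads the ids tensor, marks the expert ids that are actually referenced in a bitset, then issues one ranged copy per run of consecutive used ids. The following is a minimal standalone sketch of just that grouping step, not code from the package: std::vector<bool> stands in for wsp_ggml_bitset_t, a caller-supplied callback stands in for wsp_ggml_backend_tensor_set_async, and, unlike the diff (which can assume at least one used expert), it guards against an empty set. The names copy_used_expert_ranges and copy_range are hypothetical.

// Sketch of the expert-grouping idea: given which expert ids a node uses,
// emit one copy per run of consecutive used ids instead of one big copy.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// copy_range is a stand-in for an async ranged tensor copy
static void copy_used_expert_ranges(const std::vector<bool> & used,
                                    const std::function<void(int32_t, int32_t)> & copy_range) {
    const int32_t n_expert = (int32_t) used.size();
    int32_t id = 0;
    while (id < n_expert && !used[id]) {
        id++;                           // find the first used expert
    }
    if (id == n_expert) {
        return;                         // nothing to copy
    }
    int32_t first_id = id;
    int32_t last_id  = id;
    for (++id; id < n_expert; ++id) {
        if (!used[id]) {
            continue;                   // unused ids never start or extend a run
        }
        if (id == last_id + 1) {
            last_id = id;               // extend the current consecutive run
            continue;
        }
        copy_range(first_id, last_id);  // flush the finished run
        first_id = last_id = id;
    }
    copy_range(first_id, last_id);      // flush the final run
}

int main() {
    // experts 1,2,3 and 6 are used -> two ranged copies: [1,3] and [6,6]
    std::vector<bool> used = {false, true, true, true, false, false, true, false};
    copy_used_expert_ranges(used, [](int32_t first, int32_t last) {
        std::printf("copy experts %d..%d\n", (int) first, (int) last);
    });
    return 0;
}
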
@@ -1526,6 +1683,7 @@ void wsp_ggml_backend_sched_free(wsp_ggml_backend_sched_t sched) {
  }

  void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
+ WSP_GGML_ASSERT(sched);
  // reset state for the next run
  if (!sched->is_reset) {
  wsp_ggml_hash_set_reset(&sched->hash_set);
@@ -1537,8 +1695,11 @@ void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
  }

  bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph) {
+ WSP_GGML_ASSERT(sched);
  WSP_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

+ wsp_ggml_backend_sched_reset(sched);
+
  wsp_ggml_backend_sched_synchronize(sched);

  wsp_ggml_backend_sched_split_graph(sched, measure_graph);
@@ -1553,6 +1714,7 @@ bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_g
  }

  bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
+ WSP_GGML_ASSERT(sched);
  WSP_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
  WSP_GGML_ASSERT(!sched->is_alloc);

@@ -1577,6 +1739,7 @@ enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched
  }

  enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute_async(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
+ WSP_GGML_ASSERT(sched);
  if (!sched->is_reset && !sched->is_alloc) {
  wsp_ggml_backend_sched_reset(sched);
  }
@@ -1591,6 +1754,7 @@ enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute_async(wsp_ggml_backend
  }

  void wsp_ggml_backend_sched_synchronize(wsp_ggml_backend_sched_t sched) {
+ WSP_GGML_ASSERT(sched);
  for (int i = 0; i < sched->n_backends; i++) {
  wsp_ggml_backend_synchronize(sched->backends[i]);
  }
@@ -1603,28 +1767,42 @@ void wsp_ggml_backend_sched_synchronize(wsp_ggml_backend_sched_t sched) {
  }

  void wsp_ggml_backend_sched_set_eval_callback(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_sched_eval_callback callback, void * user_data) {
+ WSP_GGML_ASSERT(sched);
  sched->callback_eval = callback;
  sched->callback_eval_user_data = user_data;
  }

  int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched) {
+ WSP_GGML_ASSERT(sched);
  return sched->n_splits;
  }

  int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched) {
+ WSP_GGML_ASSERT(sched);
  return sched->n_copies;
  }

  int wsp_ggml_backend_sched_get_n_backends(wsp_ggml_backend_sched_t sched) {
+ WSP_GGML_ASSERT(sched);
  return sched->n_backends;
  }

  wsp_ggml_backend_t wsp_ggml_backend_sched_get_backend(wsp_ggml_backend_sched_t sched, int i) {
+ WSP_GGML_ASSERT(sched);
  WSP_GGML_ASSERT(i >= 0 && i < sched->n_backends);
  return sched->backends[i];
  }

+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
+ WSP_GGML_ASSERT(sched);
+ int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
+ WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+ return sched->bufts[backend_index];
+ }
+
  size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
+ WSP_GGML_ASSERT(sched);
  int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
  WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

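The hunk above also adds a public accessor, wsp_ggml_backend_sched_get_buffer_type, next to the existing scheduler getters (the matching declaration presumably lands in ggml-backend.h, which this release touches with +17 -1). Below is a hedged usage sketch, not code from the package: a hypothetical dump_sched_state helper that an embedder holding an already-created scheduler could call after wsp_ggml_backend_sched_alloc_graph. It only uses getters visible in this diff plus wsp_ggml_backend_name and wsp_ggml_backend_buft_name, which are assumed to be available from the vendored "ggml-backend.h" as in upstream ggml.

// Hypothetical helper: dump what the scheduler decided for an allocated graph.
#include <cstdio>
#include "ggml-backend.h"

static void dump_sched_state(wsp_ggml_backend_sched_t sched) {
    std::printf("splits: %d, copies: %d\n",
                wsp_ggml_backend_sched_get_n_splits(sched),
                wsp_ggml_backend_sched_get_n_copies(sched));

    const int n_backends = wsp_ggml_backend_sched_get_n_backends(sched);
    for (int i = 0; i < n_backends; i++) {
        wsp_ggml_backend_t             backend = wsp_ggml_backend_sched_get_backend(sched, i);
        // newly exposed in this release: the buffer type the scheduler uses for this backend
        wsp_ggml_backend_buffer_type_t buft    = wsp_ggml_backend_sched_get_buffer_type(sched, backend);

        std::printf("  %s: buffer type %s, %zu bytes reserved\n",
                    wsp_ggml_backend_name(backend),
                    wsp_ggml_backend_buft_name(buft),
                    wsp_ggml_backend_sched_get_buffer_size(sched, backend));
    }
}
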
@@ -1632,6 +1810,7 @@ size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, ws
  }

  void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend) {
+ WSP_GGML_ASSERT(sched);
  int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
  WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  tensor_backend_id(node) = backend_index;
@@ -1640,6 +1819,7 @@ void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, s
  }

  wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
+ WSP_GGML_ASSERT(sched);
  int backend_index = tensor_backend_id(node);
  if (backend_index == -1) {
  return NULL;
@@ -1650,6 +1830,7 @@ wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sc
  // utils

  enum wsp_ggml_status wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor) {
+ WSP_GGML_ASSERT(tensor);
  WSP_GGML_ASSERT(tensor->buffer == NULL);
  WSP_GGML_ASSERT(tensor->view_src != NULL);
  WSP_GGML_ASSERT(tensor->view_src->buffer != NULL);
@@ -1661,6 +1842,7 @@ enum wsp_ggml_status wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor)
  }

  enum wsp_ggml_status wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr) {
+ WSP_GGML_ASSERT(tensor);
  WSP_GGML_ASSERT(tensor->buffer == NULL);
  WSP_GGML_ASSERT(tensor->data == NULL);
  WSP_GGML_ASSERT(tensor->view_src == NULL);
@@ -1734,6 +1916,7 @@ static void graph_copy_init_tensor(struct wsp_ggml_hash_set * hash_set, struct w
  }

  struct wsp_ggml_backend_graph_copy wsp_ggml_backend_graph_copy(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * graph) {
+ WSP_GGML_ASSERT(graph);
  struct wsp_ggml_hash_set hash_set = wsp_ggml_hash_set_new(graph->visited_hash_set.size);
  struct wsp_ggml_tensor ** node_copies = (wsp_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
  bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
@@ -1878,6 +2061,7 @@ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggm
  // CPU backend - buffer

  static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  uintptr_t data = (uintptr_t)buffer->context;

  // align the buffer
@@ -1889,28 +2073,33 @@ static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buf
  }

  static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
+ WSP_GGML_ASSERT(buffer);
  wsp_ggml_aligned_free(buffer->context, buffer->size);
  }

  static void wsp_ggml_backend_cpu_buffer_memset_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor);
  memset((char *)tensor->data + offset, value, size);

  WSP_GGML_UNUSED(buffer);
  }

  static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor);
  memcpy((char *)tensor->data + offset, data, size);

  WSP_GGML_UNUSED(buffer);
  }

  static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor);
  memcpy(data, (const char *)tensor->data + offset, size);

  WSP_GGML_UNUSED(buffer);
  }

  static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
+ WSP_GGML_ASSERT(src);
  if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
  memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
  return true;
@@ -1921,6 +2110,7 @@ static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buf
  }

  static void wsp_ggml_backend_cpu_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+ WSP_GGML_ASSERT(buffer);
  memset(buffer->context, value, buffer->size);
  }