whisper.rn 0.4.0-rc.9 → 0.4.1

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
Files changed (202)
  1. package/README.md +74 -1
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +43 -13
  4. package/android/src/main/java/com/rnwhisper/RNWhisper.java +211 -0
  5. package/android/src/main/java/com/rnwhisper/WhisperContext.java +64 -36
  6. package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +157 -0
  7. package/android/src/main/jni.cpp +205 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
  15. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
  16. package/cpp/coreml/whisper-compat.h +10 -0
  17. package/cpp/coreml/whisper-compat.m +35 -0
  18. package/cpp/coreml/whisper-decoder-impl.h +27 -15
  19. package/cpp/coreml/whisper-decoder-impl.m +36 -10
  20. package/cpp/coreml/whisper-encoder-impl.h +21 -9
  21. package/cpp/coreml/whisper-encoder-impl.m +29 -3
  22. package/cpp/ggml-alloc.c +39 -37
  23. package/cpp/ggml-alloc.h +1 -1
  24. package/cpp/ggml-backend-impl.h +55 -27
  25. package/cpp/ggml-backend-reg.cpp +591 -0
  26. package/cpp/ggml-backend.cpp +336 -955
  27. package/cpp/ggml-backend.h +70 -42
  28. package/cpp/ggml-common.h +57 -49
  29. package/cpp/ggml-cpp.h +39 -0
  30. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  31. package/cpp/ggml-cpu/amx/amx.h +8 -0
  32. package/cpp/ggml-cpu/amx/common.h +91 -0
  33. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  34. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  35. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  36. package/cpp/ggml-cpu/arch/arm/quants.c +4113 -0
  37. package/cpp/ggml-cpu/arch/arm/repack.cpp +2162 -0
  38. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  39. package/cpp/ggml-cpu/arch/x86/quants.c +4310 -0
  40. package/cpp/ggml-cpu/arch/x86/repack.cpp +3284 -0
  41. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  42. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  43. package/cpp/ggml-cpu/binary-ops.h +16 -0
  44. package/cpp/ggml-cpu/common.h +72 -0
  45. package/cpp/ggml-cpu/ggml-cpu-impl.h +511 -0
  46. package/cpp/ggml-cpu/ggml-cpu.c +3473 -0
  47. package/cpp/ggml-cpu/ggml-cpu.cpp +671 -0
  48. package/cpp/ggml-cpu/ops.cpp +9085 -0
  49. package/cpp/ggml-cpu/ops.h +111 -0
  50. package/cpp/ggml-cpu/quants.c +1157 -0
  51. package/cpp/ggml-cpu/quants.h +89 -0
  52. package/cpp/ggml-cpu/repack.cpp +1570 -0
  53. package/cpp/ggml-cpu/repack.h +98 -0
  54. package/cpp/ggml-cpu/simd-mappings.h +1006 -0
  55. package/cpp/ggml-cpu/traits.cpp +36 -0
  56. package/cpp/ggml-cpu/traits.h +38 -0
  57. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  58. package/cpp/ggml-cpu/unary-ops.h +28 -0
  59. package/cpp/ggml-cpu/vec.cpp +321 -0
  60. package/cpp/ggml-cpu/vec.h +973 -0
  61. package/cpp/ggml-cpu.h +143 -0
  62. package/cpp/ggml-impl.h +417 -23
  63. package/cpp/ggml-metal-impl.h +622 -0
  64. package/cpp/ggml-metal.h +9 -9
  65. package/cpp/ggml-metal.m +3451 -1344
  66. package/cpp/ggml-opt.cpp +1037 -0
  67. package/cpp/ggml-opt.h +237 -0
  68. package/cpp/ggml-quants.c +296 -10818
  69. package/cpp/ggml-quants.h +78 -125
  70. package/cpp/ggml-threading.cpp +12 -0
  71. package/cpp/ggml-threading.h +14 -0
  72. package/cpp/ggml-whisper-sim.metallib +0 -0
  73. package/cpp/ggml-whisper.metallib +0 -0
  74. package/cpp/ggml.c +4633 -21450
  75. package/cpp/ggml.h +320 -661
  76. package/cpp/gguf.cpp +1347 -0
  77. package/cpp/gguf.h +202 -0
  78. package/cpp/rn-whisper.cpp +4 -11
  79. package/cpp/whisper-arch.h +197 -0
  80. package/cpp/whisper.cpp +2022 -495
  81. package/cpp/whisper.h +75 -18
  82. package/ios/CMakeLists.txt +95 -0
  83. package/ios/RNWhisper.h +5 -0
  84. package/ios/RNWhisper.mm +147 -0
  85. package/ios/RNWhisperAudioUtils.m +4 -0
  86. package/ios/RNWhisperContext.h +5 -0
  87. package/ios/RNWhisperContext.mm +22 -26
  88. package/ios/RNWhisperVadContext.h +29 -0
  89. package/ios/RNWhisperVadContext.mm +152 -0
  90. package/ios/rnwhisper.xcframework/Info.plist +74 -0
  91. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  92. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  93. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  94. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  95. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  96. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  97. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  98. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  99. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  100. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  101. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  102. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  103. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  104. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  105. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  106. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  107. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  108. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  109. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  110. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  111. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  112. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  113. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  114. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  115. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  116. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  117. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  118. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  119. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  120. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  121. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  122. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  123. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  124. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  125. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  126. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  127. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  128. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  129. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  130. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  131. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  132. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  133. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  134. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  135. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  136. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  137. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  138. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  139. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  140. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  141. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  142. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  143. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  144. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  145. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  146. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  147. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  148. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  149. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  150. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  151. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  152. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  153. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  154. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  155. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  156. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  157. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  158. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  159. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  160. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  161. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  162. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  163. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  164. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  165. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  166. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  167. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  168. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  169. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  170. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  171. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  172. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  173. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  174. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  175. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  176. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  177. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  178. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  179. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  180. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  181. package/jest/mock.js +24 -0
  182. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  183. package/lib/commonjs/index.js +111 -1
  184. package/lib/commonjs/index.js.map +1 -1
  185. package/lib/commonjs/version.json +1 -1
  186. package/lib/module/NativeRNWhisper.js.map +1 -1
  187. package/lib/module/index.js +112 -0
  188. package/lib/module/index.js.map +1 -1
  189. package/lib/module/version.json +1 -1
  190. package/lib/typescript/NativeRNWhisper.d.ts +35 -0
  191. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  192. package/lib/typescript/index.d.ts +39 -3
  193. package/lib/typescript/index.d.ts.map +1 -1
  194. package/package.json +10 -6
  195. package/src/NativeRNWhisper.ts +48 -0
  196. package/src/index.ts +132 -1
  197. package/src/version.json +1 -1
  198. package/whisper-rn.podspec +11 -18
  199. package/cpp/README.md +0 -4
  200. package/cpp/ggml-aarch64.c +0 -3209
  201. package/cpp/ggml-aarch64.h +0 -39
  202. package/cpp/ggml-cpu-impl.h +0 -614
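
The bulk of the ggml churn above tracks upstream's backend refactor: the registry, device, and CPU-backend code leaves ggml-backend.cpp for the new ggml-backend-reg.cpp and ggml-cpu/ sources, and ggml-aarch64.c / ggml-cpu-impl.h are dropped. The hunks that follow appear to come from package/cpp/ggml-backend.cpp. As an illustration only (not a documented whisper.rn entry point, and assuming the wsp_-prefixed registry API keeps the signatures visible in the removed code below), enumerating devices and picking a backend looks roughly like this:

    #include "ggml-backend.h"
    #include <stdio.h>

    // Sketch: list the registered devices, then initialize the preferred one.
    // Every function used here appears in the diff below; error handling is minimal.
    static wsp_ggml_backend_t pick_backend(void) {
        for (size_t i = 0; i < wsp_ggml_backend_dev_count(); i++) {
            wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
            printf("device %zu: %s (%s)\n", i,
                   wsp_ggml_backend_dev_name(dev),
                   wsp_ggml_backend_dev_description(dev));
        }
        // prefers a full GPU device and falls back to the CPU device
        return wsp_ggml_backend_init_best();
    }
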
@@ -8,6 +8,7 @@
  #include <windows.h>
  #endif

+ #include "ggml-backend.h"
  #include "ggml-backend-impl.h"
  #include "ggml-alloc.h"
  #include "ggml-impl.h"
@@ -20,6 +21,7 @@
  #include <string.h>
  #include <string>
  #include <vector>
+ #include <algorithm>

  #ifdef __APPLE__
  #include <sys/types.h>
@@ -34,6 +36,11 @@ const char * wsp_ggml_backend_buft_name(wsp_ggml_backend_buffer_type_t buft) {
  }

  wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
+ if (size == 0) {
+ // return a dummy buffer for zero-sized allocations
+ return wsp_ggml_backend_buffer_init(buft, {}, NULL, 0);
+ }
+
  return buft->iface.alloc_buffer(buft, size);
  }

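With the special case above, a zero-byte request never reaches a backend's alloc_buffer hook and instead yields an empty buffer object. A minimal caller-side sketch (assuming the CPU buffer type and that its declaration now lives in ggml-cpu.h after the split; this is not code from the diff itself):

    #include "ggml-backend.h"
    #include "ggml-cpu.h"
    #include <assert.h>

    // Sketch: the dummy buffer reports size 0 and a NULL base
    // (see the wsp_ggml_backend_buffer_get_base hunk further down).
    static void zero_size_alloc_example(void) {
        wsp_ggml_backend_buffer_type_t buft = wsp_ggml_backend_cpu_buffer_type();
        wsp_ggml_backend_buffer_t      buf  = wsp_ggml_backend_buft_alloc_buffer(buft, 0);
        assert(wsp_ggml_backend_buffer_get_size(buf) == 0);
        assert(wsp_ggml_backend_buffer_get_base(buf) == NULL);
        wsp_ggml_backend_buffer_free(buf);
    }
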
@@ -49,7 +56,7 @@ size_t wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_type_t buft) {
  return SIZE_MAX;
  }

- size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor) {
+ size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_tensor * tensor) {
  // get_alloc_size is optional, defaults to wsp_ggml_nbytes
  if (buft->iface.get_alloc_size) {
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -89,7 +96,7 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
  }

  const char * wsp_ggml_backend_buffer_name(wsp_ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name(buffer);
+ return wsp_ggml_backend_buft_name(wsp_ggml_backend_buffer_get_type(buffer));
  }

  void wsp_ggml_backend_buffer_free(wsp_ggml_backend_buffer_t buffer) {
@@ -108,6 +115,11 @@ size_t wsp_ggml_backend_buffer_get_size(wsp_ggml_backend_buffer_t buffer) {
  }

  void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
+ // get_base is optional if the buffer is zero-sized
+ if (buffer->size == 0) {
+ return NULL;
+ }
+
  void * base = buffer->iface.get_base(buffer);

  WSP_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -115,11 +127,21 @@ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
  return base;
  }

- void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
+ enum wsp_ggml_status wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
  // init_tensor is optional
  if (buffer->iface.init_tensor) {
- buffer->iface.init_tensor(buffer, tensor);
+ return buffer->iface.init_tensor(buffer, tensor);
+ }
+ return WSP_GGML_STATUS_SUCCESS;
+ }
+
+ void wsp_ggml_backend_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+ // clear is optional if the buffer is zero-sized
+ if (buffer->size == 0) {
+ return;
  }
+
+ buffer->iface.clear(buffer, value);
  }

  size_t wsp_ggml_backend_buffer_get_alignment(wsp_ggml_backend_buffer_t buffer) {
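Changing wsp_ggml_backend_buffer_init_tensor from void to enum wsp_ggml_status lets allocators propagate per-tensor initialization failures instead of assuming success. The real call sites live in ggml-alloc.c and are not shown here; a hypothetical caller would now look like:

    // Sketch: propagate the status that init_tensor now returns.
    static enum wsp_ggml_status init_one_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * t) {
        enum wsp_ggml_status status = wsp_ggml_backend_buffer_init_tensor(buffer, t);
        if (status != WSP_GGML_STATUS_SUCCESS) {
            // e.g. WSP_GGML_STATUS_ALLOC_FAILED reported by the backend's hook
            return status;
        }
        return WSP_GGML_STATUS_SUCCESS;
    }
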
@@ -130,14 +152,10 @@ size_t wsp_ggml_backend_buffer_get_max_size(wsp_ggml_backend_buffer_t buffer) {
  return wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_get_type(buffer));
  }

- size_t wsp_ggml_backend_buffer_get_alloc_size(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
+ size_t wsp_ggml_backend_buffer_get_alloc_size(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor) {
  return wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_get_type(buffer), tensor);
  }

- void wsp_ggml_backend_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
- buffer->iface.clear(buffer, value);
- }
-
  bool wsp_ggml_backend_buffer_is_host(wsp_ggml_backend_buffer_t buffer) {
  return wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_get_type(buffer));
  }
@@ -198,7 +216,7 @@ void wsp_ggml_backend_free(wsp_ggml_backend_t backend) {
  }

  wsp_ggml_backend_buffer_type_t wsp_ggml_backend_get_default_buffer_type(wsp_ggml_backend_t backend) {
- return backend->iface.get_default_buffer_type(backend);
+ return wsp_ggml_backend_dev_buffer_type(backend->device);
  }

  wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_buffer(wsp_ggml_backend_t backend, size_t size) {
@@ -236,45 +254,46 @@ void wsp_ggml_backend_tensor_get_async(wsp_ggml_backend_t backend, const struct
  }

  void wsp_ggml_backend_tensor_set(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor);
  wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+ if (size == 0) {
+ return;
+ }
+
  WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");

- if (!size) {
- return;
- }
-
  buf->iface.set_tensor(buf, tensor, data, offset, size);
  }

  void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor);
  wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+ if (size == 0) {
+ return;
+ }
+
  WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");

- if (!size) {
- return;
- }
-
  buf->iface.get_tensor(buf, tensor, data, offset, size);
  }

- WSP_GGML_API void wsp_ggml_backend_tensor_memset(struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ void wsp_ggml_backend_tensor_memset(struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
  wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

- WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
- WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
-
- if (!size) {
+ if (size == 0) {
  return;
  }

- WSP_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+ WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
+ WSP_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");

  buf->iface.memset_tensor(buf, tensor, value, offset, size);
  }
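The rewritten tensor accessors above also hoist the size == 0 early return ahead of the buffer/data assertions (and newly assert on the tensor pointer itself), so a zero-length read or write no longer requires the tensor to be allocated. A hypothetical caller-side helper that relies on this:

    // Sketch: safe even when n_bytes == 0 and the tensor's buffer is not yet set,
    // because wsp_ggml_backend_tensor_set returns before asserting on the buffer.
    static void upload_host_data(struct wsp_ggml_tensor * t, const void * host_data, size_t n_bytes) {
        wsp_ggml_backend_tensor_set(t, host_data, 0, n_bytes);
    }
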
@@ -316,32 +335,15 @@ enum wsp_ggml_status wsp_ggml_backend_graph_compute_async(wsp_ggml_backend_t bac
  }

  bool wsp_ggml_backend_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
- // helper to ease transition to device interface
- if (backend->device) {
- return wsp_ggml_backend_dev_supports_op(backend->device, op);
- }
-
- return backend->iface.supports_op(backend, op);
+ return wsp_ggml_backend_dev_supports_op(backend->device, op);
  }

  bool wsp_ggml_backend_supports_buft(wsp_ggml_backend_t backend, wsp_ggml_backend_buffer_type_t buft) {
- // helper to ease transition to device interface
- if (backend->device) {
- return wsp_ggml_backend_dev_supports_buft(backend->device, buft);
- }
- return backend->iface.supports_buft(backend, buft);
+ return wsp_ggml_backend_dev_supports_buft(backend->device, buft);
  }

  bool wsp_ggml_backend_offload_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
- // helper to ease transition to device interface
- if (backend->device) {
- return wsp_ggml_backend_dev_offload_op(backend->device, op);
- }
-
- if (backend->iface.offload_op != NULL) {
- return backend->iface.offload_op(backend, op);
- }
- return false;
+ return wsp_ggml_backend_dev_offload_op(backend->device, op);
  }

  wsp_ggml_backend_dev_t wsp_ggml_backend_get_device(wsp_ggml_backend_t backend) {
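After this hunk the backend-level capability queries are unconditional wrappers over the device interface, which assumes backend->device is always set. Restated as a sketch:

    // Sketch: equivalent to wsp_ggml_backend_supports_op(backend, op) after the change above.
    static bool supports_op_via_device(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
        return wsp_ggml_backend_dev_supports_op(wsp_ggml_backend_get_device(backend), op);
    }
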
@@ -398,917 +400,133 @@ void wsp_ggml_backend_tensor_copy_async(wsp_ggml_backend_t backend_src, wsp_ggml
  if (backend_dst->iface.cpy_tensor_async != NULL) {
  if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
  return;
- }
- }
-
- // an async copy would normally happen after all the queued operations on both backends are completed
- // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
- wsp_ggml_backend_synchronize(backend_src);
- wsp_ggml_backend_synchronize(backend_dst);
- wsp_ggml_backend_tensor_copy(src, dst);
- }
-
- // events
-
- wsp_ggml_backend_event_t wsp_ggml_backend_event_new(wsp_ggml_backend_dev_t device) {
- // null device is allowed for the transition period to the device interface
- if (device == NULL || device->iface.event_new == NULL) {
- return NULL;
- }
- return device->iface.event_new(device);
- }
-
- void wsp_ggml_backend_event_free(wsp_ggml_backend_event_t event) {
- if (event == NULL) {
- return;
- }
- event->device->iface.event_free(event->device, event);
- }
-
- void wsp_ggml_backend_event_record(wsp_ggml_backend_event_t event, wsp_ggml_backend_t backend) {
- WSP_GGML_ASSERT(backend->iface.event_record != NULL);
-
- backend->iface.event_record(backend, event);
- }
-
- void wsp_ggml_backend_event_synchronize(wsp_ggml_backend_event_t event) {
- WSP_GGML_ASSERT(event->device->iface.event_synchronize);
-
- event->device->iface.event_synchronize(event->device, event);
- }
-
- void wsp_ggml_backend_event_wait(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event) {
- WSP_GGML_ASSERT(backend->iface.event_wait != NULL);
-
- backend->iface.event_wait(backend, event);
- }
-
- // Backend device
-
- const char * wsp_ggml_backend_dev_name(wsp_ggml_backend_dev_t device) {
- return device->iface.get_name(device);
- }
-
- const char * wsp_ggml_backend_dev_description(wsp_ggml_backend_dev_t device) {
- return device->iface.get_description(device);
- }
-
- void wsp_ggml_backend_dev_memory(wsp_ggml_backend_dev_t device, size_t * free, size_t * total) {
- device->iface.get_memory(device, free, total);
- }
-
- enum wsp_ggml_backend_dev_type wsp_ggml_backend_dev_type(wsp_ggml_backend_dev_t device) {
- return device->iface.get_type(device);
- }
-
- void wsp_ggml_backend_dev_get_props(wsp_ggml_backend_dev_t device, struct wsp_ggml_backend_dev_props * props) {
- memset(props, 0, sizeof(*props));
- device->iface.get_props(device, props);
- }
-
- wsp_ggml_backend_reg_t wsp_ggml_backend_dev_backend_reg(wsp_ggml_backend_dev_t device) {
- return device->reg;
- }
-
- wsp_ggml_backend_t wsp_ggml_backend_dev_init(wsp_ggml_backend_dev_t device, const char * params) {
- return device->iface.init_backend(device, params);
- }
-
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_buffer_type(wsp_ggml_backend_dev_t device) {
- return device->iface.get_buffer_type(device);
- }
-
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_host_buffer_type(wsp_ggml_backend_dev_t device) {
- if (device->iface.get_host_buffer_type == NULL) {
- return NULL;
- }
-
- return device->iface.get_host_buffer_type(device);
- }
-
- wsp_ggml_backend_buffer_t wsp_ggml_backend_dev_buffer_from_host_ptr(wsp_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
- return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
- }
-
- bool wsp_ggml_backend_dev_supports_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
- return device->iface.supports_op(device, op);
- }
-
- bool wsp_ggml_backend_dev_supports_buft(wsp_ggml_backend_dev_t device, wsp_ggml_backend_buffer_type_t buft) {
- return device->iface.supports_buft(device, buft);
- }
-
- bool wsp_ggml_backend_dev_offload_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
- if (device->iface.offload_op != NULL) {
- return device->iface.offload_op(device, op);
- }
-
- return false;
- }
-
- // Backend (reg)
-
- const char * wsp_ggml_backend_reg_name(wsp_ggml_backend_reg_t reg) {
- return reg->iface.get_name(reg);
- }
-
- size_t wsp_ggml_backend_reg_dev_count(wsp_ggml_backend_reg_t reg) {
- return reg->iface.get_device_count(reg);
- }
-
- wsp_ggml_backend_dev_t wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_reg_t reg, size_t index) {
- return reg->iface.get_device(reg, index);
- }
-
- void * wsp_ggml_backend_reg_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
- if (!reg->iface.get_proc_address) {
- return NULL;
- }
- return reg->iface.get_proc_address(reg, name);
- }
-
- // Backend registry
-
- #ifdef WSP_GGML_USE_CUDA
- #include "ggml-cuda.h"
- #endif
-
- #ifdef WSP_GGML_USE_METAL
- #include "ggml-metal.h"
- #endif
-
- #ifdef WSP_GGML_USE_SYCL
- #include "ggml-sycl.h"
- #endif
-
- #ifdef WSP_GGML_USE_VULKAN
- #include "ggml-vulkan.h"
- #endif
-
- #ifdef WSP_GGML_USE_BLAS
- #include "ggml-blas.h"
- #endif
-
- #ifdef WSP_GGML_USE_RPC
- #include "ggml-rpc.h"
- #endif
-
- #ifndef __AMX_INT8__
- #undef WSP_GGML_USE_AMX
- #endif
-
- #ifdef WSP_GGML_USE_AMX
- # include "ggml-amx.h"
- #endif
-
- #ifdef WSP_GGML_USE_CANN
- #include "ggml-cann.h"
- #endif
-
- struct wsp_ggml_backend_registry {
- std::vector<wsp_ggml_backend_reg_t> backends;
- std::vector<wsp_ggml_backend_dev_t> devices;
-
- wsp_ggml_backend_registry() {
- #ifdef WSP_GGML_USE_CUDA
- register_backend(wsp_ggml_backend_cuda_reg());
- #endif
- #ifdef WSP_GGML_USE_METAL
- #include <TargetConditionals.h>
- #if !TARGET_OS_SIMULATOR
- register_backend(wsp_ggml_backend_metal_reg());
- #endif
- #endif
- #ifdef WSP_GGML_USE_SYCL
- register_backend(wsp_ggml_backend_sycl_reg());
- #endif
- #ifdef WSP_GGML_USE_VULKAN
- register_backend(wsp_ggml_backend_vk_reg());
- #endif
- #ifdef WSP_GGML_USE_BLAS
- register_backend(wsp_ggml_backend_blas_reg());
- #endif
- #ifdef WSP_GGML_USE_RPC
- register_backend(wsp_ggml_backend_rpc_reg());
- #endif
- #ifdef WSP_GGML_USE_AMX
- register_backend(wsp_ggml_backend_amx_reg());
- #endif
- #ifdef WSP_GGML_USE_CANN
- register_backend(wsp_ggml_backend_cann_reg());
- #endif
-
- // TODO: kompute
-
- register_backend(wsp_ggml_backend_cpu_reg());
- }
-
- void register_backend(wsp_ggml_backend_reg_t reg) {
- #ifndef NDEBUG
- WSP_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
- __func__, wsp_ggml_backend_reg_name(reg), wsp_ggml_backend_reg_dev_count(reg));
- #endif
- backends.push_back(reg);
- for (size_t i = 0; i < wsp_ggml_backend_reg_dev_count(reg); i++) {
- register_device(wsp_ggml_backend_reg_dev_get(reg, i));
- }
- }
-
- void register_device(wsp_ggml_backend_dev_t device) {
- #ifndef NDEBUG
- WSP_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, wsp_ggml_backend_dev_name(device), wsp_ggml_backend_dev_description(device));
- #endif
- devices.push_back(device);
- }
- };
-
- static wsp_ggml_backend_registry & get_reg() {
- static wsp_ggml_backend_registry reg;
- return reg;
- }
-
- // Internal API
- void wsp_ggml_backend_register(wsp_ggml_backend_reg_t reg) {
- get_reg().register_backend(reg);
- }
-
- void wsp_ggml_backend_device_register(wsp_ggml_backend_dev_t device) {
- get_reg().register_device(device);
- }
-
- // Backend (reg) enumeration
- size_t wsp_ggml_backend_reg_count() {
- return get_reg().backends.size();
- }
-
- wsp_ggml_backend_reg_t wsp_ggml_backend_reg_get(size_t index) {
- WSP_GGML_ASSERT(index < wsp_ggml_backend_reg_count());
- return get_reg().backends[index];
- }
-
- wsp_ggml_backend_reg_t wsp_ggml_backend_reg_by_name(const char * name) {
- for (size_t i = 0; i < wsp_ggml_backend_reg_count(); i++) {
- wsp_ggml_backend_reg_t reg = wsp_ggml_backend_reg_get(i);
- if (strcmp(wsp_ggml_backend_reg_name(reg), name) == 0) {
- return reg;
- }
- }
- return NULL;
- }
-
- // Device enumeration
- size_t wsp_ggml_backend_dev_count() {
- return get_reg().devices.size();
- }
-
- wsp_ggml_backend_dev_t wsp_ggml_backend_dev_get(size_t index) {
- WSP_GGML_ASSERT(index < wsp_ggml_backend_dev_count());
- return get_reg().devices[index];
- }
-
- wsp_ggml_backend_dev_t wsp_ggml_backend_dev_by_name(const char * name) {
- for (size_t i = 0; i < wsp_ggml_backend_dev_count(); i++) {
- wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
- if (strcmp(wsp_ggml_backend_dev_name(dev), name) == 0) {
- return dev;
- }
- }
- return NULL;
- }
-
- wsp_ggml_backend_dev_t wsp_ggml_backend_dev_by_type(enum wsp_ggml_backend_dev_type type) {
- for (size_t i = 0; i < wsp_ggml_backend_dev_count(); i++) {
- wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
- if (wsp_ggml_backend_dev_type(dev) == type) {
- return dev;
- }
- }
- return NULL;
- }
-
- // Convenience functions
- wsp_ggml_backend_t wsp_ggml_backend_init_by_name(const char * name, const char * params) {
- wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_by_name(name);
- if (!dev) {
- return NULL;
- }
- return wsp_ggml_backend_dev_init(dev, params);
- }
-
- wsp_ggml_backend_t wsp_ggml_backend_init_by_type(enum wsp_ggml_backend_dev_type type, const char * params) {
- wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_by_type(type);
- if (!dev) {
- return NULL;
- }
- return wsp_ggml_backend_dev_init(dev, params);
- }
-
- wsp_ggml_backend_t wsp_ggml_backend_init_best(void) {
- wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
- if (!dev) {
- dev = wsp_ggml_backend_dev_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
- }
- if (!dev) {
- return NULL;
- }
- return wsp_ggml_backend_dev_init(dev, NULL);
- }
-
- // backend CPU
-
- static const char * wsp_ggml_backend_cpu_buffer_get_name(wsp_ggml_backend_buffer_t buffer) {
- return "CPU";
-
- WSP_GGML_UNUSED(buffer);
- }
-
- static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
- uintptr_t data = (uintptr_t)buffer->context;
-
- // align the buffer
- if (data % TENSOR_ALIGNMENT != 0) {
- data = WSP_GGML_PAD(data, TENSOR_ALIGNMENT);
- }
-
- return (void *)data;
- }
-
- static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
- wsp_ggml_aligned_free(buffer->context, buffer->size);
- }
-
- static void wsp_ggml_backend_cpu_buffer_memset_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
- memset((char *)tensor->data + offset, value, size);
-
- WSP_GGML_UNUSED(buffer);
- }
-
- static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- memcpy((char *)tensor->data + offset, data, size);
-
- WSP_GGML_UNUSED(buffer);
- }
-
- static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- memcpy(data, (const char *)tensor->data + offset, size);
-
- WSP_GGML_UNUSED(buffer);
- }
-
- static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
- if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
- memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
- return true;
- }
- return false;
-
- WSP_GGML_UNUSED(buffer);
- }
-
- static void wsp_ggml_backend_cpu_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
- memset(buffer->context, value, buffer->size);
- }
-
- static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_cpu_buffer_i = {
- /* .get_name = */ wsp_ggml_backend_cpu_buffer_get_name,
- /* .free_buffer = */ wsp_ggml_backend_cpu_buffer_free_buffer,
- /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
- /* .init_tensor = */ NULL, // no initialization required
- /* .memset_tensor = */ wsp_ggml_backend_cpu_buffer_memset_tensor,
- /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
- /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
- /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
- /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
- /* .reset = */ NULL,
- };
-
- static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_cpu_buffer_from_ptr_i = {
- /* .get_name = */ wsp_ggml_backend_cpu_buffer_get_name,
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
- /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
- /* .init_tensor = */ NULL, // no initialization required
- /* .memset_tensor = */ wsp_ggml_backend_cpu_buffer_memset_tensor,
- /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
- /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
- /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
- /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
- /* .reset = */ NULL,
- };
-
- static const char * wsp_ggml_backend_cpu_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
- return "CPU";
-
- WSP_GGML_UNUSED(buft);
- }
-
- static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
- auto alloc_size = size;
- if (alloc_size == 0) {
- alloc_size = 1;
- }
-
- void * data = wsp_ggml_aligned_malloc(alloc_size);
-
- if (data == NULL) {
- WSP_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
- return NULL;
- }
-
- return wsp_ggml_backend_buffer_init(buft, wsp_ggml_backend_cpu_buffer_i, data, alloc_size);
- }
-
- static size_t wsp_ggml_backend_cpu_buffer_type_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
- return TENSOR_ALIGNMENT;
-
- WSP_GGML_UNUSED(buft);
- }
-
- static bool wsp_ggml_backend_cpu_buffer_type_is_host(wsp_ggml_backend_buffer_type_t buft) {
- return true;
-
- WSP_GGML_UNUSED(buft);
- }
-
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_buffer_type(void) {
- static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type = {
- /* .iface = */ {
- /* .get_name = */ wsp_ggml_backend_cpu_buffer_type_get_name,
- /* .alloc_buffer = */ wsp_ggml_backend_cpu_buffer_type_alloc_buffer,
- /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
- /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
- /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
- },
- /* .device = */ wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
- /* .context = */ NULL,
- };
-
- return &wsp_ggml_backend_cpu_buffer_type;
- }
-
- #ifdef WSP_GGML_USE_CPU_HBM
-
- // buffer type HBM
-
- #include <hbwmalloc.h>
-
- static const char * wsp_ggml_backend_cpu_hbm_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
- return "CPU_HBM";
-
- WSP_GGML_UNUSED(buft);
- }
-
- static const char * wsp_ggml_backend_cpu_hbm_buffer_get_name(wsp_ggml_backend_buffer_t buf) {
- return "CPU_HBM";
-
- WSP_GGML_UNUSED(buf);
- }
-
- static void wsp_ggml_backend_cpu_hbm_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
- hbw_free(buffer->context);
- }
-
- static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
- //void * ptr = hbw_malloc(size);
- void * ptr;
- int result = hbw_posix_memalign(&ptr, wsp_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
- if (result != 0) {
- WSP_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
- return NULL;
- }
-
- wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_cpu_buffer_from_ptr(ptr, size);
- buffer->buft = buft;
- buffer->iface.get_name = wsp_ggml_backend_cpu_hbm_buffer_get_name;
- buffer->iface.free_buffer = wsp_ggml_backend_cpu_hbm_buffer_free_buffer;
-
- return buffer;
- }
-
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_hbm_buffer_type(void) {
- static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type_hbm = {
- /* .iface = */ {
- /* .get_name = */ wsp_ggml_backend_cpu_hbm_buffer_type_get_name,
- /* .alloc_buffer = */ wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
- /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
- /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
- /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
- },
- /* .context = */ NULL,
- };
-
- return &wsp_ggml_backend_cpu_buffer_type_hbm;
- }
- #endif
-
- struct wsp_ggml_backend_cpu_context {
- int n_threads;
- wsp_ggml_threadpool_t threadpool;
-
- uint8_t * work_data;
- size_t work_size;
-
- wsp_ggml_abort_callback abort_callback;
- void * abort_callback_data;
- };
-
- static const char * wsp_ggml_backend_cpu_get_name(wsp_ggml_backend_t backend) {
- return "CPU";
-
- WSP_GGML_UNUSED(backend);
- }
-
- static void wsp_ggml_backend_cpu_free(wsp_ggml_backend_t backend) {
- struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
- delete[] cpu_ctx->work_data;
- delete cpu_ctx;
- delete backend;
- }
-
- static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_get_default_buffer_type(wsp_ggml_backend_t backend) {
- return wsp_ggml_backend_cpu_buffer_type();
-
- WSP_GGML_UNUSED(backend);
- }
-
- struct wsp_ggml_backend_plan_cpu {
- struct wsp_ggml_cplan cplan;
- struct wsp_ggml_cgraph cgraph;
- };
-
- static wsp_ggml_backend_graph_plan_t wsp_ggml_backend_cpu_graph_plan_create(wsp_ggml_backend_t backend, const struct wsp_ggml_cgraph * cgraph) {
- struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
-
- struct wsp_ggml_backend_plan_cpu * cpu_plan = new wsp_ggml_backend_plan_cpu;
-
- cpu_plan->cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
- if (cpu_plan->cplan.work_size > 0) {
- cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
- if (cpu_plan->cplan.work_data == NULL) {
- delete cpu_plan;
- return NULL;
- }
- }
-
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
- return cpu_plan;
- }
-
- static void wsp_ggml_backend_cpu_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
- struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
-
- delete[] cpu_plan->cplan.work_data;
- delete cpu_plan;
-
- WSP_GGML_UNUSED(backend);
- }
-
- static enum wsp_ggml_status wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
- struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
-
- return wsp_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-
- WSP_GGML_UNUSED(backend);
- }
-
- static enum wsp_ggml_status wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
- struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
-
- struct wsp_ggml_cplan cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-
- if (cpu_ctx->work_size < cplan.work_size) {
- delete[] cpu_ctx->work_data;
- cpu_ctx->work_data = new uint8_t[cplan.work_size];
- if (cpu_ctx->work_data == NULL) {
- cpu_ctx->work_size = 0;
- return WSP_GGML_STATUS_ALLOC_FAILED;
- }
- cpu_ctx->work_size = cplan.work_size;
- }
- cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
- cplan.abort_callback = cpu_ctx->abort_callback;
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
- return wsp_ggml_graph_compute(cgraph, &cplan);
- }
-
- static const struct wsp_ggml_backend_i wsp_ggml_backend_cpu_i = {
- /* .get_name = */ wsp_ggml_backend_cpu_get_name,
- /* .free = */ wsp_ggml_backend_cpu_free,
- /* .get_default_buffer_type = */ wsp_ggml_backend_cpu_get_default_buffer_type,
- /* .set_tensor_async = */ NULL,
- /* .get_tensor_async = */ NULL,
- /* .cpy_tensor_async = */ NULL,
- /* .synchronize = */ NULL,
- /* .graph_plan_create = */ wsp_ggml_backend_cpu_graph_plan_create,
- /* .graph_plan_free = */ wsp_ggml_backend_cpu_graph_plan_free,
- /* .graph_plan_update = */ NULL,
- /* .graph_plan_compute = */ wsp_ggml_backend_cpu_graph_plan_compute,
- /* .graph_compute = */ wsp_ggml_backend_cpu_graph_compute,
- /* .supports_op = */ NULL,
- /* .supports_buft = */ NULL,
- /* .offload_op = */ NULL,
- /* .event_record = */ NULL,
- /* .event_wait = */ NULL,
- };
-
- static wsp_ggml_guid_t wsp_ggml_backend_cpu_guid(void) {
- static wsp_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
- return &guid;
- }
-
- wsp_ggml_backend_t wsp_ggml_backend_cpu_init(void) {
- struct wsp_ggml_backend_cpu_context * ctx = new wsp_ggml_backend_cpu_context;
- if (ctx == NULL) {
- return NULL;
- }
-
- ctx->n_threads = WSP_GGML_DEFAULT_N_THREADS;
- ctx->threadpool = NULL;
- ctx->work_data = NULL;
- ctx->work_size = 0;
- ctx->abort_callback = NULL;
- ctx->abort_callback_data = NULL;
-
- wsp_ggml_backend_t cpu_backend = new wsp_ggml_backend {
- /* .guid = */ wsp_ggml_backend_cpu_guid(),
- /* .interface = */ wsp_ggml_backend_cpu_i,
- /* .device = */ wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
- /* .context = */ ctx,
- };
-
- if (cpu_backend == NULL) {
- delete ctx;
- return NULL;
- }
-
- return cpu_backend;
- }
-
- bool wsp_ggml_backend_is_cpu(wsp_ggml_backend_t backend) {
- return backend != NULL && wsp_ggml_guid_matches(backend->guid, wsp_ggml_backend_cpu_guid());
- }
-
- void wsp_ggml_backend_cpu_set_n_threads(wsp_ggml_backend_t backend_cpu, int n_threads) {
- WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backend_cpu));
-
- struct wsp_ggml_backend_cpu_context * ctx = (struct wsp_ggml_backend_cpu_context *)backend_cpu->context;
- ctx->n_threads = n_threads;
- }
-
- void wsp_ggml_backend_cpu_set_threadpool(wsp_ggml_backend_t backend_cpu, wsp_ggml_threadpool_t threadpool) {
- WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backend_cpu));
-
- struct wsp_ggml_backend_cpu_context * ctx = (struct wsp_ggml_backend_cpu_context *)backend_cpu->context;
-
- if (ctx->threadpool && ctx->threadpool != threadpool) {
- // already had a different threadpool, pause/suspend it before switching
- wsp_ggml_threadpool_pause(ctx->threadpool);
- }
- ctx->threadpool = threadpool;
- }
-
- void wsp_ggml_backend_cpu_set_abort_callback(wsp_ggml_backend_t backend_cpu, wsp_ggml_abort_callback abort_callback, void * abort_callback_data) {
- WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backend_cpu));
-
- struct wsp_ggml_backend_cpu_context * ctx = (struct wsp_ggml_backend_cpu_context *)backend_cpu->context;
- ctx->abort_callback = abort_callback;
- ctx->abort_callback_data = abort_callback_data;
- }
-
- wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
- WSP_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
- return wsp_ggml_backend_buffer_init(wsp_ggml_backend_cpu_buffer_type(), wsp_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
- }
-
- ////////////////////////
-
- struct wsp_ggml_backend_cpu_device_context {
- std::string description = "CPU";
-
- wsp_ggml_backend_cpu_device_context() {
- #ifdef __APPLE__
- size_t len = 0;
- if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
- description.resize(len);
- sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
- }
- #elif defined(__linux__)
- FILE * f = fopen("/proc/cpuinfo", "r");
- if (f) {
- char buf[1024];
- while (fgets(buf, sizeof(buf), f)) {
- if (strncmp(buf, "model name", 10) == 0) {
- char * p = strchr(buf, ':');
- if (p) {
- p++;
- while (std::isspace(*p)) {
- p++;
- }
- while (std::isspace(p[strlen(p) - 1])) {
- p[strlen(p) - 1] = '\0';
- }
- description = p;
- break;
- }
- }
- }
- fclose(f);
- }
- #elif defined(_WIN32)
- HKEY hKey;
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
- 0,
- KEY_READ,
- &hKey) == ERROR_SUCCESS) {
- DWORD cpu_brand_size = 0;
- if (RegQueryValueExA(hKey,
- TEXT("ProcessorNameString"),
- NULL,
- NULL,
- NULL,
- &cpu_brand_size) == ERROR_SUCCESS) {
- description.resize(cpu_brand_size);
- if (RegQueryValueExA(hKey,
- TEXT("ProcessorNameString"),
- NULL,
- NULL,
- (LPBYTE)&description[0], // NOLINT
- &cpu_brand_size) == ERROR_SUCCESS) {
- if (description.find('\0') != std::string::npos) {
- description.resize(description.find('\0'));
- }
- }
- }
- RegCloseKey(hKey);
- }
- #endif
- }
- };
-
- static const char * wsp_ggml_backend_cpu_device_get_name(wsp_ggml_backend_dev_t dev) {
- return "CPU";
403
+ }
404
+ }
1158
405
 
1159
- WSP_GGML_UNUSED(dev);
406
+ // an async copy would normally happen after all the queued operations on both backends are completed
407
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
408
+ wsp_ggml_backend_synchronize(backend_src);
409
+ wsp_ggml_backend_synchronize(backend_dst);
410
+ wsp_ggml_backend_tensor_copy(src, dst);
1160
411
  }
1161
412
 
1162
- static const char * wsp_ggml_backend_cpu_device_get_description(wsp_ggml_backend_dev_t dev) {
1163
- struct wsp_ggml_backend_cpu_device_context * ctx = (struct wsp_ggml_backend_cpu_device_context *)dev->context;
413
+ // events
1164
414
 
1165
- return ctx->description.c_str();
415
+ wsp_ggml_backend_event_t wsp_ggml_backend_event_new(wsp_ggml_backend_dev_t device) {
416
+ // null device is allowed for the transition period to the device interface
417
+ if (device == NULL || device->iface.event_new == NULL) {
418
+ return NULL;
419
+ }
420
+ return device->iface.event_new(device);
1166
421
  }
1167
422
 
1168
- static void wsp_ggml_backend_cpu_device_get_memory(wsp_ggml_backend_dev_t dev, size_t * free, size_t * total) {
1169
- // TODO
1170
- *free = 0;
1171
- *total = 0;
1172
-
1173
- WSP_GGML_UNUSED(dev);
423
+ void wsp_ggml_backend_event_free(wsp_ggml_backend_event_t event) {
424
+ if (event == NULL) {
425
+ return;
426
+ }
427
+ event->device->iface.event_free(event->device, event);
1174
428
  }
1175
429
 
1176
- static enum wsp_ggml_backend_dev_type wsp_ggml_backend_cpu_device_get_type(wsp_ggml_backend_dev_t dev) {
1177
- return WSP_GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
430
+ void wsp_ggml_backend_event_record(wsp_ggml_backend_event_t event, wsp_ggml_backend_t backend) {
431
+ WSP_GGML_ASSERT(backend->iface.event_record != NULL);
1178
432
 
1179
- WSP_GGML_UNUSED(dev);
433
+ backend->iface.event_record(backend, event);
1180
434
  }
1181
435
 
1182
- static void wsp_ggml_backend_cpu_device_get_props(wsp_ggml_backend_dev_t dev, struct wsp_ggml_backend_dev_props * props) {
1183
- props->name = wsp_ggml_backend_cpu_device_get_name(dev);
1184
- props->description = wsp_ggml_backend_cpu_device_get_description(dev);
1185
- props->type = wsp_ggml_backend_cpu_device_get_type(dev);
1186
- wsp_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1187
- props->caps = {
1188
- /* .async = */ false,
1189
- /* .host_buffer = */ false,
1190
- /* .buffer_from_host_ptr = */ true,
1191
- /* .events = */ false,
1192
- };
436
+ void wsp_ggml_backend_event_synchronize(wsp_ggml_backend_event_t event) {
437
+ WSP_GGML_ASSERT(event->device->iface.event_synchronize);
438
+
439
+ event->device->iface.event_synchronize(event->device, event);
1193
440
  }
1194
441
 
1195
- static wsp_ggml_backend_t wsp_ggml_backend_cpu_device_init(wsp_ggml_backend_dev_t dev, const char * params) {
1196
- return wsp_ggml_backend_cpu_init();
442
+ void wsp_ggml_backend_event_wait(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event) {
443
+ WSP_GGML_ASSERT(backend->iface.event_wait != NULL);
1197
444
 
1198
- WSP_GGML_UNUSED(dev);
1199
- WSP_GGML_UNUSED(params);
445
+ backend->iface.event_wait(backend, event);
1200
446
  }

- static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_device_get_buffer_type(wsp_ggml_backend_dev_t dev) {
- return wsp_ggml_backend_cpu_buffer_type();
+ // Backend device

- WSP_GGML_UNUSED(dev);
+ const char * wsp_ggml_backend_dev_name(wsp_ggml_backend_dev_t device) {
+ return device->iface.get_name(device);
 }

- static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_device_buffer_from_ptr(wsp_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
- return wsp_ggml_backend_cpu_buffer_from_ptr(ptr, size);
-
- WSP_GGML_UNUSED(dev);
- WSP_GGML_UNUSED(max_tensor_size);
+ const char * wsp_ggml_backend_dev_description(wsp_ggml_backend_dev_t device) {
+ return device->iface.get_description(device);
 }

- static bool wsp_ggml_backend_cpu_device_supports_op(wsp_ggml_backend_dev_t dev, const struct wsp_ggml_tensor * op) {
- switch (op->op) {
- case WSP_GGML_OP_CPY:
- return
- op->type != WSP_GGML_TYPE_IQ2_XXS &&
- op->type != WSP_GGML_TYPE_IQ2_XS &&
- op->type != WSP_GGML_TYPE_IQ1_S &&
- op->type != WSP_GGML_TYPE_IQ1_M; // missing type_traits.from_float
- case WSP_GGML_OP_MUL_MAT:
- return op->src[1]->type == WSP_GGML_TYPE_F32 || op->src[1]->type == wsp_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
- case WSP_GGML_OP_ROPE_BACK:
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
- case WSP_GGML_OP_IM2COL_BACK:
- return op->src[0]->type == WSP_GGML_TYPE_F32 && op->src[1]->type == WSP_GGML_TYPE_F32;
- case WSP_GGML_OP_OUT_PROD:
- return (op->src[0]->type == WSP_GGML_TYPE_F32 || wsp_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == WSP_GGML_TYPE_F32;
- default:
- return true;
- }
+ void wsp_ggml_backend_dev_memory(wsp_ggml_backend_dev_t device, size_t * free, size_t * total) {
+ device->iface.get_memory(device, free, total);
+ }

- WSP_GGML_UNUSED(dev);
+ enum wsp_ggml_backend_dev_type wsp_ggml_backend_dev_type(wsp_ggml_backend_dev_t device) {
+ return device->iface.get_type(device);
 }

- static bool wsp_ggml_backend_cpu_device_supports_buft(wsp_ggml_backend_dev_t dev, wsp_ggml_backend_buffer_type_t buft) {
- return wsp_ggml_backend_buft_is_host(buft);
+ void wsp_ggml_backend_dev_get_props(wsp_ggml_backend_dev_t device, struct wsp_ggml_backend_dev_props * props) {
+ memset(props, 0, sizeof(*props));
+ device->iface.get_props(device, props);
+ }

- WSP_GGML_UNUSED(dev);
+ wsp_ggml_backend_reg_t wsp_ggml_backend_dev_backend_reg(wsp_ggml_backend_dev_t device) {
+ return device->reg;
 }

- static const struct wsp_ggml_backend_device_i wsp_ggml_backend_cpu_device_i = {
- /* .get_name = */ wsp_ggml_backend_cpu_device_get_name,
- /* .get_description = */ wsp_ggml_backend_cpu_device_get_description,
- /* .get_memory = */ wsp_ggml_backend_cpu_device_get_memory,
- /* .get_type = */ wsp_ggml_backend_cpu_device_get_type,
- /* .get_props = */ wsp_ggml_backend_cpu_device_get_props,
- /* .init_backend = */ wsp_ggml_backend_cpu_device_init,
- /* .get_buffer_type = */ wsp_ggml_backend_cpu_device_get_buffer_type,
- /* .get_host_buffer_type = */ NULL,
- /* .buffer_from_host_ptr = */ wsp_ggml_backend_cpu_device_buffer_from_ptr,
- /* .supports_op = */ wsp_ggml_backend_cpu_device_supports_op,
- /* .supports_buft = */ wsp_ggml_backend_cpu_device_supports_buft,
- /* .offload_op = */ NULL,
- /* .event_new = */ NULL,
- /* .event_free = */ NULL,
- /* .event_synchronize = */ NULL,
- };
+ wsp_ggml_backend_t wsp_ggml_backend_dev_init(wsp_ggml_backend_dev_t device, const char * params) {
+ return device->iface.init_backend(device, params);
+ }

- ////////////////////////
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_buffer_type(wsp_ggml_backend_dev_t device) {
+ return device->iface.get_buffer_type(device);
+ }

- static const char * wsp_ggml_backend_cpu_reg_get_name(wsp_ggml_backend_reg_t reg) {
- return "CPU";
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_host_buffer_type(wsp_ggml_backend_dev_t device) {
+ if (device->iface.get_host_buffer_type == NULL) {
+ return NULL;
+ }

- WSP_GGML_UNUSED(reg);
+ return device->iface.get_host_buffer_type(device);
 }

- static size_t wsp_ggml_backend_cpu_reg_get_device_count(wsp_ggml_backend_reg_t reg) {
- return 1;
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_dev_buffer_from_host_ptr(wsp_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
+ }

- WSP_GGML_UNUSED(reg);
+ bool wsp_ggml_backend_dev_supports_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
+ return device->iface.supports_op(device, op);
 }

- static wsp_ggml_backend_dev_t wsp_ggml_backend_cpu_reg_get_device(wsp_ggml_backend_reg_t reg, size_t index) {
- WSP_GGML_ASSERT(index == 0);
+ bool wsp_ggml_backend_dev_supports_buft(wsp_ggml_backend_dev_t device, wsp_ggml_backend_buffer_type_t buft) {
+ return device->iface.supports_buft(device, buft);
+ }

- static wsp_ggml_backend_cpu_device_context ctx;
- static wsp_ggml_backend_device wsp_ggml_backend_cpu_device = {
- /* .iface = */ wsp_ggml_backend_cpu_device_i,
- /* .reg = */ reg,
- /* .context = */ &ctx,
- };
+ bool wsp_ggml_backend_dev_offload_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
+ if (device->iface.offload_op != NULL) {
+ return device->iface.offload_op(device, op);
+ }

- return &wsp_ggml_backend_cpu_device;
+ return false;
 }

- static void * wsp_ggml_backend_cpu_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
- if (strcmp(name, "wsp_ggml_backend_set_n_threads") == 0) {
- return (void *)wsp_ggml_backend_cpu_set_n_threads;
- }
- return NULL;
+ // Backend (reg)

- WSP_GGML_UNUSED(reg);
+ const char * wsp_ggml_backend_reg_name(wsp_ggml_backend_reg_t reg) {
+ return reg->iface.get_name(reg);
 }

- static const struct wsp_ggml_backend_reg_i wsp_ggml_backend_cpu_reg_i = {
- /* .get_name = */ wsp_ggml_backend_cpu_reg_get_name,
- /* .get_device_count = */ wsp_ggml_backend_cpu_reg_get_device_count,
- /* .get_device = */ wsp_ggml_backend_cpu_reg_get_device,
- /* .get_proc_address = */ wsp_ggml_backend_cpu_get_proc_address,
- };
+ size_t wsp_ggml_backend_reg_dev_count(wsp_ggml_backend_reg_t reg) {
+ return reg->iface.get_device_count(reg);
+ }

- wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void) {
- static struct wsp_ggml_backend_reg wsp_ggml_backend_cpu_reg = {
- /* .iface = */ wsp_ggml_backend_cpu_reg_i,
- /* .context = */ NULL,
- };
+ wsp_ggml_backend_dev_t wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_reg_t reg, size_t index) {
+ return reg->iface.get_device(reg, index);
+ }

- return &wsp_ggml_backend_cpu_reg;
+ void * wsp_ggml_backend_reg_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
+ if (!reg->iface.get_proc_address) {
+ return NULL;
+ }
+ return reg->iface.get_proc_address(reg, name);
 }

 // multi-buffer buffer
@@ -1318,12 +536,6 @@ struct wsp_ggml_backend_multi_buffer_context {
 size_t n_buffers;
 };

- static const char * wsp_ggml_backend_multi_buffer_get_name(wsp_ggml_backend_buffer_t buffer) {
- wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
-
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
- }
-
 static void wsp_ggml_backend_multi_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
 wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
 for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1342,7 +554,6 @@ static void wsp_ggml_backend_multi_buffer_clear(wsp_ggml_backend_buffer_t buffer
 }

 static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_multi_buffer_i = {
- /* .get_name = */ wsp_ggml_backend_multi_buffer_get_name,
 /* .free_buffer = */ wsp_ggml_backend_multi_buffer_free_buffer,
 /* .get_base = */ NULL,
 /* .init_tensor = */ NULL,
@@ -1371,7 +582,7 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_multi_buffer_alloc_buffer(wsp_ggml_ba
 }

 bool wsp_ggml_backend_buffer_is_multi_buffer(wsp_ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == wsp_ggml_backend_multi_buffer_get_name;
+ return buffer->iface.free_buffer == wsp_ggml_backend_multi_buffer_free_buffer;
 }

 void wsp_ggml_backend_multi_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
@@ -1463,7 +674,9 @@ struct wsp_ggml_backend_sched {
 char * context_buffer;
 size_t context_buffer_size;

- bool debug;
+ bool op_offload;
+
+ int debug;
 };

 #define hash_id(tensor) wsp_ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1482,7 +695,7 @@ static int wsp_ggml_backend_sched_backend_id(wsp_ggml_backend_sched_t sched, wsp
 }

 static int wsp_ggml_backend_sched_backend_from_buffer(wsp_ggml_backend_sched_t sched, const struct wsp_ggml_tensor * tensor, const struct wsp_ggml_tensor * op) {
- wsp_ggml_backend_buffer_t buffer = tensor->buffer;
+ wsp_ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 if (buffer == NULL) {
 return -1;
 }
@@ -1515,8 +728,6 @@ static char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*16 + WSP_GGML_SCHED_MAX_SPLITS_DE

 // returns the backend that should be used for the node based on the current locations
 static int wsp_ggml_backend_sched_backend_id_from_cur(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * tensor) {
- // TODO: use supports_op to check if the backend supports the op
-
 // assign pre-allocated nodes to their backend
 int cur_backend_id = wsp_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
 if (cur_backend_id != -1) {
@@ -1535,7 +746,8 @@ static int wsp_ggml_backend_sched_backend_id_from_cur(wsp_ggml_backend_sched_t s

 if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
 // since the tensor is pre-allocated, it cannot be moved to another backend
- WSP_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+ wsp_ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+ WSP_GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, wsp_ggml_backend_buffer_name(buffer), wsp_ggml_op_name(tensor->op));
 }

 // graph input
@@ -1551,10 +763,12 @@ static int wsp_ggml_backend_sched_backend_id_from_cur(wsp_ggml_backend_sched_t s
 if (src == NULL) {
 continue;
 }
- if (src->buffer != NULL && src->buffer->usage == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+ // not an ideal solution
+ if (tensor->op != WSP_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
 int src_backend_id = wsp_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
 // check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1) {
+ if (sched->op_offload && src_backend_id == sched->n_backends - 1 && wsp_ggml_backend_buffer_is_host(src->buffer)) {
 for (int b = 0; b < src_backend_id; b++) {
 if (wsp_ggml_backend_supports_op(sched->backends[b], tensor) && wsp_ggml_backend_offload_op(sched->backends[b], tensor)) {
 SET_CAUSE(tensor, "1.off");
@@ -1585,9 +799,12 @@ static void wsp_ggml_backend_sched_print_assignments(wsp_ggml_backend_sched_t sc
 for (int i = 0; i < graph->n_nodes; i++) {
 if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
 wsp_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- WSP_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, wsp_ggml_backend_name(split_backend),
+ WSP_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, wsp_ggml_backend_name(split_backend),
 sched->splits[cur_split].n_inputs);
 for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+ if (j == 0) {
+ WSP_GGML_LOG_DEBUG(": ");
+ }
 WSP_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
 fmt_size(wsp_ggml_nbytes(sched->splits[cur_split].inputs[j])));
 }
@@ -1598,19 +815,21 @@ static void wsp_ggml_backend_sched_print_assignments(wsp_ggml_backend_sched_t sc
 if (wsp_ggml_is_view_op(node->op)) {
 continue;
 }
- wsp_ggml_backend_t tensor_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, node);
- WSP_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name,
- fmt_size(wsp_ggml_nbytes(node)), tensor_backend ? wsp_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
- for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
- struct wsp_ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
+ if (sched->debug > 1) {
+ wsp_ggml_backend_t tensor_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, node);
+ WSP_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name,
+ fmt_size(wsp_ggml_nbytes(node)), tensor_backend ? wsp_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
+ struct wsp_ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ wsp_ggml_backend_t src_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, src);
+ WSP_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+ fmt_size(wsp_ggml_nbytes(src)), src_backend ? wsp_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
 }
- wsp_ggml_backend_t src_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, src);
- WSP_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
- fmt_size(wsp_ggml_nbytes(src)), src_backend ? wsp_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+ WSP_GGML_LOG_DEBUG("\n");
 }
- WSP_GGML_LOG_DEBUG("\n");
 }
 }

@@ -1892,7 +1111,7 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s

 const int node_backend_id = tensor_backend_id(node);

- assert(node_backend_id != -1); // all nodes should be assigned by now
+ assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

 // check if we should start a new split based on the sources of the current node
 bool need_new_split = false;
@@ -1902,11 +1121,11 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s
 if (src == NULL) {
 continue;
 }
- // check if a weight is on a different backend
+ // check if a weight is on a different and incompatible backend
 // by starting a new split, the memory of the previously offloaded weights can be reused
 if (src->buffer != NULL && src->buffer->usage == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
 int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != cur_backend_id) {
+ if (src_backend_id != cur_backend_id && !wsp_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
 need_new_split = true;
 break;
 }
@@ -1918,7 +1137,6 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s
 int src_backend_id = sched->hv_tensor_backend_ids[id];
 bool supported = wsp_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
 if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
 need_new_split = true;
 break;
 }
@@ -2122,7 +1340,10 @@ static bool wsp_ggml_backend_sched_alloc_splits(wsp_ggml_backend_sched_t sched)
 // allocate graph
 if (backend_ids_changed || !wsp_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
 // the re-allocation may cause the split inputs to be moved to a different address
- wsp_ggml_backend_sched_synchronize(sched);
+ // synchronize without wsp_ggml_backend_sched_synchronize to avoid changing cur_copy
+ for (int i = 0; i < sched->n_backends; i++) {
+ wsp_ggml_backend_synchronize(sched->backends[i]);
+ }
 #ifndef NDEBUG
 WSP_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -2236,14 +1457,16 @@ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(
 wsp_ggml_backend_buffer_type_t * bufts,
 int n_backends,
 size_t graph_size,
- bool parallel) {
+ bool parallel,
+ bool op_offload) {
 WSP_GGML_ASSERT(n_backends > 0);
 WSP_GGML_ASSERT(n_backends <= WSP_GGML_SCHED_MAX_BACKENDS);
- WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+ WSP_GGML_ASSERT(wsp_ggml_backend_dev_type(wsp_ggml_backend_get_device(backends[n_backends - 1])) == WSP_GGML_BACKEND_DEVICE_TYPE_CPU);

 struct wsp_ggml_backend_sched * sched = (wsp_ggml_backend_sched *) calloc(1, sizeof(struct wsp_ggml_backend_sched));

- sched->debug = getenv("WSP_GGML_SCHED_DEBUG") != NULL;
+ const char * WSP_GGML_SCHED_DEBUG = getenv("WSP_GGML_SCHED_DEBUG");
+ sched->debug = WSP_GGML_SCHED_DEBUG ? atoi(WSP_GGML_SCHED_DEBUG) : 0;
 sched->n_backends = n_backends;
 sched->n_copies = parallel ? WSP_GGML_SCHED_MAX_COPIES : 1;

@@ -2280,6 +1503,7 @@ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(
 }

 sched->galloc = wsp_ggml_gallocr_new_n(sched->bufts, n_backends);
+ sched->op_offload = op_offload;

 wsp_ggml_backend_sched_reset(sched);

@@ -2327,12 +1551,13 @@ bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_g

 wsp_ggml_backend_sched_split_graph(sched, measure_graph);

+ wsp_ggml_backend_sched_synchronize(sched);
+
 if (!wsp_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
 return false;
 }

 wsp_ggml_backend_sched_reset(sched);
- wsp_ggml_backend_sched_synchronize(sched);

 return true;
 }
@@ -2342,7 +1567,6 @@ bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct w

 wsp_ggml_backend_sched_split_graph(sched, graph);

-
 if (!wsp_ggml_backend_sched_alloc_splits(sched)) {
 return false;
 }
@@ -2376,6 +1600,12 @@ void wsp_ggml_backend_sched_synchronize(wsp_ggml_backend_sched_t sched) {
 for (int i = 0; i < sched->n_backends; i++) {
 wsp_ggml_backend_synchronize(sched->backends[i]);
 }
+ if (!sched->is_alloc) {
+ // if the graph is not already allocated, always use copy 0 after a synchronization
+ // this ensures that during generation the same copy is used every time,
+ // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+ sched->cur_copy = 0;
+ }
 }

 void wsp_ggml_backend_sched_set_eval_callback(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_sched_eval_callback callback, void * user_data) {
@@ -2425,7 +1655,7 @@ wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sc

 // utils

- void wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor) {
+ enum wsp_ggml_status wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor) {
 WSP_GGML_ASSERT(tensor->buffer == NULL);
 WSP_GGML_ASSERT(tensor->view_src != NULL);
 WSP_GGML_ASSERT(tensor->view_src->buffer != NULL);
@@ -2433,10 +1663,10 @@ void wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor) {

 tensor->buffer = tensor->view_src->buffer;
 tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
- wsp_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
+ return wsp_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }

- void wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr) {
+ enum wsp_ggml_status wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr) {
 WSP_GGML_ASSERT(tensor->buffer == NULL);
 WSP_GGML_ASSERT(tensor->data == NULL);
 WSP_GGML_ASSERT(tensor->view_src == NULL);
@@ -2446,7 +1676,7 @@ void wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_

 tensor->buffer = buffer;
 tensor->data = addr;
- wsp_ggml_backend_buffer_init_tensor(buffer, tensor);
+ return wsp_ggml_backend_buffer_init_tensor(buffer, tensor);
 }

 static struct wsp_ggml_tensor * graph_copy_dup_tensor(struct wsp_ggml_hash_set hash_set, struct wsp_ggml_tensor ** node_copies,
@@ -2492,7 +1722,8 @@ static void graph_copy_init_tensor(struct wsp_ggml_hash_set * hash_set, struct w
 struct wsp_ggml_tensor * dst = node_copies[id];
 if (dst->view_src != NULL) {
 graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
- wsp_ggml_backend_view_init(dst);
+ enum wsp_ggml_status status = wsp_ggml_backend_view_init(dst);
+ WSP_GGML_ASSERT(status == WSP_GGML_STATUS_SUCCESS);
 }
 else {
 wsp_ggml_backend_tensor_copy(src, dst);
@@ -2607,7 +1838,6 @@ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggm
 assert(g1->n_nodes == g2->n_nodes);

 for (int i = 0; i < g1->n_nodes; i++) {
- //printf("eval %d/%d\n", i, g1->n_nodes);
 struct wsp_ggml_tensor * t1 = g1->nodes[i];
 struct wsp_ggml_tensor * t2 = g2->nodes[i];

@@ -2633,3 +1863,154 @@ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggm

 return true;
 }
+
+ // CPU backend - buffer
+
+ static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
+ uintptr_t data = (uintptr_t)buffer->context;
+
+ // align the buffer
+ if (data % TENSOR_ALIGNMENT != 0) {
+ data = WSP_GGML_PAD(data, TENSOR_ALIGNMENT);
+ }
+
+ return (void *)data;
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
+ wsp_ggml_aligned_free(buffer->context, buffer->size);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_memset_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ memset((char *)tensor->data + offset, value, size);
+
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ memcpy((char *)tensor->data + offset, data, size);
+
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
+ if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
+ memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
+ return true;
+ }
+ return false;
+
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
+ static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_cpu_buffer_i = {
+ /* .free_buffer = */ wsp_ggml_backend_cpu_buffer_free_buffer,
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ wsp_ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_cpu_buffer_from_ptr_i = {
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ wsp_ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ // CPU backend buffer type
+
+ // this buffer type is defined here to make it available to all backends
+
+ static const char * wsp_ggml_backend_cpu_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
+ return "CPU";
+
+ WSP_GGML_UNUSED(buft);
+ }
+
+ static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
+ void * data = wsp_ggml_aligned_malloc(size);
+
+ if (data == NULL) {
+ WSP_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+ return NULL;
+ }
+
+ return wsp_ggml_backend_buffer_init(buft, wsp_ggml_backend_cpu_buffer_i, data, size);
+ }
+
+ static size_t wsp_ggml_backend_cpu_buffer_type_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
+ return TENSOR_ALIGNMENT;
+
+ WSP_GGML_UNUSED(buft);
+ }
+
+ static bool wsp_ggml_backend_cpu_buffer_type_is_host(wsp_ggml_backend_buffer_type_t buft) {
+ return true;
+
+ WSP_GGML_UNUSED(buft);
+ }
+
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_buffer_type(void) {
+ static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_type_get_name,
+ /* .alloc_buffer = */ wsp_ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
+ /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &wsp_ggml_backend_cpu_buffer_type;
+ }
+
+ static const char * wsp_ggml_backend_cpu_buffer_from_ptr_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
+ return "CPU_Mapped";
+
+ WSP_GGML_UNUSED(buft);
+ }
+
+ static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_buffer_from_ptr_type(void) {
+ static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_from_ptr_type_get_name,
+ /* .alloc_buffer = */ wsp_ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
+ /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &wsp_ggml_backend_cpu_buffer_type;
+ }
+
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ WSP_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+ return wsp_ggml_backend_buffer_init(wsp_ggml_backend_cpu_buffer_from_ptr_type(), wsp_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+ }