whisper.rn 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +264 -126
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +13 -5
  6. package/cpp/ggml-backend.cpp +207 -17
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  9. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  10. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  11. package/cpp/ggml-cpu/common.h +14 -0
  12. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  13. package/cpp/ggml-cpu/ggml-cpu.c +48 -41
  14. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  15. package/cpp/ggml-cpu/ops.cpp +518 -767
  16. package/cpp/ggml-cpu/ops.h +2 -0
  17. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  18. package/cpp/ggml-cpu/vec.cpp +161 -20
  19. package/cpp/ggml-cpu/vec.h +400 -51
  20. package/cpp/ggml-cpu.h +1 -1
  21. package/cpp/ggml-impl.h +43 -10
  22. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  23. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  24. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  25. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  26. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  27. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  28. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  29. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  30. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  31. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  32. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  33. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  34. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  35. package/cpp/ggml-metal-impl.h +40 -40
  36. package/cpp/ggml-metal.h +1 -6
  37. package/cpp/ggml-quants.c +1 -0
  38. package/cpp/ggml.c +175 -13
  39. package/cpp/ggml.h +84 -5
  40. package/cpp/jsi/RNWhisperJSI.cpp +2 -0
  41. package/cpp/jsi/ThreadPool.h +3 -3
  42. package/cpp/whisper.cpp +85 -70
  43. package/cpp/whisper.h +1 -0
  44. package/ios/CMakeLists.txt +6 -1
  45. package/ios/RNWhisperVadContext.mm +14 -13
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  50. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  59. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  84. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  86. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  87. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  92. package/lib/commonjs/version.json +1 -1
  93. package/lib/module/version.json +1 -1
  94. package/package.json +1 -1
  95. package/src/version.json +1 -1
  96. package/whisper-rn.podspec +8 -9
  97. package/cpp/ggml-metal.m +0 -6779
  98. package/cpp/ggml-whisper-sim.metallib +0 -0
  99. package/cpp/ggml-whisper.metallib +0 -0
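Reading note (not part of the published diff): the largest single change in this release is the rework of the ggml graph allocator in package/cpp/ggml-alloc.c, shown below. Tensors are no longer addressed by a single offset into one backend buffer; they are addressed by a (chunk, offset) pair so one logical allocation can be split across multiple backend buffers. The following is a minimal, self-contained sketch of that addressing idea only, not code from the package; the names vbuffer_model, chunk_base and resolve are invented for this illustration.

/*
 * Illustrative sketch only -- not part of the package. It models the
 * (chunk, offset) addressing scheme introduced in ggml-alloc.c below;
 * vbuffer_model, chunk_base and resolve are invented names.
 */
#include <stddef.h>
#include <stdio.h>

#define MAX_CHUNKS 16

struct buffer_address {            /* mirrors the struct added in ggml-alloc.c */
    int    chunk;                  /* index of a backend buffer (chunk)        */
    size_t offset;                 /* offset within that chunk                 */
};

struct vbuffer_model {             /* hypothetical stand-in for the real vbuffer */
    void * chunk_base[MAX_CHUNKS]; /* base pointer of each allocated chunk       */
};

/* resolve() is an invented helper; the package does the equivalent inside
 * wsp_ggml_vbuffer_tensor_alloc() by adding the offset to the chunk's base. */
static void * resolve(const struct vbuffer_model * buf, struct buffer_address addr) {
    return (char *)buf->chunk_base[addr.chunk] + addr.offset;
}

int main(void) {
    static char chunk0[1024], chunk1[1024];
    struct vbuffer_model buf = { .chunk_base = { chunk0, chunk1 } };
    struct buffer_address a  = { .chunk = 1, .offset = 128 };

    /* a tensor placed at {chunk=1, offset=128} resolves to chunk1 + 128 */
    printf("resolved pointer: %p\n", resolve(&buf, a));
    return 0;
}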
package/cpp/ggml-alloc.c CHANGED
@@ -23,7 +23,7 @@ static bool wsp_ggml_is_view(const struct wsp_ggml_tensor * t) {
  }

  // ops that return true for this function must not use restrict pointers for their backend implementations
- static bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
+ bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
  switch (op) {
  case WSP_GGML_OP_SCALE:
  case WSP_GGML_OP_DIAG_MASK_ZERO:
@@ -95,39 +95,104 @@ enum wsp_ggml_status wsp_ggml_tallocr_alloc(struct wsp_ggml_tallocr * talloc, st

  // dynamic tensor allocator

+ #define WSP_GGML_VBUFFER_MAX_CHUNKS 16
+
+ // relative memory address within an allocation that can be split into multiple buffers (chunks)
+ struct buffer_address {
+ int chunk; // index of a backend buffer
+ size_t offset; // local memory offset within the buffer
+ };
+
+ static const struct buffer_address WSP_GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
+
+ static bool wsp_ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
+ return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
+ }
+
  struct free_block {
  size_t offset;
  size_t size;
  };

- struct wsp_ggml_dyn_tallocr {
- size_t alignment;
- int n_free_blocks;
+ struct tallocr_chunk {
  struct free_block free_blocks[MAX_FREE_BLOCKS];
+ int n_free_blocks;
  size_t max_size;
+ };
+
+ struct wsp_ggml_dyn_tallocr {
+ size_t alignment;
+ size_t max_chunk_size;
+ struct tallocr_chunk * chunks[WSP_GGML_VBUFFER_MAX_CHUNKS];
+ int n_chunks;

  #ifdef WSP_GGML_ALLOCATOR_DEBUG
  struct {
  const struct wsp_ggml_tensor * tensor;
- size_t offset;
+ struct buffer_address addr;
  } allocated_tensors[1024];
  #endif
  };

+ static void wsp_ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+ // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+ int insert_pos = 0;
+ while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
+ insert_pos++;
+ }
+ // shift all blocks from insert_pos onward to make room for the new block
+ for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
+ chunk->free_blocks[i] = chunk->free_blocks[i-1];
+ }
+ // insert the new block
+ chunk->free_blocks[insert_pos].offset = offset;
+ chunk->free_blocks[insert_pos].size = size;
+ chunk->n_free_blocks++;
+ }
+
+ static void wsp_ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
+ // shift all elements after idx by 1 to the left, overwriting the element at idx
+ for (int i = idx; i < chunk->n_free_blocks; i++) {
+ chunk->free_blocks[i] = chunk->free_blocks[i+1];
+ }
+ chunk->n_free_blocks--;
+ }
+
+ static int wsp_ggml_dyn_tallocr_new_chunk(struct wsp_ggml_dyn_tallocr * alloc, size_t min_size) {
+ if (alloc->n_chunks >= WSP_GGML_VBUFFER_MAX_CHUNKS) {
+ return -1;
+ }
+ struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
+ chunk->n_free_blocks = 1;
+ chunk->free_blocks[0].offset = 0;
+ // available space in a chunk is limited to max_chunk_size, but can be higher if:
+ // 1. a single tensor exceeds the maximum, and cannot fit any other way
+ // 2. we are running out of chunks
+ // backends will either manage to allocate the larger size, or report an error.
+ chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
+ if (alloc->n_chunks == WSP_GGML_VBUFFER_MAX_CHUNKS - 1) {
+ chunk->free_blocks[0].size = SIZE_MAX/2;
+ }
+ alloc->chunks[alloc->n_chunks] = chunk;
+ alloc->n_chunks++;
+ return alloc->n_chunks - 1;
+ }
+
  #ifdef WSP_GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t offset, const struct wsp_ggml_tensor * tensor) {
+ static void add_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct wsp_ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i].tensor == NULL) {
  alloc->allocated_tensors[i].tensor = tensor;
- alloc->allocated_tensors[i].offset = offset;
+ alloc->allocated_tensors[i].addr = addr;
  return;
  }
  }
  WSP_GGML_ABORT("out of allocated_tensors");
  }
- static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t offset, const struct wsp_ggml_tensor * tensor) {
+ static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct wsp_ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i].offset == offset) {
+ if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
  alloc->allocated_tensors[i].tensor = NULL;
  return;
  }
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t
  }
  #endif

- static size_t wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, size_t size, const struct wsp_ggml_tensor * tensor) {
+ static struct buffer_address wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, size_t size, const struct wsp_ggml_tensor * tensor) {
  size = aligned_offset(NULL, size, alloc->alignment);

  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

+ int best_fit_chunk = -1;
+ int best_fit_block = -1;
  size_t max_avail = 0;

- // find the best fitting free block besides the last block
- int best_fit_block = -1;
- size_t best_fit_size = SIZE_MAX;
- for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
- struct free_block * block = &alloc->free_blocks[i];
- max_avail = MAX(max_avail, block->size);
- if (block->size >= size && block->size <= best_fit_size) {
- best_fit_block = i;
- best_fit_size = block->size;
+ // find the best fitting free block besides the last block, within any chunk
+ for (int c = 0; c < alloc->n_chunks; ++c) {
+ struct tallocr_chunk * chunk = alloc->chunks[c];
+ size_t best_fit_size = SIZE_MAX;
+ for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
+ struct free_block * block = &chunk->free_blocks[i];
+ max_avail = MAX(max_avail, block->size);
+ if (block->size >= size && block->size <= best_fit_size) {
+ best_fit_chunk = c;
+ best_fit_block = i;
+ best_fit_size = block->size;
+ }
  }
  }

  if (best_fit_block == -1) {
- // the last block is our last resort
- struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
- max_avail = MAX(max_avail, block->size);
- if (block->size >= size) {
- best_fit_block = alloc->n_free_blocks - 1;
- } else {
- // this should never happen
- WSP_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
- __func__, size, max_avail);
- WSP_GGML_ABORT("not enough space in the buffer");
+ // no suitable block found, try the last block (this will grow a chunks size)
+ for (int c = 0; c < alloc->n_chunks; ++c) {
+ struct tallocr_chunk * chunk = alloc->chunks[c];
+ if (chunk->n_free_blocks > 0) {
+ struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
+ max_avail = MAX(max_avail, block->size);
+ if (block->size >= size) {
+ best_fit_chunk = c;
+ best_fit_block = chunk->n_free_blocks - 1;
+ break;
+ }
+ }
  }
  }

- struct free_block * block = &alloc->free_blocks[best_fit_block];
- size_t offset = block->offset;
- block->offset = offset + size;
+ if (best_fit_block == -1) {
+ // none of the existing chunks have enough space left
+ best_fit_chunk = wsp_ggml_dyn_tallocr_new_chunk(alloc, size);
+ best_fit_block = 0;
+ }
+ if (best_fit_chunk == -1) {
+ // since the last chunk always has virtually endless memory, this should never happen
+ WSP_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ __func__, size, max_avail);
+ WSP_GGML_ABORT("graph allocation: failed to reserve memory");
+ }
+
+ struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
+ struct free_block * block = &chunk->free_blocks[best_fit_block];
+ struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
+ block->offset += size;
  block->size -= size;
  if (block->size == 0) {
  // remove block if empty
- alloc->n_free_blocks--;
- for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
- }
+ wsp_ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
  }

- AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
+ AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);

  #ifdef WSP_GGML_ALLOCATOR_DEBUG
- add_allocated_tensor(alloc, offset, tensor);
- size_t cur_max = offset + size;
- if (cur_max > alloc->max_size) {
- // sort allocated_tensors by offset
+ add_allocated_tensor(alloc, addr, tensor);
+ size_t cur_max = addr.offset + size;
+ if (cur_max > alloc->max_size[addr.chunk]) {
+ // sort allocated_tensors by chunk/offset
  for (int i = 0; i < 1024; i++) {
  for (int j = i + 1; j < 1024; j++) {
- if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+ if (wsp_ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
  const struct wsp_ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
- size_t tmp_offset = alloc->allocated_tensors[i].offset;
+ struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
  alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
- alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+ alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
  alloc->allocated_tensors[j].tensor = tmp_tensor;
- alloc->allocated_tensors[j].offset = tmp_offset;
+ alloc->allocated_tensors[j].addr = tmp_addr;
  }
  }
  }
- WSP_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ WSP_GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i].tensor) {
- WSP_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
- alloc->allocated_tensors[i].offset,
- alloc->allocated_tensors[i].offset + wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor),
+ WSP_GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ alloc->allocated_tensors[i].addr.chunk,
+ alloc->allocated_tensors[i].addr.offset,
+ alloc->allocated_tensors[i].addr.offset + wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor),
  wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
  }
  }
@@ -213,78 +296,69 @@ static size_t wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, si
  }
  #endif

- alloc->max_size = MAX(alloc->max_size, offset + size);
+ chunk->max_size = MAX(chunk->max_size, addr.offset + size);

- return offset;
+ return addr;

  WSP_GGML_UNUSED(tensor);
  }

  // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void wsp_ggml_dyn_tallocr_free_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct wsp_ggml_tensor * tensor) {
+ static void wsp_ggml_dyn_tallocr_free_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct wsp_ggml_tensor * tensor) {
  size = aligned_offset(NULL, size, alloc->alignment);

- AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
+ AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+ __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);

  #ifdef WSP_GGML_ALLOCATOR_DEBUG
- remove_allocated_tensor(alloc, offset, tensor);
+ remove_allocated_tensor(alloc, addr, tensor);
  #endif

+ struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
+
  // see if we can merge with an existing block
- for (int i = 0; i < alloc->n_free_blocks; i++) {
- struct free_block * block = &alloc->free_blocks[i];
+ for (int i = 0; i < chunk->n_free_blocks; i++) {
+ struct free_block * block = &chunk->free_blocks[i];
  // check if ptr is at the end of the block
- if (block->offset + block->size == offset) {
+ if (block->offset + block->size == addr.offset) {
  block->size += size;
  // check if we can merge with the next block
- if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
- block->size += alloc->free_blocks[i+1].size;
- alloc->n_free_blocks--;
- for (int j = i+1; j < alloc->n_free_blocks; j++) {
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
+ if (i < chunk->n_free_blocks - 1) {
+ struct free_block * next = &chunk->free_blocks[i+1];
+ if (block->offset + block->size == next->offset) {
+ block->size += next->size;
+ wsp_ggml_dyn_tallocr_remove_block(chunk, i+1);
  }
  }
  return;
  }
  // check if ptr is at the beginning of the block
- if (offset + size == block->offset) {
- block->offset = offset;
+ if (addr.offset + size == block->offset) {
+ block->offset = addr.offset;
  block->size += size;
  // check if we can merge with the previous block
- if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
- alloc->free_blocks[i-1].size += block->size;
- alloc->n_free_blocks--;
- for (int j = i; j < alloc->n_free_blocks; j++) {
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
+ if (i > 0) {
+ struct free_block * prev = &chunk->free_blocks[i-1];
+ if (prev->offset + prev->size == block->offset) {
+ prev->size += block->size;
+ wsp_ggml_dyn_tallocr_remove_block(chunk, i);
  }
  }
  return;
  }
  }
  // otherwise, add a new block
- WSP_GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
- // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
- int insert_pos = 0;
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
- insert_pos++;
- }
- // shift all blocks from insert_pos onward to make room for the new block
- for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
- alloc->free_blocks[i] = alloc->free_blocks[i-1];
- }
- // insert the new block
- alloc->free_blocks[insert_pos].offset = offset;
- alloc->free_blocks[insert_pos].size = size;
- alloc->n_free_blocks++;
+ wsp_ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);

  WSP_GGML_UNUSED(tensor);
  }

  static void wsp_ggml_dyn_tallocr_reset(struct wsp_ggml_dyn_tallocr * alloc) {
- alloc->n_free_blocks = 1;
- alloc->free_blocks[0].offset = 0;
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
- alloc->max_size = 0;
+ for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS; i++) {
+ free(alloc->chunks[i]);
+ alloc->chunks[i] = NULL;
+ }
+ alloc->n_chunks = 0;

  #ifdef WSP_GGML_ALLOCATOR_DEBUG
  for (int i = 0; i < 1024; i++) {
@@ -293,14 +367,14 @@ static void wsp_ggml_dyn_tallocr_reset(struct wsp_ggml_dyn_tallocr * alloc) {
  #endif
  }

- static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment) {
+ static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
  struct wsp_ggml_dyn_tallocr * alloc = (struct wsp_ggml_dyn_tallocr *)malloc(sizeof(struct wsp_ggml_dyn_tallocr));

  *alloc = (struct wsp_ggml_dyn_tallocr) {
- /*.alignment = */ alignment,
- /*.n_free_blocks = */ 0,
- /*.free_blocks = */ {{0}},
- /*.max_size = */ 0,
+ /*.alignment = */ alignment,
+ /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
+ /*.chunks = */ {NULL},
+ /*.n_chunks = */ 0,
  #ifdef WSP_GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ {{0}},
  #endif
@@ -312,11 +386,79 @@ static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment)
  }

  static void wsp_ggml_dyn_tallocr_free(struct wsp_ggml_dyn_tallocr * alloc) {
+ for (int i = 0; i < alloc->n_chunks; ++i) {
+ free(alloc->chunks[i]);
+ }
  free(alloc);
  }

  static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc) {
- return alloc->max_size;
+ size_t max_size = 0;
+ for (int i = 0; i < alloc->n_chunks; i++) {
+ max_size += alloc->chunks[i]->max_size;
+ }
+ return max_size;
+ }
+
+
+ // virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
+
+ struct vbuffer {
+ wsp_ggml_backend_buffer_t chunks[WSP_GGML_VBUFFER_MAX_CHUNKS];
+ };
+
+ static void wsp_ggml_vbuffer_free(struct vbuffer * buf) {
+ if (buf == NULL) {
+ return;
+ }
+ for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS; ++i) {
+ wsp_ggml_backend_buffer_free(buf->chunks[i]);
+ }
+ free(buf);
+ }
+
+ static int wsp_ggml_vbuffer_n_chunks(struct vbuffer * buf) {
+ int n = 0;
+ while (n < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
+ return n;
+ }
+
+ static size_t wsp_ggml_vbuffer_size(struct vbuffer * buf) {
+ size_t size = 0;
+ for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+ size += wsp_ggml_backend_buffer_get_size(buf->chunks[i]);
+ }
+ return size;
+ }
+
+ static struct vbuffer * wsp_ggml_vbuffer_alloc(wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_dyn_tallocr * talloc, enum wsp_ggml_backend_buffer_usage usage) {
+ struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
+ if (buf == NULL) {
+ return NULL;
+ }
+
+ for (int n = 0; n < talloc->n_chunks; n++) {
+ size_t chunk_size = talloc->chunks[n]->max_size;
+ buf->chunks[n] = wsp_ggml_backend_buft_alloc_buffer(buft, chunk_size);
+ if (buf->chunks[n] == NULL) {
+ wsp_ggml_vbuffer_free(buf);
+ return NULL;
+ }
+ wsp_ggml_backend_buffer_set_usage(buf->chunks[n], usage);
+ }
+ return buf;
+ }
+
+ static void wsp_ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct wsp_ggml_tensor * tensor, struct buffer_address buf_addr) {
+ void * base = wsp_ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
+ void * addr = (char *)base + buf_addr.offset;
+ wsp_ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
+ }
+
+ static void wsp_ggml_vbuffer_reset(struct vbuffer * buf) {
+ for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+ wsp_ggml_backend_buffer_reset(buf->chunks[i]);
+ }
  }

@@ -328,13 +470,13 @@ struct hash_node {
  int n_children;
  int n_views;
  int buffer_id;
- size_t offset; // offset within the buffer
+ struct buffer_address addr;
  bool allocated;
  };

  struct tensor_alloc {
  int buffer_id;
- size_t offset;
+ struct buffer_address addr;
  size_t size_max; // 0 = pre-allocated, unused, or view
  };

@@ -349,7 +491,7 @@ struct node_alloc {

  struct wsp_ggml_gallocr {
  wsp_ggml_backend_buffer_type_t * bufts; // [n_buffers]
- wsp_ggml_backend_buffer_t * buffers; // [n_buffers]
+ struct vbuffer ** buffers; // [n_buffers]
  struct wsp_ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
  int n_buffers;

@@ -370,7 +512,7 @@ wsp_ggml_gallocr_t wsp_ggml_gallocr_new_n(wsp_ggml_backend_buffer_type_t * bufts
  galloc->bufts = calloc(n_bufs, sizeof(wsp_ggml_backend_buffer_type_t));
  WSP_GGML_ASSERT(galloc->bufts != NULL);

- galloc->buffers = calloc(n_bufs, sizeof(wsp_ggml_backend_buffer_t));
+ galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
  WSP_GGML_ASSERT(galloc->buffers != NULL);

  galloc->buf_tallocs = calloc(n_bufs, sizeof(struct wsp_ggml_dyn_tallocr *));
@@ -390,7 +532,8 @@ wsp_ggml_gallocr_t wsp_ggml_gallocr_new_n(wsp_ggml_backend_buffer_type_t * bufts

  if (galloc->buf_tallocs[i] == NULL) {
  size_t alignment = wsp_ggml_backend_buft_get_alignment(bufts[i]);
- galloc->buf_tallocs[i] = wsp_ggml_dyn_tallocr_new(alignment);
+ size_t max_size = wsp_ggml_backend_buft_get_max_size(bufts[i]);
+ galloc->buf_tallocs[i] = wsp_ggml_dyn_tallocr_new(alignment, max_size);
  }
  }
  galloc->n_buffers = n_bufs;
@@ -418,7 +561,7 @@ void wsp_ggml_gallocr_free(wsp_ggml_gallocr_t galloc) {
  }
  }
  if (!freed) {
- wsp_ggml_backend_buffer_free(galloc->buffers[i]);
+ wsp_ggml_vbuffer_free(galloc->buffers[i]);
  }
  }
  if (galloc->buf_tallocs != NULL) {
@@ -467,7 +610,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp

  if (!wsp_ggml_gallocr_is_allocated(galloc, node) && !wsp_ggml_is_view(node)) {
  hn->allocated = true;
- assert(hn->offset == 0);
+ assert(hn->addr.offset == 0);

  // try to reuse a parent's buffer (inplace)
  if (wsp_ggml_op_can_inplace(node->op)) {
@@ -501,9 +644,9 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
  struct hash_node * view_src_hn = wsp_ggml_gallocr_hash_get(galloc, view_src);
  if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
- assert(view_src_hn->offset == p_hn->offset);
+ assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
  hn->buffer_id = p_hn->buffer_id;
- hn->offset = p_hn->offset;
+ hn->addr = p_hn->addr;
  p_hn->allocated = false; // avoid freeing the parent
  view_src_hn->allocated = false;
  return;
@@ -511,7 +654,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
  } else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
  hn->buffer_id = p_hn->buffer_id;
- hn->offset = p_hn->offset;
+ hn->addr = p_hn->addr;
  p_hn->allocated = false; // avoid freeing the parent
  return;
  }
@@ -522,9 +665,8 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
  struct wsp_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
  wsp_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
  size_t size = wsp_ggml_backend_buft_get_alloc_size(buft, node);
- size_t offset = wsp_ggml_dyn_tallocr_alloc(alloc, size, node);
  hn->buffer_id = buffer_id;
- hn->offset = offset;
+ hn->addr = wsp_ggml_dyn_tallocr_alloc(alloc, size, node);
  }
  }

@@ -536,12 +678,11 @@ static void wsp_ggml_gallocr_free_node(wsp_ggml_gallocr_t galloc, struct wsp_ggm
  }

  struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
- size_t offset = hn->offset;
  int buffer_id = hn->buffer_id;
  struct wsp_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
  wsp_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
  size_t size = wsp_ggml_backend_buft_get_alloc_size(buft, node);
- wsp_ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+ wsp_ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
  hn->allocated = false;
  }

@@ -692,24 +833,24 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
  if (node->view_src || node->data) {
  node_alloc->dst.buffer_id = -1;
- node_alloc->dst.offset = SIZE_MAX;
+ node_alloc->dst.addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
  node_alloc->dst.size_max = 0;
  } else {
  struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
  node_alloc->dst.buffer_id = hn->buffer_id;
- node_alloc->dst.offset = hn->offset;
+ node_alloc->dst.addr = hn->addr;
  node_alloc->dst.size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
  }
  for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
  struct wsp_ggml_tensor * src = node->src[j];
  if (!src || src->view_src || src->data) {
  node_alloc->src[j].buffer_id = -1;
- node_alloc->src[j].offset = SIZE_MAX;
+ node_alloc->src[j].addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
  node_alloc->src[j].size_max = 0;
  } else {
  struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, src);
  node_alloc->src[j].buffer_id = hn->buffer_id;
- node_alloc->src[j].offset = hn->offset;
+ node_alloc->src[j].addr = hn->addr;
  node_alloc->src[j].size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
  }
  }
@@ -725,11 +866,11 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
  struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, leaf);
  if (leaf->view_src || leaf->data) {
  galloc->leaf_allocs[i].leaf.buffer_id = -1;
- galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+ galloc->leaf_allocs[i].leaf.addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
  galloc->leaf_allocs[i].leaf.size_max = 0;
  } else {
  galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.addr = hn->addr;
  galloc->leaf_allocs[i].leaf.size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
  }
  }
@@ -744,7 +885,7 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
  }
  }

- size_t cur_size = galloc->buffers[i] ? wsp_ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+ size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
  size_t new_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@@ -753,13 +894,12 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
  WSP_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif

- wsp_ggml_backend_buffer_free(galloc->buffers[i]);
- galloc->buffers[i] = wsp_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+ wsp_ggml_vbuffer_free(galloc->buffers[i]);
+ galloc->buffers[i] = wsp_ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], WSP_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
  if (galloc->buffers[i] == NULL) {
  WSP_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), new_size);
  return false;
  }
- wsp_ggml_backend_buffer_set_usage(galloc->buffers[i], WSP_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
  }
  }

@@ -772,11 +912,11 @@ bool wsp_ggml_gallocr_reserve(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph

  static void wsp_ggml_gallocr_init_tensor(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
  int buffer_id = tensor_alloc->buffer_id;
- assert(tensor->data || tensor->view_src || wsp_ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+ assert(tensor->data || tensor->view_src || wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);

  if (tensor->view_src != NULL) {
  if (tensor->buffer == NULL) {
- assert(tensor_alloc->offset == SIZE_MAX);
+ assert(tensor_alloc->addr.offset == SIZE_MAX);
  if (tensor->view_src->buffer == NULL) {
  // this tensor was allocated without ggml-backend
  return;
@@ -785,11 +925,9 @@ static void wsp_ggml_gallocr_init_tensor(wsp_ggml_gallocr_t galloc, struct wsp_g
  }
  } else {
  if (tensor->data == NULL) {
- assert(tensor_alloc->offset != SIZE_MAX);
- assert(wsp_ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
- void * base = wsp_ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
- void * addr = (char *)base + tensor_alloc->offset;
- wsp_ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
+ assert(tensor_alloc->addr.offset != SIZE_MAX);
+ assert(wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
+ wsp_ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
  } else {
  if (tensor->buffer == NULL) {
  // this tensor was allocated without ggml-backend
@@ -874,7 +1012,7 @@ bool wsp_ggml_gallocr_alloc_graph(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgr
  // reset buffers
  for (int i = 0; i < galloc->n_buffers; i++) {
  if (galloc->buffers[i] != NULL) {
- wsp_ggml_backend_buffer_reset(galloc->buffers[i]);
+ wsp_ggml_vbuffer_reset(galloc->buffers[i]);
  }
  }

@@ -917,7 +1055,7 @@ size_t wsp_ggml_gallocr_get_buffer_size(wsp_ggml_gallocr_t galloc, int buffer_id
  }
  }

- return wsp_ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+ return wsp_ggml_vbuffer_size(galloc->buffers[buffer_id]);
  }

  // utils