whisper.rn 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/android/src/main/jni.cpp +12 -3
  4. package/cpp/ggml-alloc.c +292 -130
  5. package/cpp/ggml-backend-impl.h +4 -4
  6. package/cpp/ggml-backend-reg.cpp +13 -5
  7. package/cpp/ggml-backend.cpp +207 -17
  8. package/cpp/ggml-backend.h +19 -1
  9. package/cpp/ggml-cpu/amx/amx.cpp +5 -2
  10. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  11. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  12. package/cpp/ggml-cpu/common.h +14 -0
  13. package/cpp/ggml-cpu/ggml-cpu-impl.h +14 -7
  14. package/cpp/ggml-cpu/ggml-cpu.c +65 -44
  15. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  16. package/cpp/ggml-cpu/ops.cpp +542 -775
  17. package/cpp/ggml-cpu/ops.h +2 -0
  18. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  19. package/cpp/ggml-cpu/unary-ops.cpp +135 -0
  20. package/cpp/ggml-cpu/unary-ops.h +5 -0
  21. package/cpp/ggml-cpu/vec.cpp +227 -20
  22. package/cpp/ggml-cpu/vec.h +407 -56
  23. package/cpp/ggml-cpu.h +1 -1
  24. package/cpp/ggml-impl.h +94 -12
  25. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  26. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  27. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  28. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  29. package/cpp/ggml-metal/ggml-metal-device.cpp +1565 -0
  30. package/cpp/ggml-metal/ggml-metal-device.h +244 -0
  31. package/cpp/ggml-metal/ggml-metal-device.m +1325 -0
  32. package/cpp/ggml-metal/ggml-metal-impl.h +802 -0
  33. package/cpp/ggml-metal/ggml-metal-ops.cpp +3583 -0
  34. package/cpp/ggml-metal/ggml-metal-ops.h +88 -0
  35. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  36. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  37. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  38. package/cpp/ggml-metal-impl.h +40 -40
  39. package/cpp/ggml-metal.h +1 -6
  40. package/cpp/ggml-quants.c +1 -0
  41. package/cpp/ggml.c +341 -15
  42. package/cpp/ggml.h +150 -5
  43. package/cpp/jsi/RNWhisperJSI.cpp +9 -2
  44. package/cpp/jsi/ThreadPool.h +3 -3
  45. package/cpp/rn-whisper.h +1 -0
  46. package/cpp/whisper.cpp +89 -72
  47. package/cpp/whisper.h +1 -0
  48. package/ios/CMakeLists.txt +6 -1
  49. package/ios/RNWhisperContext.mm +3 -1
  50. package/ios/RNWhisperVadContext.mm +14 -13
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  57. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  58. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  59. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  60. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  61. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  70. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  72. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  74. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  80. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  81. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  82. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  83. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  84. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  85. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  86. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  87. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  92. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  93. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  94. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  95. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  96. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  97. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  98. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  99. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  101. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  102. package/lib/commonjs/version.json +1 -1
  103. package/lib/module/NativeRNWhisper.js.map +1 -1
  104. package/lib/module/version.json +1 -1
  105. package/lib/typescript/NativeRNWhisper.d.ts +2 -0
  106. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  107. package/package.json +1 -1
  108. package/src/NativeRNWhisper.ts +2 -0
  109. package/src/version.json +1 -1
  110. package/whisper-rn.podspec +8 -9
  111. package/cpp/ggml-metal.m +0 -6779
  112. package/cpp/ggml-whisper-sim.metallib +0 -0
  113. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-alloc.c CHANGED
@@ -23,7 +23,7 @@ static bool wsp_ggml_is_view(const struct wsp_ggml_tensor * t) {
 }
 
 // ops that return true for this function must not use restrict pointers for their backend implementations
-static bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
+bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
     switch (op) {
         case WSP_GGML_OP_SCALE:
         case WSP_GGML_OP_DIAG_MASK_ZERO:
@@ -95,39 +95,104 @@ enum wsp_ggml_status wsp_ggml_tallocr_alloc(struct wsp_ggml_tallocr * talloc, st
 
 // dynamic tensor allocator
 
+#define WSP_GGML_VBUFFER_MAX_CHUNKS 16
+
+// relative memory address within an allocation that can be split into multiple buffers (chunks)
+struct buffer_address {
+    int chunk;     // index of a backend buffer
+    size_t offset; // local memory offset within the buffer
+};
+
+static const struct buffer_address WSP_GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
+
+static bool wsp_ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
+    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
+}
+
 struct free_block {
     size_t offset;
     size_t size;
 };
 
-struct wsp_ggml_dyn_tallocr {
-    size_t alignment;
-    int n_free_blocks;
+struct tallocr_chunk {
     struct free_block free_blocks[MAX_FREE_BLOCKS];
+    int n_free_blocks;
     size_t max_size;
+};
+
+struct wsp_ggml_dyn_tallocr {
+    size_t alignment;
+    size_t max_chunk_size;
+    struct tallocr_chunk * chunks[WSP_GGML_VBUFFER_MAX_CHUNKS];
+    int n_chunks;
 
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
     struct {
         const struct wsp_ggml_tensor * tensor;
-        size_t offset;
+        struct buffer_address addr;
     } allocated_tensors[1024];
 #endif
 };
 
+static void wsp_ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
+    WSP_GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
+        chunk->free_blocks[i] = chunk->free_blocks[i-1];
+    }
+    // insert the new block
+    chunk->free_blocks[insert_pos].offset = offset;
+    chunk->free_blocks[insert_pos].size = size;
+    chunk->n_free_blocks++;
+}
+
+static void wsp_ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
+    // shift all elements after idx by 1 to the left, overwriting the element at idx
+    for (int i = idx; i < chunk->n_free_blocks; i++) {
+        chunk->free_blocks[i] = chunk->free_blocks[i+1];
+    }
+    chunk->n_free_blocks--;
+}
+
+static int wsp_ggml_dyn_tallocr_new_chunk(struct wsp_ggml_dyn_tallocr * alloc, size_t min_size) {
+    if (alloc->n_chunks >= WSP_GGML_VBUFFER_MAX_CHUNKS) {
+        return -1;
+    }
+    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
+    chunk->n_free_blocks = 1;
+    chunk->free_blocks[0].offset = 0;
+    // available space in a chunk is limited to max_chunk_size, but can be higher if:
+    // 1. a single tensor exceeds the maximum, and cannot fit any other way
+    // 2. we are running out of chunks
+    // backends will either manage to allocate the larger size, or report an error.
+    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
+    if (alloc->n_chunks == WSP_GGML_VBUFFER_MAX_CHUNKS - 1) {
+        chunk->free_blocks[0].size = SIZE_MAX/2;
+    }
+    alloc->chunks[alloc->n_chunks] = chunk;
+    alloc->n_chunks++;
+    return alloc->n_chunks - 1;
+}
+
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t offset, const struct wsp_ggml_tensor * tensor) {
+static void add_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct wsp_ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i].tensor == NULL) {
             alloc->allocated_tensors[i].tensor = tensor;
-            alloc->allocated_tensors[i].offset = offset;
+            alloc->allocated_tensors[i].addr = addr;
             return;
         }
     }
     WSP_GGML_ABORT("out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t offset, const struct wsp_ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct wsp_ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].offset == offset) {
+        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
             alloc->allocated_tensors[i].tensor = NULL;
             return;
         }
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t
 }
 #endif
 
-static size_t wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, size_t size, const struct wsp_ggml_tensor * tensor) {
+static struct buffer_address wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, size_t size, const struct wsp_ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
+    int best_fit_chunk = -1;
+    int best_fit_block = -1;
     size_t max_avail = 0;
 
-    // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
+    // find the best fitting free block besides the last block, within any chunk
+    for (int c = 0; c < alloc->n_chunks; ++c) {
+        struct tallocr_chunk * chunk = alloc->chunks[c];
+        size_t best_fit_size = SIZE_MAX;
+        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
+            struct free_block * block = &chunk->free_blocks[i];
+            max_avail = MAX(max_avail, block->size);
+            if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_chunk = c;
+                best_fit_block = i;
+                best_fit_size = block->size;
+            }
         }
     }
 
     if (best_fit_block == -1) {
-        // the last block is our last resort
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // this should never happen
-            WSP_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-                __func__, size, max_avail);
-            WSP_GGML_ABORT("not enough space in the buffer");
+        // no suitable block found, try the last block (this will grow a chunks size)
+        for (int c = 0; c < alloc->n_chunks; ++c) {
+            struct tallocr_chunk * chunk = alloc->chunks[c];
+            if (chunk->n_free_blocks > 0) {
+                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
+                max_avail = MAX(max_avail, block->size);
+                if (block->size >= size) {
+                    best_fit_chunk = c;
+                    best_fit_block = chunk->n_free_blocks - 1;
+                    break;
+                }
+            }
        }
    }
 
-    struct free_block * block = &alloc->free_blocks[best_fit_block];
-    size_t offset = block->offset;
-    block->offset = offset + size;
+    if (best_fit_block == -1) {
+        // none of the existing chunks have enough space left
+        best_fit_chunk = wsp_ggml_dyn_tallocr_new_chunk(alloc, size);
+        best_fit_block = 0;
+    }
+    if (best_fit_chunk == -1) {
+        // since the last chunk always has virtually endless memory, this should never happen
+        WSP_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            __func__, size, max_avail);
+        WSP_GGML_ABORT("graph allocation: failed to reserve memory");
+    }
+
+    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
+    struct free_block * block = &chunk->free_blocks[best_fit_block];
+    struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
+    block->offset += size;
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
+        wsp_ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
     }
 
-    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
+    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
 
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, offset, tensor);
-    size_t cur_max = offset + size;
-    if (cur_max > alloc->max_size) {
-        // sort allocated_tensors by offset
+    add_allocated_tensor(alloc, addr, tensor);
+    size_t cur_max = addr.offset + size;
+    if (cur_max > alloc->max_size[addr.chunk]) {
+        // sort allocated_tensors by chunk/offset
         for (int i = 0; i < 1024; i++) {
             for (int j = i + 1; j < 1024; j++) {
-                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                if (wsp_ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
                    const struct wsp_ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
-                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
-                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
-                    alloc->allocated_tensors[j].offset = tmp_offset;
+                    alloc->allocated_tensors[j].addr = tmp_addr;
                }
            }
        }
-        WSP_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        WSP_GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
-                WSP_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
-                    alloc->allocated_tensors[i].offset,
-                    alloc->allocated_tensors[i].offset + wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                WSP_GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                    alloc->allocated_tensors[i].addr.chunk,
+                    alloc->allocated_tensors[i].addr.offset,
+                    alloc->allocated_tensors[i].addr.offset + wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
@@ -213,78 +296,69 @@ static size_t wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, si
     }
 #endif
 
-    alloc->max_size = MAX(alloc->max_size, offset + size);
+    chunk->max_size = MAX(chunk->max_size, addr.offset + size);
 
-    return offset;
+    return addr;
 
     WSP_GGML_UNUSED(tensor);
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void wsp_ggml_dyn_tallocr_free_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct wsp_ggml_tensor * tensor) {
+static void wsp_ggml_dyn_tallocr_free_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct wsp_ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
-    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
 
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, offset, tensor);
+    remove_allocated_tensor(alloc, addr, tensor);
 #endif
 
+    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
+
     // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
+    for (int i = 0; i < chunk->n_free_blocks; i++) {
+        struct free_block * block = &chunk->free_blocks[i];
         // check if ptr is at the end of the block
-        if (block->offset + block->size == offset) {
+        if (block->offset + block->size == addr.offset) {
             block->size += size;
             // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
-                block->size += alloc->free_blocks[i+1].size;
-                alloc->n_free_blocks--;
-                for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            if (i < chunk->n_free_blocks - 1) {
+                struct free_block * next = &chunk->free_blocks[i+1];
+                if (block->offset + block->size == next->offset) {
+                    block->size += next->size;
+                    wsp_ggml_dyn_tallocr_remove_block(chunk, i+1);
                 }
             }
             return;
         }
         // check if ptr is at the beginning of the block
-        if (offset + size == block->offset) {
-            block->offset = offset;
+        if (addr.offset + size == block->offset) {
+            block->offset = addr.offset;
             block->size += size;
             // check if we can merge with the previous block
-            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
-                alloc->free_blocks[i-1].size += block->size;
-                alloc->n_free_blocks--;
-                for (int j = i; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            if (i > 0) {
+                struct free_block * prev = &chunk->free_blocks[i-1];
+                if (prev->offset + prev->size == block->offset) {
+                    prev->size += block->size;
+                    wsp_ggml_dyn_tallocr_remove_block(chunk, i);
                 }
             }
             return;
         }
     }
     // otherwise, add a new block
-    WSP_GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
-    }
-    // insert the new block
-    alloc->free_blocks[insert_pos].offset = offset;
-    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
+    wsp_ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
 
     WSP_GGML_UNUSED(tensor);
 }
 
 static void wsp_ggml_dyn_tallocr_reset(struct wsp_ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->free_blocks[0].offset = 0;
-    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    alloc->max_size = 0;
+    for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS; i++) {
+        free(alloc->chunks[i]);
+        alloc->chunks[i] = NULL;
+    }
+    alloc->n_chunks = 0;
 
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
     for (int i = 0; i < 1024; i++) {
@@ -293,14 +367,14 @@ static void wsp_ggml_dyn_tallocr_reset(struct wsp_ggml_dyn_tallocr * alloc) {
 #endif
 }
 
-static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment) {
+static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
     struct wsp_ggml_dyn_tallocr * alloc = (struct wsp_ggml_dyn_tallocr *)malloc(sizeof(struct wsp_ggml_dyn_tallocr));
 
     *alloc = (struct wsp_ggml_dyn_tallocr) {
-        /*.alignment     = */ alignment,
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks   = */ {{0}},
-        /*.max_size      = */ 0,
+        /*.alignment      = */ alignment,
+        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
+        /*.chunks         = */ {NULL},
+        /*.n_chunks       = */ 0,
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
@@ -312,11 +386,73 @@ static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment)
 }
 
 static void wsp_ggml_dyn_tallocr_free(struct wsp_ggml_dyn_tallocr * alloc) {
+    for (int i = 0; i < alloc->n_chunks; ++i) {
+        free(alloc->chunks[i]);
+    }
     free(alloc);
 }
 
-static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
+static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc, int chunk) {
+    return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
+}
+
+
+// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
+
+struct vbuffer {
+    wsp_ggml_backend_buffer_t chunks[WSP_GGML_VBUFFER_MAX_CHUNKS];
+};
+
+static void wsp_ggml_vbuffer_free(struct vbuffer * buf) {
+    if (buf == NULL) {
+        return;
+    }
+    for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS; ++i) {
+        wsp_ggml_backend_buffer_free(buf->chunks[i]);
+    }
+    free(buf);
+}
+
+static size_t wsp_ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
+    return buf->chunks[chunk] ? wsp_ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
+}
+
+static size_t wsp_ggml_vbuffer_size(struct vbuffer * buf) {
+    size_t size = 0;
+    for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        size += wsp_ggml_backend_buffer_get_size(buf->chunks[i]);
+    }
+    return size;
+}
+
+static struct vbuffer * wsp_ggml_vbuffer_alloc(wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_dyn_tallocr * talloc, enum wsp_ggml_backend_buffer_usage usage) {
+    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
+    if (buf == NULL) {
+        return NULL;
+    }
+
+    for (int n = 0; n < talloc->n_chunks; n++) {
+        size_t chunk_size = talloc->chunks[n]->max_size;
+        buf->chunks[n] = wsp_ggml_backend_buft_alloc_buffer(buft, chunk_size);
+        if (buf->chunks[n] == NULL) {
+            wsp_ggml_vbuffer_free(buf);
+            return NULL;
+        }
+        wsp_ggml_backend_buffer_set_usage(buf->chunks[n], usage);
+    }
+    return buf;
+}
+
+static void wsp_ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct wsp_ggml_tensor * tensor, struct buffer_address buf_addr) {
+    void * base = wsp_ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
+    void * addr = (char *)base + buf_addr.offset;
+    wsp_ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
+}
+
+static void wsp_ggml_vbuffer_reset(struct vbuffer * buf) {
+    for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        wsp_ggml_backend_buffer_reset(buf->chunks[i]);
+    }
 }
 
 
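The hunks above replace the allocator's single linear buffer with up to WSP_GGML_VBUFFER_MAX_CHUNKS backend buffers, and every allocation is now identified by a {chunk, offset} pair instead of a bare offset. Below is a minimal standalone sketch of that addressing model, assuming plain malloc'd regions in place of real backend buffers (vbuffer_model and resolve are hypothetical names, not part of ggml or whisper.rn):

// Illustrative sketch only: a simplified model of the {chunk, offset}
// addressing introduced above; not the ggml/whisper.rn API.
#include <stdio.h>
#include <stdlib.h>

#define MAX_CHUNKS 16

struct buffer_address {
    int    chunk;   // which backend buffer the allocation lives in
    size_t offset;  // byte offset inside that buffer
};

struct vbuffer_model {
    void * base[MAX_CHUNKS]; // stand-in for one backend buffer base pointer per chunk
    int    n_chunks;
};

// resolve a {chunk, offset} pair to a concrete pointer: the same arithmetic
// the real code performs against a backend buffer's base address
static void * resolve(const struct vbuffer_model * vb, struct buffer_address addr) {
    return (char *)vb->base[addr.chunk] + addr.offset;
}

int main(void) {
    struct vbuffer_model vb = {0};
    // pretend graph reservation decided it needs two 1 MiB chunks
    for (int i = 0; i < 2; i++) {
        vb.base[i] = malloc(1024 * 1024);
        vb.n_chunks++;
    }
    struct buffer_address a = { .chunk = 1, .offset = 4096 };
    printf("chunk %d, offset %zu -> %p\n", a.chunk, a.offset, resolve(&vb, a));
    for (int i = 0; i < vb.n_chunks; i++) {
        free(vb.base[i]);
    }
    return 0;
}

With real backend buffers the same arithmetic happens in wsp_ggml_vbuffer_tensor_alloc in the diff above: take the chunk's base pointer and add the stored offset.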
@@ -328,13 +464,13 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
+    struct buffer_address addr;
     bool allocated;
 };
 
 struct tensor_alloc {
     int buffer_id;
-    size_t offset;
+    struct buffer_address addr;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
@@ -349,7 +485,7 @@ struct node_alloc {
 
 struct wsp_ggml_gallocr {
     wsp_ggml_backend_buffer_type_t * bufts; // [n_buffers]
-    wsp_ggml_backend_buffer_t * buffers; // [n_buffers]
+    struct vbuffer ** buffers; // [n_buffers]
     struct wsp_ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
     int n_buffers;
 
@@ -370,7 +506,7 @@ wsp_ggml_gallocr_t wsp_ggml_gallocr_new_n(wsp_ggml_backend_buffer_type_t * bufts
     galloc->bufts = calloc(n_bufs, sizeof(wsp_ggml_backend_buffer_type_t));
     WSP_GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(wsp_ggml_backend_buffer_t));
+    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
     WSP_GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct wsp_ggml_dyn_tallocr *));
@@ -390,7 +526,8 @@ wsp_ggml_gallocr_t wsp_ggml_gallocr_new_n(wsp_ggml_backend_buffer_type_t * bufts
 
         if (galloc->buf_tallocs[i] == NULL) {
             size_t alignment = wsp_ggml_backend_buft_get_alignment(bufts[i]);
-            galloc->buf_tallocs[i] = wsp_ggml_dyn_tallocr_new(alignment);
+            size_t max_size = wsp_ggml_backend_buft_get_max_size(bufts[i]);
+            galloc->buf_tallocs[i] = wsp_ggml_dyn_tallocr_new(alignment, max_size);
         }
     }
     galloc->n_buffers = n_bufs;
@@ -418,7 +555,7 @@ void wsp_ggml_gallocr_free(wsp_ggml_gallocr_t galloc) {
             }
         }
         if (!freed) {
-            wsp_ggml_backend_buffer_free(galloc->buffers[i]);
+            wsp_ggml_vbuffer_free(galloc->buffers[i]);
         }
     }
     if (galloc->buf_tallocs != NULL) {
@@ -461,13 +598,33 @@ static bool wsp_ggml_gallocr_is_allocated(wsp_ggml_gallocr_t galloc, struct wsp_
     return t->data != NULL || wsp_ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
+// free the extra space at the end if the new tensor is smaller
+static void wsp_ggml_gallocr_free_extra_space(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * node, struct wsp_ggml_tensor * parent) {
+    struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
+    struct hash_node * p_hn = wsp_ggml_gallocr_hash_get(galloc, parent);
+
+    size_t parent_size = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
+    size_t node_size = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+
+    WSP_GGML_ASSERT(parent_size >= node_size);
+
+    if (parent_size > node_size) {
+        struct wsp_ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+        struct buffer_address p_addr = p_hn->addr;
+        p_addr.offset += node_size;
+        size_t extra_size = parent_size - node_size;
+        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
+        wsp_ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+    }
+}
+
 static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * node, int buffer_id) {
     WSP_GGML_ASSERT(buffer_id >= 0);
     struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
 
     if (!wsp_ggml_gallocr_is_allocated(galloc, node) && !wsp_ggml_is_view(node)) {
         hn->allocated = true;
-        assert(hn->offset == 0);
+        assert(hn->addr.offset == 0);
 
         // try to reuse a parent's buffer (inplace)
         if (wsp_ggml_op_can_inplace(node->op)) {
@@ -501,18 +658,20 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
                         struct hash_node * view_src_hn = wsp_ggml_gallocr_hash_get(galloc, view_src);
                         if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                             AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            assert(view_src_hn->offset == p_hn->offset);
+                            assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
                             hn->buffer_id = p_hn->buffer_id;
-                            hn->offset = p_hn->offset;
+                            hn->addr = p_hn->addr;
                             p_hn->allocated = false; // avoid freeing the parent
                             view_src_hn->allocated = false;
+                            wsp_ggml_gallocr_free_extra_space(galloc, node, view_src);
                             return;
                         }
                     } else {
                         AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                         hn->buffer_id = p_hn->buffer_id;
-                        hn->offset = p_hn->offset;
+                        hn->addr = p_hn->addr;
                         p_hn->allocated = false; // avoid freeing the parent
+                        wsp_ggml_gallocr_free_extra_space(galloc, node, parent);
                         return;
                     }
                 }
@@ -522,9 +681,8 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
         struct wsp_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
         wsp_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
         size_t size = wsp_ggml_backend_buft_get_alloc_size(buft, node);
-        size_t offset = wsp_ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
-        hn->offset = offset;
+        hn->addr = wsp_ggml_dyn_tallocr_alloc(alloc, size, node);
     }
 }
 
@@ -536,12 +694,11 @@ static void wsp_ggml_gallocr_free_node(wsp_ggml_gallocr_t galloc, struct wsp_ggm
     }
 
     struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
-    size_t offset = hn->offset;
     int buffer_id = hn->buffer_id;
     struct wsp_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
     wsp_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = wsp_ggml_backend_buft_get_alloc_size(buft, node);
-    wsp_ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+    wsp_ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
     hn->allocated = false;
 }
 
@@ -692,24 +849,24 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         if (node->view_src || node->data) {
             node_alloc->dst.buffer_id = -1;
-            node_alloc->dst.offset = SIZE_MAX;
+            node_alloc->dst.addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
             node_alloc->dst.buffer_id = hn->buffer_id;
-            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.addr = hn->addr;
             node_alloc->dst.size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
             struct wsp_ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
                 node_alloc->src[j].buffer_id = -1;
-                node_alloc->src[j].offset = SIZE_MAX;
+                node_alloc->src[j].addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, src);
                 node_alloc->src[j].buffer_id = hn->buffer_id;
-                node_alloc->src[j].offset = hn->offset;
+                node_alloc->src[j].addr = hn->addr;
                 node_alloc->src[j].size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
         }
@@ -725,11 +882,11 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
         struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, leaf);
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
-            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
             galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
-            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.addr = hn->addr;
             galloc->leaf_allocs[i].leaf.size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
     }
@@ -744,22 +901,29 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
             }
         }
 
-        size_t cur_size = galloc->buffers[i] ? wsp_ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
-        size_t new_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
-
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
-        if (new_size > cur_size || galloc->buffers[i] == NULL) {
+        bool realloc = galloc->buffers[i] == NULL;
+        size_t new_size = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            size_t cur_chunk_size = galloc->buffers[i] ? wsp_ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
+            size_t new_chunk_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
+            new_size += new_chunk_size;
+            if (new_chunk_size > cur_chunk_size) {
+                realloc = true;
+            }
+        }
+        if (realloc) {
 #ifndef NDEBUG
+            size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
             WSP_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
 
-            wsp_ggml_backend_buffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = wsp_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+            wsp_ggml_vbuffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = wsp_ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], WSP_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
             if (galloc->buffers[i] == NULL) {
                 WSP_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
-            wsp_ggml_backend_buffer_set_usage(galloc->buffers[i], WSP_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
@@ -772,11 +936,11 @@ bool wsp_ggml_gallocr_reserve(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph
 
 static void wsp_ggml_gallocr_init_tensor(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
     int buffer_id = tensor_alloc->buffer_id;
-    assert(tensor->data || tensor->view_src || wsp_ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+    assert(tensor->data || tensor->view_src || wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
         if (tensor->buffer == NULL) {
-            assert(tensor_alloc->offset == SIZE_MAX);
+            assert(tensor_alloc->addr.offset == SIZE_MAX);
             if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
@@ -785,11 +949,9 @@ static void wsp_ggml_gallocr_init_tensor(wsp_ggml_gallocr_t galloc, struct wsp_g
         }
     } else {
         if (tensor->data == NULL) {
-            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(wsp_ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-            void * base = wsp_ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
-            void * addr = (char *)base + tensor_alloc->offset;
-            wsp_ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
+            assert(tensor_alloc->addr.offset != SIZE_MAX);
+            assert(wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
+            wsp_ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
         } else {
             if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
@@ -874,7 +1036,7 @@ bool wsp_ggml_gallocr_alloc_graph(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgr
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers[i] != NULL) {
-            wsp_ggml_backend_buffer_reset(galloc->buffers[i]);
+            wsp_ggml_vbuffer_reset(galloc->buffers[i]);
        }
    }
 
@@ -917,7 +1079,7 @@ size_t wsp_ggml_gallocr_get_buffer_size(wsp_ggml_gallocr_t galloc, int buffer_id
         }
     }
 
-    return wsp_ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+    return wsp_ggml_vbuffer_size(galloc->buffers[buffer_id]);
 }
 
 // utils
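The comments in wsp_ggml_dyn_tallocr_new_chunk above spell out the sizing policy for a fresh chunk: at least max_chunk_size of free space, more if a single tensor is larger than that, and effectively unlimited for the final chunk so that reservation never fails at this stage. Below is a minimal standalone sketch of that policy; the helper name new_chunk_capacity and the example sizes are made up, only the rule mirrors the diff above:

// Illustrative sketch of the chunk-sizing rule; not the ggml/whisper.rn API.
#include <stdio.h>
#include <stdint.h>

#define MAX_CHUNKS 16
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static size_t new_chunk_capacity(size_t min_size, size_t max_chunk_size, int n_chunks) {
    // a fresh chunk normally offers max_chunk_size of free space,
    // but a single oversized tensor gets a chunk large enough to hold it
    size_t cap = MAX(min_size, max_chunk_size);
    // the last possible chunk is treated as virtually unbounded so that
    // reservation never fails here; the backend reports the real error later
    if (n_chunks == MAX_CHUNKS - 1) {
        cap = SIZE_MAX / 2;
    }
    return cap;
}

int main(void) {
    size_t max_chunk = 256u * 1024 * 1024; // e.g. a backend's maximum buffer size
    printf("%zu\n", new_chunk_capacity((size_t)16  * 1024 * 1024, max_chunk, 0));  // 256 MiB: normal case
    printf("%zu\n", new_chunk_capacity((size_t)512 * 1024 * 1024, max_chunk, 0));  // 512 MiB: oversized tensor
    printf("%zu\n", new_chunk_capacity((size_t)16  * 1024 * 1024, max_chunk, 15)); // SIZE_MAX/2: last chunk
    return 0;
}

Capping the last chunk at SIZE_MAX/2 carries over the old measure-allocator comment ("restrict maximum size ... to half size_t max to avoid overflows"), so size arithmetic during reservation cannot wrap around.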