whisper.rn 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/cpp/ggml-alloc.c +264 -126
- package/cpp/ggml-backend-impl.h +4 -1
- package/cpp/ggml-backend-reg.cpp +13 -5
- package/cpp/ggml-backend.cpp +207 -17
- package/cpp/ggml-backend.h +17 -1
- package/cpp/ggml-cpu/amx/amx.cpp +4 -2
- package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/cpp/ggml-cpu/arch-fallback.h +0 -4
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
- package/cpp/ggml-cpu/ggml-cpu.c +48 -41
- package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/ggml-cpu/ops.cpp +518 -767
- package/cpp/ggml-cpu/ops.h +2 -0
- package/cpp/ggml-cpu/simd-mappings.h +88 -59
- package/cpp/ggml-cpu/vec.cpp +161 -20
- package/cpp/ggml-cpu/vec.h +400 -51
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +43 -10
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
- package/cpp/ggml-metal/ggml-metal-device.h +226 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +40 -40
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-quants.c +1 -0
- package/cpp/ggml.c +175 -13
- package/cpp/ggml.h +84 -5
- package/cpp/jsi/RNWhisperJSI.cpp +2 -0
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/whisper.cpp +85 -70
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisperVadContext.mm +14 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +1 -1
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6779
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-alloc.c
CHANGED
|
@@ -23,7 +23,7 @@ static bool wsp_ggml_is_view(const struct wsp_ggml_tensor * t) {
|
|
|
23
23
|
}
|
|
24
24
|
|
|
25
25
|
// ops that return true for this function must not use restrict pointers for their backend implementations
|
|
26
|
-
|
|
26
|
+
bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
|
|
27
27
|
switch (op) {
|
|
28
28
|
case WSP_GGML_OP_SCALE:
|
|
29
29
|
case WSP_GGML_OP_DIAG_MASK_ZERO:
|
|
@@ -95,39 +95,104 @@ enum wsp_ggml_status wsp_ggml_tallocr_alloc(struct wsp_ggml_tallocr * talloc, st
|
|
|
95
95
|
|
|
96
96
|
// dynamic tensor allocator
|
|
97
97
|
|
|
98
|
+
#define WSP_GGML_VBUFFER_MAX_CHUNKS 16
|
|
99
|
+
|
|
100
|
+
// relative memory address within an allocation that can be split into multiple buffers (chunks)
|
|
101
|
+
struct buffer_address {
    int    chunk;  // index of the backend buffer (chunk) the address lies in
    size_t offset; // byte offset counted from the start of that chunk
};

// sentinel value: chunk -1 together with offset SIZE_MAX
static const struct buffer_address WSP_GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };

// Strict "comes before" ordering of two addresses:
// the chunk index decides first, the offset only matters inside the same chunk.
static bool wsp_ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
    if (a.chunk == b.chunk) {
        return a.offset < b.offset;
    }
    return a.chunk < b.chunk;
}
|
|
111
|
+
|
|
98
112
|
// one contiguous unused range inside a chunk
struct free_block {
    size_t offset; // where the range starts
    size_t size;   // how many bytes it spans
};
|
|
102
116
|
|
|
103
|
-
struct
|
|
104
|
-
size_t alignment;
|
|
105
|
-
int n_free_blocks;
|
|
117
|
+
struct tallocr_chunk {
|
|
106
118
|
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
|
119
|
+
int n_free_blocks;
|
|
107
120
|
size_t max_size;
|
|
121
|
+
};
|
|
122
|
+
|
|
123
|
+
struct wsp_ggml_dyn_tallocr {
|
|
124
|
+
size_t alignment;
|
|
125
|
+
size_t max_chunk_size;
|
|
126
|
+
struct tallocr_chunk * chunks[WSP_GGML_VBUFFER_MAX_CHUNKS];
|
|
127
|
+
int n_chunks;
|
|
108
128
|
|
|
109
129
|
#ifdef WSP_GGML_ALLOCATOR_DEBUG
|
|
110
130
|
struct {
|
|
111
131
|
const struct wsp_ggml_tensor * tensor;
|
|
112
|
-
|
|
132
|
+
struct buffer_address addr;
|
|
113
133
|
} allocated_tensors[1024];
|
|
114
134
|
#endif
|
|
115
135
|
};
|
|
116
136
|
|
|
137
|
+
static void wsp_ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
|
|
138
|
+
WSP_GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
|
|
139
|
+
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
|
|
140
|
+
int insert_pos = 0;
|
|
141
|
+
while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
|
|
142
|
+
insert_pos++;
|
|
143
|
+
}
|
|
144
|
+
// shift all blocks from insert_pos onward to make room for the new block
|
|
145
|
+
for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
|
|
146
|
+
chunk->free_blocks[i] = chunk->free_blocks[i-1];
|
|
147
|
+
}
|
|
148
|
+
// insert the new block
|
|
149
|
+
chunk->free_blocks[insert_pos].offset = offset;
|
|
150
|
+
chunk->free_blocks[insert_pos].size = size;
|
|
151
|
+
chunk->n_free_blocks++;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
static void wsp_ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
|
|
155
|
+
// shift all elements after idx by 1 to the left, overwriting the element at idx
|
|
156
|
+
for (int i = idx; i < chunk->n_free_blocks; i++) {
|
|
157
|
+
chunk->free_blocks[i] = chunk->free_blocks[i+1];
|
|
158
|
+
}
|
|
159
|
+
chunk->n_free_blocks--;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
static int wsp_ggml_dyn_tallocr_new_chunk(struct wsp_ggml_dyn_tallocr * alloc, size_t min_size) {
|
|
163
|
+
if (alloc->n_chunks >= WSP_GGML_VBUFFER_MAX_CHUNKS) {
|
|
164
|
+
return -1;
|
|
165
|
+
}
|
|
166
|
+
struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
|
|
167
|
+
chunk->n_free_blocks = 1;
|
|
168
|
+
chunk->free_blocks[0].offset = 0;
|
|
169
|
+
// available space in a chunk is limited to max_chunk_size, but can be higher if:
|
|
170
|
+
// 1. a single tensor exceeds the maximum, and cannot fit any other way
|
|
171
|
+
// 2. we are running out of chunks
|
|
172
|
+
// backends will either manage to allocate the larger size, or report an error.
|
|
173
|
+
chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
|
|
174
|
+
if (alloc->n_chunks == WSP_GGML_VBUFFER_MAX_CHUNKS - 1) {
|
|
175
|
+
chunk->free_blocks[0].size = SIZE_MAX/2;
|
|
176
|
+
}
|
|
177
|
+
alloc->chunks[alloc->n_chunks] = chunk;
|
|
178
|
+
alloc->n_chunks++;
|
|
179
|
+
return alloc->n_chunks - 1;
|
|
180
|
+
}
|
|
181
|
+
|
|
117
182
|
#ifdef WSP_GGML_ALLOCATOR_DEBUG
|
|
118
|
-
static void add_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc,
|
|
183
|
+
static void add_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct wsp_ggml_tensor * tensor) {
|
|
119
184
|
for (int i = 0; i < 1024; i++) {
|
|
120
185
|
if (alloc->allocated_tensors[i].tensor == NULL) {
|
|
121
186
|
alloc->allocated_tensors[i].tensor = tensor;
|
|
122
|
-
alloc->allocated_tensors[i].
|
|
187
|
+
alloc->allocated_tensors[i].addr = addr;
|
|
123
188
|
return;
|
|
124
189
|
}
|
|
125
190
|
}
|
|
126
191
|
WSP_GGML_ABORT("out of allocated_tensors");
|
|
127
192
|
}
|
|
128
|
-
static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc,
|
|
193
|
+
static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct wsp_ggml_tensor * tensor) {
|
|
129
194
|
for (int i = 0; i < 1024; i++) {
|
|
130
|
-
if (alloc->allocated_tensors[i].offset == offset) {
|
|
195
|
+
if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
|
|
131
196
|
alloc->allocated_tensors[i].tensor = NULL;
|
|
132
197
|
return;
|
|
133
198
|
}
|
|
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct wsp_ggml_dyn_tallocr * alloc, size_t
|
|
|
136
201
|
}
|
|
137
202
|
#endif
|
|
138
203
|
|
|
139
|
-
static
|
|
204
|
+
static struct buffer_address wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, size_t size, const struct wsp_ggml_tensor * tensor) {
|
|
140
205
|
size = aligned_offset(NULL, size, alloc->alignment);
|
|
141
206
|
|
|
142
207
|
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
|
143
208
|
|
|
209
|
+
int best_fit_chunk = -1;
|
|
210
|
+
int best_fit_block = -1;
|
|
144
211
|
size_t max_avail = 0;
|
|
145
212
|
|
|
146
|
-
// find the best fitting free block besides the last block
|
|
147
|
-
int
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
213
|
+
// find the best fitting free block besides the last block, within any chunk
|
|
214
|
+
for (int c = 0; c < alloc->n_chunks; ++c) {
|
|
215
|
+
struct tallocr_chunk * chunk = alloc->chunks[c];
|
|
216
|
+
size_t best_fit_size = SIZE_MAX;
|
|
217
|
+
for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
|
|
218
|
+
struct free_block * block = &chunk->free_blocks[i];
|
|
219
|
+
max_avail = MAX(max_avail, block->size);
|
|
220
|
+
if (block->size >= size && block->size <= best_fit_size) {
|
|
221
|
+
best_fit_chunk = c;
|
|
222
|
+
best_fit_block = i;
|
|
223
|
+
best_fit_size = block->size;
|
|
224
|
+
}
|
|
155
225
|
}
|
|
156
226
|
}
|
|
157
227
|
|
|
158
228
|
if (best_fit_block == -1) {
|
|
159
|
-
// the last block
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
229
|
+
// no suitable block found, try the last block (this will grow a chunk's size)
|
|
230
|
+
for (int c = 0; c < alloc->n_chunks; ++c) {
|
|
231
|
+
struct tallocr_chunk * chunk = alloc->chunks[c];
|
|
232
|
+
if (chunk->n_free_blocks > 0) {
|
|
233
|
+
struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
|
|
234
|
+
max_avail = MAX(max_avail, block->size);
|
|
235
|
+
if (block->size >= size) {
|
|
236
|
+
best_fit_chunk = c;
|
|
237
|
+
best_fit_block = chunk->n_free_blocks - 1;
|
|
238
|
+
break;
|
|
239
|
+
}
|
|
240
|
+
}
|
|
169
241
|
}
|
|
170
242
|
}
|
|
171
243
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
244
|
+
if (best_fit_block == -1) {
|
|
245
|
+
// none of the existing chunks have enough space left
|
|
246
|
+
best_fit_chunk = wsp_ggml_dyn_tallocr_new_chunk(alloc, size);
|
|
247
|
+
best_fit_block = 0;
|
|
248
|
+
}
|
|
249
|
+
if (best_fit_chunk == -1) {
|
|
250
|
+
// since the last chunk always has virtually endless memory, this should never happen
|
|
251
|
+
WSP_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
|
252
|
+
__func__, size, max_avail);
|
|
253
|
+
WSP_GGML_ABORT("graph allocation: failed to reserve memory");
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
|
|
257
|
+
struct free_block * block = &chunk->free_blocks[best_fit_block];
|
|
258
|
+
struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
|
|
259
|
+
block->offset += size;
|
|
175
260
|
block->size -= size;
|
|
176
261
|
if (block->size == 0) {
|
|
177
262
|
// remove block if empty
|
|
178
|
-
|
|
179
|
-
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
|
|
180
|
-
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
|
181
|
-
}
|
|
263
|
+
wsp_ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
|
|
182
264
|
}
|
|
183
265
|
|
|
184
|
-
AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
|
|
266
|
+
AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
|
|
185
267
|
|
|
186
268
|
#ifdef WSP_GGML_ALLOCATOR_DEBUG
|
|
187
|
-
add_allocated_tensor(alloc,
|
|
188
|
-
size_t cur_max = offset + size;
|
|
189
|
-
if (cur_max > alloc->max_size) {
|
|
190
|
-
// sort allocated_tensors by offset
|
|
269
|
+
add_allocated_tensor(alloc, addr, tensor);
|
|
270
|
+
size_t cur_max = addr.offset + size;
|
|
271
|
+
if (cur_max > alloc->max_size[addr.chunk]) {
|
|
272
|
+
// sort allocated_tensors by chunk/offset
|
|
191
273
|
for (int i = 0; i < 1024; i++) {
|
|
192
274
|
for (int j = i + 1; j < 1024; j++) {
|
|
193
|
-
if (alloc->allocated_tensors[
|
|
275
|
+
if (wsp_ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
|
|
194
276
|
const struct wsp_ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
|
|
195
|
-
|
|
277
|
+
struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
|
|
196
278
|
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
|
|
197
|
-
alloc->allocated_tensors[i].
|
|
279
|
+
alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
|
|
198
280
|
alloc->allocated_tensors[j].tensor = tmp_tensor;
|
|
199
|
-
alloc->allocated_tensors[j].
|
|
281
|
+
alloc->allocated_tensors[j].addr = tmp_addr;
|
|
200
282
|
}
|
|
201
283
|
}
|
|
202
284
|
}
|
|
203
|
-
WSP_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
|
285
|
+
WSP_GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
|
|
204
286
|
for (int i = 0; i < 1024; i++) {
|
|
205
287
|
if (alloc->allocated_tensors[i].tensor) {
|
|
206
|
-
WSP_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
|
207
|
-
alloc->allocated_tensors[i].
|
|
208
|
-
alloc->allocated_tensors[i].offset
|
|
288
|
+
WSP_GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
|
289
|
+
alloc->allocated_tensors[i].addr.chunk,
|
|
290
|
+
alloc->allocated_tensors[i].addr.offset,
|
|
291
|
+
alloc->allocated_tensors[i].addr.offset + wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
|
209
292
|
wsp_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
|
210
293
|
}
|
|
211
294
|
}
|
|
@@ -213,78 +296,69 @@ static size_t wsp_ggml_dyn_tallocr_alloc(struct wsp_ggml_dyn_tallocr * alloc, si
|
|
|
213
296
|
}
|
|
214
297
|
#endif
|
|
215
298
|
|
|
216
|
-
|
|
299
|
+
chunk->max_size = MAX(chunk->max_size, addr.offset + size);
|
|
217
300
|
|
|
218
|
-
return
|
|
301
|
+
return addr;
|
|
219
302
|
|
|
220
303
|
WSP_GGML_UNUSED(tensor);
|
|
221
304
|
}
|
|
222
305
|
|
|
223
306
|
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
|
224
|
-
static void wsp_ggml_dyn_tallocr_free_tensor(struct wsp_ggml_dyn_tallocr * alloc,
|
|
307
|
+
static void wsp_ggml_dyn_tallocr_free_tensor(struct wsp_ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct wsp_ggml_tensor * tensor) {
|
|
225
308
|
size = aligned_offset(NULL, size, alloc->alignment);
|
|
226
309
|
|
|
227
|
-
AT_PRINTF("%s: freeing %s at
|
|
310
|
+
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
|
311
|
+
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
|
|
228
312
|
|
|
229
313
|
#ifdef WSP_GGML_ALLOCATOR_DEBUG
|
|
230
|
-
remove_allocated_tensor(alloc,
|
|
314
|
+
remove_allocated_tensor(alloc, addr, tensor);
|
|
231
315
|
#endif
|
|
232
316
|
|
|
317
|
+
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
|
|
318
|
+
|
|
233
319
|
// see if we can merge with an existing block
|
|
234
|
-
for (int i = 0; i <
|
|
235
|
-
struct free_block * block = &
|
|
320
|
+
for (int i = 0; i < chunk->n_free_blocks; i++) {
|
|
321
|
+
struct free_block * block = &chunk->free_blocks[i];
|
|
236
322
|
// check if ptr is at the end of the block
|
|
237
|
-
if (block->offset + block->size == offset) {
|
|
323
|
+
if (block->offset + block->size == addr.offset) {
|
|
238
324
|
block->size += size;
|
|
239
325
|
// check if we can merge with the next block
|
|
240
|
-
if (i <
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
326
|
+
if (i < chunk->n_free_blocks - 1) {
|
|
327
|
+
struct free_block * next = &chunk->free_blocks[i+1];
|
|
328
|
+
if (block->offset + block->size == next->offset) {
|
|
329
|
+
block->size += next->size;
|
|
330
|
+
wsp_ggml_dyn_tallocr_remove_block(chunk, i+1);
|
|
245
331
|
}
|
|
246
332
|
}
|
|
247
333
|
return;
|
|
248
334
|
}
|
|
249
335
|
// check if ptr is at the beginning of the block
|
|
250
|
-
if (offset + size == block->offset) {
|
|
251
|
-
block->offset = offset;
|
|
336
|
+
if (addr.offset + size == block->offset) {
|
|
337
|
+
block->offset = addr.offset;
|
|
252
338
|
block->size += size;
|
|
253
339
|
// check if we can merge with the previous block
|
|
254
|
-
if (i > 0
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
340
|
+
if (i > 0) {
|
|
341
|
+
struct free_block * prev = &chunk->free_blocks[i-1];
|
|
342
|
+
if (prev->offset + prev->size == block->offset) {
|
|
343
|
+
prev->size += block->size;
|
|
344
|
+
wsp_ggml_dyn_tallocr_remove_block(chunk, i);
|
|
259
345
|
}
|
|
260
346
|
}
|
|
261
347
|
return;
|
|
262
348
|
}
|
|
263
349
|
}
|
|
264
350
|
// otherwise, add a new block
|
|
265
|
-
|
|
266
|
-
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
|
|
267
|
-
int insert_pos = 0;
|
|
268
|
-
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
|
|
269
|
-
insert_pos++;
|
|
270
|
-
}
|
|
271
|
-
// shift all blocks from insert_pos onward to make room for the new block
|
|
272
|
-
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
|
|
273
|
-
alloc->free_blocks[i] = alloc->free_blocks[i-1];
|
|
274
|
-
}
|
|
275
|
-
// insert the new block
|
|
276
|
-
alloc->free_blocks[insert_pos].offset = offset;
|
|
277
|
-
alloc->free_blocks[insert_pos].size = size;
|
|
278
|
-
alloc->n_free_blocks++;
|
|
351
|
+
wsp_ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
|
|
279
352
|
|
|
280
353
|
WSP_GGML_UNUSED(tensor);
|
|
281
354
|
}
|
|
282
355
|
|
|
283
356
|
static void wsp_ggml_dyn_tallocr_reset(struct wsp_ggml_dyn_tallocr * alloc) {
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
357
|
+
for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS; i++) {
|
|
358
|
+
free(alloc->chunks[i]);
|
|
359
|
+
alloc->chunks[i] = NULL;
|
|
360
|
+
}
|
|
361
|
+
alloc->n_chunks = 0;
|
|
288
362
|
|
|
289
363
|
#ifdef WSP_GGML_ALLOCATOR_DEBUG
|
|
290
364
|
for (int i = 0; i < 1024; i++) {
|
|
@@ -293,14 +367,14 @@ static void wsp_ggml_dyn_tallocr_reset(struct wsp_ggml_dyn_tallocr * alloc) {
|
|
|
293
367
|
#endif
|
|
294
368
|
}
|
|
295
369
|
|
|
296
|
-
static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment) {
|
|
370
|
+
static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
|
|
297
371
|
struct wsp_ggml_dyn_tallocr * alloc = (struct wsp_ggml_dyn_tallocr *)malloc(sizeof(struct wsp_ggml_dyn_tallocr));
|
|
298
372
|
|
|
299
373
|
*alloc = (struct wsp_ggml_dyn_tallocr) {
|
|
300
|
-
/*.alignment
|
|
301
|
-
/*.
|
|
302
|
-
/*.
|
|
303
|
-
/*.
|
|
374
|
+
/*.alignment = */ alignment,
|
|
375
|
+
/*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
|
|
376
|
+
/*.chunks = */ {NULL},
|
|
377
|
+
/*.n_chunks = */ 0,
|
|
304
378
|
#ifdef WSP_GGML_ALLOCATOR_DEBUG
|
|
305
379
|
/*.allocated_tensors = */ {{0}},
|
|
306
380
|
#endif
|
|
@@ -312,11 +386,79 @@ static struct wsp_ggml_dyn_tallocr * wsp_ggml_dyn_tallocr_new(size_t alignment)
|
|
|
312
386
|
}
|
|
313
387
|
|
|
314
388
|
static void wsp_ggml_dyn_tallocr_free(struct wsp_ggml_dyn_tallocr * alloc) {
|
|
389
|
+
for (int i = 0; i < alloc->n_chunks; ++i) {
|
|
390
|
+
free(alloc->chunks[i]);
|
|
391
|
+
}
|
|
315
392
|
free(alloc);
|
|
316
393
|
}
|
|
317
394
|
|
|
318
395
|
static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc) {
|
|
319
|
-
|
|
396
|
+
size_t max_size = 0;
|
|
397
|
+
for (int i = 0; i < alloc->n_chunks; i++) {
|
|
398
|
+
max_size += alloc->chunks[i]->max_size;
|
|
399
|
+
}
|
|
400
|
+
return max_size;
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
|
|
405
|
+
|
|
406
|
+
struct vbuffer {
|
|
407
|
+
wsp_ggml_backend_buffer_t chunks[WSP_GGML_VBUFFER_MAX_CHUNKS];
|
|
408
|
+
};
|
|
409
|
+
|
|
410
|
+
static void wsp_ggml_vbuffer_free(struct vbuffer * buf) {
|
|
411
|
+
if (buf == NULL) {
|
|
412
|
+
return;
|
|
413
|
+
}
|
|
414
|
+
for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS; ++i) {
|
|
415
|
+
wsp_ggml_backend_buffer_free(buf->chunks[i]);
|
|
416
|
+
}
|
|
417
|
+
free(buf);
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
static int wsp_ggml_vbuffer_n_chunks(struct vbuffer * buf) {
|
|
421
|
+
int n = 0;
|
|
422
|
+
while (n < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
|
|
423
|
+
return n;
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
static size_t wsp_ggml_vbuffer_size(struct vbuffer * buf) {
|
|
427
|
+
size_t size = 0;
|
|
428
|
+
for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
|
|
429
|
+
size += wsp_ggml_backend_buffer_get_size(buf->chunks[i]);
|
|
430
|
+
}
|
|
431
|
+
return size;
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
// Creates a virtual buffer with one backend buffer per chunk of `talloc`, each sized to that chunk's
// max_size and tagged with `usage`. Returns NULL if the vbuffer or any backend buffer cannot be
// allocated; buffers created before the failure are released again.
static struct vbuffer * wsp_ggml_vbuffer_alloc(wsp_ggml_backend_buffer_type_t buft, const struct wsp_ggml_dyn_tallocr * talloc, enum wsp_ggml_backend_buffer_usage usage) {
    // zero-initialized so that slots which are never filled stay NULL
    struct vbuffer * vbuf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
    if (vbuf == NULL) {
        return NULL;
    }

    for (int c = 0; c < talloc->n_chunks; c++) {
        wsp_ggml_backend_buffer_t chunk_buf = wsp_ggml_backend_buft_alloc_buffer(buft, talloc->chunks[c]->max_size);
        vbuf->chunks[c] = chunk_buf;
        if (chunk_buf == NULL) {
            wsp_ggml_vbuffer_free(vbuf);
            return NULL;
        }
        wsp_ggml_backend_buffer_set_usage(chunk_buf, usage);
    }

    return vbuf;
}
|
|
451
|
+
|
|
452
|
+
static void wsp_ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct wsp_ggml_tensor * tensor, struct buffer_address buf_addr) {
|
|
453
|
+
void * base = wsp_ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
|
|
454
|
+
void * addr = (char *)base + buf_addr.offset;
|
|
455
|
+
wsp_ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
static void wsp_ggml_vbuffer_reset(struct vbuffer * buf) {
|
|
459
|
+
for (int i = 0; i < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
|
|
460
|
+
wsp_ggml_backend_buffer_reset(buf->chunks[i]);
|
|
461
|
+
}
|
|
320
462
|
}
|
|
321
463
|
|
|
322
464
|
|
|
@@ -328,13 +470,13 @@ struct hash_node {
|
|
|
328
470
|
int n_children;
|
|
329
471
|
int n_views;
|
|
330
472
|
int buffer_id;
|
|
331
|
-
|
|
473
|
+
struct buffer_address addr;
|
|
332
474
|
bool allocated;
|
|
333
475
|
};
|
|
334
476
|
|
|
335
477
|
struct tensor_alloc {
|
|
336
478
|
int buffer_id;
|
|
337
|
-
|
|
479
|
+
struct buffer_address addr;
|
|
338
480
|
size_t size_max; // 0 = pre-allocated, unused, or view
|
|
339
481
|
};
|
|
340
482
|
|
|
@@ -349,7 +491,7 @@ struct node_alloc {
|
|
|
349
491
|
|
|
350
492
|
struct wsp_ggml_gallocr {
|
|
351
493
|
wsp_ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
|
352
|
-
|
|
494
|
+
struct vbuffer ** buffers; // [n_buffers]
|
|
353
495
|
struct wsp_ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
|
354
496
|
int n_buffers;
|
|
355
497
|
|
|
@@ -370,7 +512,7 @@ wsp_ggml_gallocr_t wsp_ggml_gallocr_new_n(wsp_ggml_backend_buffer_type_t * bufts
|
|
|
370
512
|
galloc->bufts = calloc(n_bufs, sizeof(wsp_ggml_backend_buffer_type_t));
|
|
371
513
|
WSP_GGML_ASSERT(galloc->bufts != NULL);
|
|
372
514
|
|
|
373
|
-
galloc->buffers = calloc(n_bufs, sizeof(
|
|
515
|
+
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
|
|
374
516
|
WSP_GGML_ASSERT(galloc->buffers != NULL);
|
|
375
517
|
|
|
376
518
|
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct wsp_ggml_dyn_tallocr *));
|
|
@@ -390,7 +532,8 @@ wsp_ggml_gallocr_t wsp_ggml_gallocr_new_n(wsp_ggml_backend_buffer_type_t * bufts
|
|
|
390
532
|
|
|
391
533
|
if (galloc->buf_tallocs[i] == NULL) {
|
|
392
534
|
size_t alignment = wsp_ggml_backend_buft_get_alignment(bufts[i]);
|
|
393
|
-
|
|
535
|
+
size_t max_size = wsp_ggml_backend_buft_get_max_size(bufts[i]);
|
|
536
|
+
galloc->buf_tallocs[i] = wsp_ggml_dyn_tallocr_new(alignment, max_size);
|
|
394
537
|
}
|
|
395
538
|
}
|
|
396
539
|
galloc->n_buffers = n_bufs;
|
|
@@ -418,7 +561,7 @@ void wsp_ggml_gallocr_free(wsp_ggml_gallocr_t galloc) {
|
|
|
418
561
|
}
|
|
419
562
|
}
|
|
420
563
|
if (!freed) {
|
|
421
|
-
|
|
564
|
+
wsp_ggml_vbuffer_free(galloc->buffers[i]);
|
|
422
565
|
}
|
|
423
566
|
}
|
|
424
567
|
if (galloc->buf_tallocs != NULL) {
|
|
@@ -467,7 +610,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
|
|
|
467
610
|
|
|
468
611
|
if (!wsp_ggml_gallocr_is_allocated(galloc, node) && !wsp_ggml_is_view(node)) {
|
|
469
612
|
hn->allocated = true;
|
|
470
|
-
assert(hn->offset == 0);
|
|
613
|
+
assert(hn->addr.offset == 0);
|
|
471
614
|
|
|
472
615
|
// try to reuse a parent's buffer (inplace)
|
|
473
616
|
if (wsp_ggml_op_can_inplace(node->op)) {
|
|
@@ -501,9 +644,9 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
|
|
|
501
644
|
struct hash_node * view_src_hn = wsp_ggml_gallocr_hash_get(galloc, view_src);
|
|
502
645
|
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
|
503
646
|
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
|
504
|
-
assert(view_src_hn->offset == p_hn->offset);
|
|
647
|
+
assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
|
|
505
648
|
hn->buffer_id = p_hn->buffer_id;
|
|
506
|
-
hn->
|
|
649
|
+
hn->addr = p_hn->addr;
|
|
507
650
|
p_hn->allocated = false; // avoid freeing the parent
|
|
508
651
|
view_src_hn->allocated = false;
|
|
509
652
|
return;
|
|
@@ -511,7 +654,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
|
|
|
511
654
|
} else {
|
|
512
655
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
|
513
656
|
hn->buffer_id = p_hn->buffer_id;
|
|
514
|
-
hn->
|
|
657
|
+
hn->addr = p_hn->addr;
|
|
515
658
|
p_hn->allocated = false; // avoid freeing the parent
|
|
516
659
|
return;
|
|
517
660
|
}
|
|
@@ -522,9 +665,8 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
|
|
|
522
665
|
struct wsp_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
|
523
666
|
wsp_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
|
524
667
|
size_t size = wsp_ggml_backend_buft_get_alloc_size(buft, node);
|
|
525
|
-
size_t offset = wsp_ggml_dyn_tallocr_alloc(alloc, size, node);
|
|
526
668
|
hn->buffer_id = buffer_id;
|
|
527
|
-
hn->offset = offset;
|
|
669
|
+
hn->addr = wsp_ggml_dyn_tallocr_alloc(alloc, size, node);
|
|
528
670
|
}
|
|
529
671
|
}
|
|
530
672
|
|
|
@@ -536,12 +678,11 @@ static void wsp_ggml_gallocr_free_node(wsp_ggml_gallocr_t galloc, struct wsp_ggm
|
|
|
536
678
|
}
|
|
537
679
|
|
|
538
680
|
struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
|
|
539
|
-
size_t offset = hn->offset;
|
|
540
681
|
int buffer_id = hn->buffer_id;
|
|
541
682
|
struct wsp_ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
|
542
683
|
wsp_ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
|
543
684
|
size_t size = wsp_ggml_backend_buft_get_alloc_size(buft, node);
|
|
544
|
-
wsp_ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
|
|
685
|
+
wsp_ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
|
|
545
686
|
hn->allocated = false;
|
|
546
687
|
}
|
|
547
688
|
|
|
@@ -692,24 +833,24 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
|
|
|
692
833
|
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
|
693
834
|
if (node->view_src || node->data) {
|
|
694
835
|
node_alloc->dst.buffer_id = -1;
|
|
695
|
-
node_alloc->dst.offset = SIZE_MAX;
|
|
836
|
+
node_alloc->dst.addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
|
|
696
837
|
node_alloc->dst.size_max = 0;
|
|
697
838
|
} else {
|
|
698
839
|
struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
|
|
699
840
|
node_alloc->dst.buffer_id = hn->buffer_id;
|
|
700
|
-
node_alloc->dst.offset = hn->offset;
|
|
841
|
+
node_alloc->dst.addr = hn->addr;
|
|
701
842
|
node_alloc->dst.size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
|
702
843
|
}
|
|
703
844
|
for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
|
|
704
845
|
struct wsp_ggml_tensor * src = node->src[j];
|
|
705
846
|
if (!src || src->view_src || src->data) {
|
|
706
847
|
node_alloc->src[j].buffer_id = -1;
|
|
707
|
-
node_alloc->src[j].offset = SIZE_MAX;
|
|
848
|
+
node_alloc->src[j].addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
|
|
708
849
|
node_alloc->src[j].size_max = 0;
|
|
709
850
|
} else {
|
|
710
851
|
struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, src);
|
|
711
852
|
node_alloc->src[j].buffer_id = hn->buffer_id;
|
|
712
|
-
node_alloc->src[j].offset = hn->offset;
|
|
853
|
+
node_alloc->src[j].addr = hn->addr;
|
|
713
854
|
node_alloc->src[j].size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
|
|
714
855
|
}
|
|
715
856
|
}
|
|
@@ -725,11 +866,11 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
|
|
|
725
866
|
struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, leaf);
|
|
726
867
|
if (leaf->view_src || leaf->data) {
|
|
727
868
|
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
|
728
|
-
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
|
869
|
+
galloc->leaf_allocs[i].leaf.addr = WSP_GGML_BUFFER_ADDRESS_INVALID;
|
|
729
870
|
galloc->leaf_allocs[i].leaf.size_max = 0;
|
|
730
871
|
} else {
|
|
731
872
|
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
|
|
732
|
-
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
|
873
|
+
galloc->leaf_allocs[i].leaf.addr = hn->addr;
|
|
733
874
|
galloc->leaf_allocs[i].leaf.size_max = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
|
734
875
|
}
|
|
735
876
|
}
|
|
@@ -744,7 +885,7 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
|
|
|
744
885
|
}
|
|
745
886
|
}
|
|
746
887
|
|
|
747
|
-
size_t cur_size = galloc->buffers[i] ?
|
|
888
|
+
size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
|
|
748
889
|
size_t new_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
|
749
890
|
|
|
750
891
|
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
|
@@ -753,13 +894,12 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
|
|
|
753
894
|
WSP_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
|
754
895
|
#endif
|
|
755
896
|
|
|
756
|
-
|
|
757
|
-
galloc->buffers[i] =
|
|
897
|
+
wsp_ggml_vbuffer_free(galloc->buffers[i]);
|
|
898
|
+
galloc->buffers[i] = wsp_ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], WSP_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
758
899
|
if (galloc->buffers[i] == NULL) {
|
|
759
900
|
WSP_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
|
760
901
|
return false;
|
|
761
902
|
}
|
|
762
|
-
wsp_ggml_backend_buffer_set_usage(galloc->buffers[i], WSP_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
763
903
|
}
|
|
764
904
|
}
|
|
765
905
|
|
|
@@ -772,11 +912,11 @@ bool wsp_ggml_gallocr_reserve(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph
|
|
|
772
912
|
|
|
773
913
|
static void wsp_ggml_gallocr_init_tensor(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
|
|
774
914
|
int buffer_id = tensor_alloc->buffer_id;
|
|
775
|
-
assert(tensor->data || tensor->view_src ||
|
|
915
|
+
assert(tensor->data || tensor->view_src || wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
|
|
776
916
|
|
|
777
917
|
if (tensor->view_src != NULL) {
|
|
778
918
|
if (tensor->buffer == NULL) {
|
|
779
|
-
assert(tensor_alloc->offset == SIZE_MAX);
|
|
919
|
+
assert(tensor_alloc->addr.offset == SIZE_MAX);
|
|
780
920
|
if (tensor->view_src->buffer == NULL) {
|
|
781
921
|
// this tensor was allocated without ggml-backend
|
|
782
922
|
return;
|
|
@@ -785,11 +925,9 @@ static void wsp_ggml_gallocr_init_tensor(wsp_ggml_gallocr_t galloc, struct wsp_g
|
|
|
785
925
|
}
|
|
786
926
|
} else {
|
|
787
927
|
if (tensor->data == NULL) {
|
|
788
|
-
assert(tensor_alloc->offset != SIZE_MAX);
|
|
789
|
-
assert(
|
|
790
|
-
|
|
791
|
-
void * addr = (char *)base + tensor_alloc->offset;
|
|
792
|
-
wsp_ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
|
|
928
|
+
assert(tensor_alloc->addr.offset != SIZE_MAX);
|
|
929
|
+
assert(wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
|
|
930
|
+
wsp_ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
|
|
793
931
|
} else {
|
|
794
932
|
if (tensor->buffer == NULL) {
|
|
795
933
|
// this tensor was allocated without ggml-backend
|
|
@@ -874,7 +1012,7 @@ bool wsp_ggml_gallocr_alloc_graph(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgr
|
|
|
874
1012
|
// reset buffers
|
|
875
1013
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
|
876
1014
|
if (galloc->buffers[i] != NULL) {
|
|
877
|
-
|
|
1015
|
+
wsp_ggml_vbuffer_reset(galloc->buffers[i]);
|
|
878
1016
|
}
|
|
879
1017
|
}
|
|
880
1018
|
|
|
@@ -917,7 +1055,7 @@ size_t wsp_ggml_gallocr_get_buffer_size(wsp_ggml_gallocr_t galloc, int buffer_id
|
|
|
917
1055
|
}
|
|
918
1056
|
}
|
|
919
1057
|
|
|
920
|
-
return
|
|
1058
|
+
return wsp_ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
|
921
1059
|
}
|
|
922
1060
|
|
|
923
1061
|
// utils
|