whispercpp 1.2.0.2 → 1.3.1

Files changed (135)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
@@ -0,0 +1,1037 @@
1
+ #include "ggml-alloc.h"
2
+ #include "ggml-backend-impl.h"
3
+ #include "ggml.h"
4
+ #include "ggml-impl.h"
5
+ #include <assert.h>
6
+ #include <limits.h>
7
+ #include <stdarg.h>
8
+ #include <stdio.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+
12
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
13
+ #define MAX_FREE_BLOCKS 256
14
+
15
+ //#define GGML_ALLOCATOR_DEBUG
16
+
17
+ //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
18
+ #define AT_PRINTF(...)
19
+
20
+
21
+ static bool ggml_is_view(const struct ggml_tensor * t) {
22
+ return t->view_src != NULL;
23
+ }
24
+
25
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
+ if (a->type != b->type) {
27
+ return false;
28
+ }
29
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
+ if (a->ne[i] != b->ne[i]) {
31
+ return false;
32
+ }
33
+ if (a->nb[i] != b->nb[i]) {
34
+ return false;
35
+ }
36
+ }
37
+ return true;
38
+ }
39
+
40
+ static bool ggml_op_can_inplace(enum ggml_op op) {
41
+ switch (op) {
42
+ case GGML_OP_SCALE:
43
+ case GGML_OP_DIAG_MASK_ZERO:
44
+ case GGML_OP_DIAG_MASK_INF:
45
+ case GGML_OP_ADD:
46
+ case GGML_OP_ADD1:
47
+ case GGML_OP_SUB:
48
+ case GGML_OP_MUL:
49
+ case GGML_OP_DIV:
50
+ case GGML_OP_SQR:
51
+ case GGML_OP_SQRT:
52
+ case GGML_OP_LOG:
53
+ case GGML_OP_UNARY:
54
+ case GGML_OP_ROPE:
55
+ case GGML_OP_RMS_NORM:
56
+ case GGML_OP_SOFT_MAX:
57
+ return true;
58
+
59
+ default:
60
+ return false;
61
+ }
62
+ }
63
+
64
+ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
65
+ assert(alignment && !(alignment & (alignment - 1))); // power of 2
66
+ size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
67
+ return offset + align;
68
+ }
69
+
70
+ // tallocr
71
+
72
+ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
73
+ void * base = ggml_backend_buffer_get_base(buffer);
74
+ size_t align = ggml_backend_buffer_get_alignment(buffer);
75
+
76
+ assert(align && !(align & (align - 1))); // power of 2
77
+
78
+ struct ggml_tallocr talloc = (struct ggml_tallocr) {
79
+ /*.buffer = */ buffer,
80
+ /*.base = */ base,
81
+ /*.alignment = */ align,
82
+ /*.offset = */ aligned_offset(base, 0, align),
83
+ };
84
+ return talloc;
85
+ }
86
+
87
+ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
88
+ size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
89
+ size = GGML_PAD(size, talloc->alignment);
90
+
91
+ if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
92
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
93
+ __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94
+ GGML_ABORT("not enough space in the buffer");
95
+ }
96
+
97
+ void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
98
+ talloc->offset += size;
99
+
100
+ assert(((uintptr_t)addr % talloc->alignment) == 0);
101
+
102
+ ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
103
+ }
104
+
105
+ // dynamic tensor allocator
106
+
107
+ struct free_block {
108
+ size_t offset;
109
+ size_t size;
110
+ };
111
+
112
+ struct ggml_dyn_tallocr {
113
+ size_t alignment;
114
+ int n_free_blocks;
115
+ struct free_block free_blocks[MAX_FREE_BLOCKS];
116
+ size_t max_size;
117
+
118
+ #ifdef GGML_ALLOCATOR_DEBUG
119
+ struct {
120
+ const struct ggml_tensor * tensor;
121
+ size_t offset;
122
+ } allocated_tensors[1024];
123
+ #endif
124
+ };
125
+
126
+ #ifdef GGML_ALLOCATOR_DEBUG
127
+ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
128
+ for (int i = 0; i < 1024; i++) {
129
+ if (alloc->allocated_tensors[i].tensor == NULL) {
130
+ alloc->allocated_tensors[i].tensor = tensor;
131
+ alloc->allocated_tensors[i].offset = offset;
132
+ return;
133
+ }
134
+ }
135
+ GGML_ABORT("out of allocated_tensors");
136
+ }
137
+ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
138
+ for (int i = 0; i < 1024; i++) {
139
+ if (alloc->allocated_tensors[i].offset == offset) {
140
+ alloc->allocated_tensors[i].tensor = NULL;
141
+ return;
142
+ }
143
+ }
144
+ GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
145
+ }
146
+ #endif
147
+
148
+ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
149
+ size = aligned_offset(NULL, size, alloc->alignment);
150
+
151
+ AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
152
+
153
+ size_t max_avail = 0;
154
+
155
+ // find the best fitting free block besides the last block
156
+ int best_fit_block = -1;
157
+ size_t best_fit_size = SIZE_MAX;
158
+ for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
159
+ struct free_block * block = &alloc->free_blocks[i];
160
+ max_avail = MAX(max_avail, block->size);
161
+ if (block->size >= size && block->size <= best_fit_size) {
162
+ best_fit_block = i;
163
+ best_fit_size = block->size;
164
+ }
165
+ }
166
+
167
+ if (best_fit_block == -1) {
168
+ // the last block is our last resort
169
+ struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
170
+ max_avail = MAX(max_avail, block->size);
171
+ if (block->size >= size) {
172
+ best_fit_block = alloc->n_free_blocks - 1;
173
+ } else {
174
+ // this should never happen
175
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
176
+ __func__, size, max_avail);
177
+ GGML_ABORT("not enough space in the buffer");
178
+ }
179
+ }
180
+
181
+ struct free_block * block = &alloc->free_blocks[best_fit_block];
182
+ size_t offset = block->offset;
183
+ block->offset = offset + size;
184
+ block->size -= size;
185
+ if (block->size == 0) {
186
+ // remove block if empty
187
+ alloc->n_free_blocks--;
188
+ for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
189
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
190
+ }
191
+ }
192
+
193
+ AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
194
+
195
+ #ifdef GGML_ALLOCATOR_DEBUG
196
+ add_allocated_tensor(alloc, offset, tensor);
197
+ size_t cur_max = offset + size;
198
+ if (cur_max > alloc->max_size) {
199
+ // sort allocated_tensors by offset
200
+ for (int i = 0; i < 1024; i++) {
201
+ for (int j = i + 1; j < 1024; j++) {
202
+ if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
203
+ const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
204
+ size_t tmp_offset = alloc->allocated_tensors[i].offset;
205
+ alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
206
+ alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
207
+ alloc->allocated_tensors[j].tensor = tmp_tensor;
208
+ alloc->allocated_tensors[j].offset = tmp_offset;
209
+ }
210
+ }
211
+ }
212
+ GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
213
+ for (int i = 0; i < 1024; i++) {
214
+ if (alloc->allocated_tensors[i].tensor) {
215
+ GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
216
+ alloc->allocated_tensors[i].offset,
217
+ alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
218
+ ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
219
+ }
220
+ }
221
+ GGML_LOG_DEBUG("\n");
222
+ }
223
+ #endif
224
+
225
+ alloc->max_size = MAX(alloc->max_size, offset + size);
226
+
227
+ return offset;
228
+
229
+ GGML_UNUSED(tensor);
230
+ }
231
+
232
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
233
+ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
234
+ size = aligned_offset(NULL, size, alloc->alignment);
235
+
236
+ AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
237
+
238
+ #ifdef GGML_ALLOCATOR_DEBUG
239
+ remove_allocated_tensor(alloc, offset, tensor);
240
+ #endif
241
+
242
+ // see if we can merge with an existing block
243
+ for (int i = 0; i < alloc->n_free_blocks; i++) {
244
+ struct free_block * block = &alloc->free_blocks[i];
245
+ // check if ptr is at the end of the block
246
+ if (block->offset + block->size == offset) {
247
+ block->size += size;
248
+ // check if we can merge with the next block
249
+ if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
250
+ block->size += alloc->free_blocks[i+1].size;
251
+ alloc->n_free_blocks--;
252
+ for (int j = i+1; j < alloc->n_free_blocks; j++) {
253
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
254
+ }
255
+ }
256
+ return;
257
+ }
258
+ // check if ptr is at the beginning of the block
259
+ if (offset + size == block->offset) {
260
+ block->offset = offset;
261
+ block->size += size;
262
+ // check if we can merge with the previous block
263
+ if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
264
+ alloc->free_blocks[i-1].size += block->size;
265
+ alloc->n_free_blocks--;
266
+ for (int j = i; j < alloc->n_free_blocks; j++) {
267
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
268
+ }
269
+ }
270
+ return;
271
+ }
272
+ }
273
+ // otherwise, add a new block
274
+ GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
275
+ // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
276
+ int insert_pos = 0;
277
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
278
+ insert_pos++;
279
+ }
280
+ // shift all blocks from insert_pos onward to make room for the new block
281
+ for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
282
+ alloc->free_blocks[i] = alloc->free_blocks[i-1];
283
+ }
284
+ // insert the new block
285
+ alloc->free_blocks[insert_pos].offset = offset;
286
+ alloc->free_blocks[insert_pos].size = size;
287
+ alloc->n_free_blocks++;
288
+
289
+ GGML_UNUSED(tensor);
290
+ }
291
+
292
+ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
293
+ alloc->n_free_blocks = 1;
294
+ alloc->free_blocks[0].offset = 0;
295
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
296
+ alloc->max_size = 0;
297
+
298
+ #ifdef GGML_ALLOCATOR_DEBUG
299
+ for (int i = 0; i < 1024; i++) {
300
+ alloc->allocated_tensors[i].tensor = NULL;
301
+ }
302
+ #endif
303
+ }
304
+
305
+ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
306
+ struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
307
+
308
+ *alloc = (struct ggml_dyn_tallocr) {
309
+ /*.alignment = */ alignment,
310
+ /*.n_free_blocks = */ 0,
311
+ /*.free_blocks = */ {{0}},
312
+ /*.max_size = */ 0,
313
+ #ifdef GGML_ALLOCATOR_DEBUG
314
+ /*.allocated_tensors = */ {{0}},
315
+ #endif
316
+ };
317
+
318
+ ggml_dyn_tallocr_reset(alloc);
319
+
320
+ return alloc;
321
+ }
322
+
323
+ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
324
+ free(alloc);
325
+ }
326
+
327
+ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
328
+ return alloc->max_size;
329
+ }
330
+
331
+
332
+ /////////////////////////////////////
333
+
334
+ // graph allocator
335
+
336
+ struct hash_node {
337
+ int n_children;
338
+ int n_views;
339
+ int buffer_id;
340
+ size_t offset; // offset within the buffer
341
+ bool allocated;
342
+ };
343
+
344
+ struct tensor_alloc {
345
+ int buffer_id;
346
+ size_t offset;
347
+ size_t size_max; // 0 = pre-allocated, unused, or view
348
+ };
349
+
350
+ struct leaf_alloc {
351
+ struct tensor_alloc leaf;
352
+ };
353
+
354
+ struct node_alloc {
355
+ struct tensor_alloc dst;
356
+ struct tensor_alloc src[GGML_MAX_SRC];
357
+ };
358
+
359
+ struct ggml_gallocr {
360
+ ggml_backend_buffer_type_t * bufts; // [n_buffers]
361
+ ggml_backend_buffer_t * buffers; // [n_buffers]
362
+ struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
363
+ int n_buffers;
364
+
365
+ struct ggml_hash_set hash_set;
366
+ struct hash_node * hash_values; // [hash_set.size]
367
+
368
+ struct node_alloc * node_allocs; // [n_nodes]
369
+ int n_nodes;
370
+
371
+ struct leaf_alloc * leaf_allocs; // [n_leafs]
372
+ int n_leafs;
373
+ };
374
+
375
+ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
376
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
377
+ GGML_ASSERT(galloc != NULL);
378
+
379
+ galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
380
+ GGML_ASSERT(galloc->bufts != NULL);
381
+
382
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
383
+ GGML_ASSERT(galloc->buffers != NULL);
384
+
385
+ galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
386
+ GGML_ASSERT(galloc->buf_tallocs != NULL);
387
+
388
+ for (int i = 0; i < n_bufs; i++) {
389
+ galloc->bufts[i] = bufts[i];
390
+ galloc->buffers[i] = NULL;
391
+
392
+ // check if the same buffer type is used multiple times and reuse the same allocator
393
+ for (int j = 0; j < i; j++) {
394
+ if (bufts[i] == bufts[j]) {
395
+ galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
396
+ break;
397
+ }
398
+ }
399
+
400
+ if (galloc->buf_tallocs[i] == NULL) {
401
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
402
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
403
+ }
404
+ }
405
+ galloc->n_buffers = n_bufs;
406
+
407
+ return galloc;
408
+ }
409
+
410
+ ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
411
+ return ggml_gallocr_new_n(&buft, 1);
412
+ }
413
+
414
+ void ggml_gallocr_free(ggml_gallocr_t galloc) {
415
+ if (galloc == NULL) {
416
+ return;
417
+ }
418
+
419
+ for (int i = 0; i < galloc->n_buffers; i++) {
420
+ if (galloc->buffers != NULL) {
421
+ // skip if already freed
422
+ bool freed = false;
423
+ for (int j = 0; j < i; j++) {
424
+ if (galloc->buffers[j] == galloc->buffers[i]) {
425
+ freed = true;
426
+ break;
427
+ }
428
+ }
429
+ if (!freed) {
430
+ ggml_backend_buffer_free(galloc->buffers[i]);
431
+ }
432
+ }
433
+ if (galloc->buf_tallocs != NULL) {
434
+ // skip if already freed
435
+ bool freed = false;
436
+ for (int j = 0; j < i; j++) {
437
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
438
+ freed = true;
439
+ break;
440
+ }
441
+ }
442
+ if (!freed) {
443
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
444
+ }
445
+ }
446
+ }
447
+
448
+ ggml_hash_set_free(&galloc->hash_set);
449
+ free(galloc->hash_values);
450
+ free(galloc->bufts);
451
+ free(galloc->buffers);
452
+ free(galloc->buf_tallocs);
453
+ free(galloc->node_allocs);
454
+ free(galloc->leaf_allocs);
455
+ free(galloc);
456
+ }
457
+
458
+ typedef struct ggml_gallocr * ggml_gallocr_t;
459
+
460
+ static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
461
+ size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
462
+ return &galloc->hash_values[i];
463
+ }
464
+
465
+ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
466
+ return ggml_gallocr_hash_get(galloc, t)->allocated;
467
+ }
468
+
469
+ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
470
+ return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
471
+ }
472
+
473
+ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
474
+ GGML_ASSERT(buffer_id >= 0);
475
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
476
+
477
+ if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
478
+ hn->allocated = true;
479
+ assert(hn->offset == 0);
480
+
481
+ // try to reuse a parent's buffer (inplace)
482
+ if (ggml_op_can_inplace(node->op)) {
483
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
484
+ struct ggml_tensor * parent = node->src[i];
485
+ if (parent == NULL) {
486
+ continue;
487
+ }
488
+
489
+ // if the node's data is external, then we cannot re-use it
490
+ if (!ggml_gallocr_is_own(galloc, parent)) {
491
+ AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
492
+ continue;
493
+ }
494
+
495
+ // outputs cannot be reused
496
+ if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
497
+ AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
498
+ continue;
499
+ }
500
+
501
+ if (!ggml_are_same_layout(node, parent)) {
502
+ AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
503
+ continue;
504
+ }
505
+
506
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
507
+ if (p_hn->n_children == 1 && p_hn->n_views == 0) {
508
+ if (ggml_is_view(parent)) {
509
+ struct ggml_tensor * view_src = parent->view_src;
510
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
511
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
512
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
513
+ assert(view_src_hn->offset == p_hn->offset);
514
+ hn->buffer_id = p_hn->buffer_id;
515
+ hn->offset = p_hn->offset;
516
+ p_hn->allocated = false; // avoid freeing the parent
517
+ view_src_hn->allocated = false;
518
+ return;
519
+ }
520
+ } else {
521
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
522
+ hn->buffer_id = p_hn->buffer_id;
523
+ hn->offset = p_hn->offset;
524
+ p_hn->allocated = false; // avoid freeing the parent
525
+ return;
526
+ }
527
+ }
528
+ }
529
+ }
530
+ // allocate tensor from the buffer
531
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
532
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
533
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
534
+ size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
535
+ hn->buffer_id = buffer_id;
536
+ hn->offset = offset;
537
+ }
538
+ }
539
+
540
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
541
+ // graph outputs are never freed
542
+ if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
543
+ AT_PRINTF("not freeing output %s\n", node->name);
544
+ return;
545
+ }
546
+
547
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
548
+ size_t offset = hn->offset;
549
+ int buffer_id = hn->buffer_id;
550
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
551
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
552
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
553
+ ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
554
+ hn->allocated = false;
555
+ }
556
+
557
+ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
558
+ return node_buffer_ids ? node_buffer_ids[i] : 0;
559
+ }
560
+
561
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
562
+ // clear hash tables
563
+ ggml_hash_set_reset(&galloc->hash_set);
564
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
565
+
566
+ // allocate leafs
567
+ // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
568
+ for (int i = 0; i < graph->n_leafs; i++) {
569
+ struct ggml_tensor * leaf = graph->leafs[i];
570
+ ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
571
+ }
572
+
573
+ // count number of children and views
574
+ // allocate other graph inputs and leafs first to avoid overwriting them
575
+ for (int i = 0; i < graph->n_nodes; i++) {
576
+ struct ggml_tensor * node = graph->nodes[i];
577
+
578
+ // TODO: better way to add external dependencies
579
+ // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
580
+ // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
581
+ // itself is never used and should not be considered a dependency
582
+ if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
583
+ struct ggml_tensor * view_src = node->view_src;
584
+ ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
585
+ }
586
+
587
+ if (node->flags & GGML_TENSOR_FLAG_INPUT) {
588
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
589
+ }
590
+
591
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
592
+ struct ggml_tensor * src = node->src[j];
593
+ if (src == NULL) {
594
+ continue;
595
+ }
596
+
597
+ ggml_gallocr_hash_get(galloc, src)->n_children += 1;
598
+
599
+ // allocate explicit inputs
600
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
601
+ ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
602
+ }
603
+ }
604
+ }
605
+
606
+ // allocate tensors
607
+ for (int i = 0; i < graph->n_nodes; i++) {
608
+ struct ggml_tensor * node = graph->nodes[i];
609
+ int buffer_id = get_node_buffer_id(node_buffer_ids, i);
610
+
611
+ // allocate parents (only leafs need to be allocated at this point)
612
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
613
+ struct ggml_tensor * parent = node->src[j];
614
+ if (parent == NULL) {
615
+ continue;
616
+ }
617
+ ggml_gallocr_allocate_node(galloc, parent, buffer_id);
618
+ }
619
+
620
+ // allocate node
621
+ ggml_gallocr_allocate_node(galloc, node, buffer_id);
622
+
623
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
624
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
625
+ struct ggml_tensor * parent = node->src[j];
626
+ if (parent == NULL) {
627
+ continue;
628
+ }
629
+ AT_PRINTF("%s", parent->name);
630
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
631
+ AT_PRINTF(", ");
632
+ }
633
+ }
634
+ AT_PRINTF("\n");
635
+
636
+ // update parents
637
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
638
+ struct ggml_tensor * parent = node->src[j];
639
+ if (parent == NULL) {
640
+ continue;
641
+ }
642
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
643
+ p_hn->n_children -= 1;
644
+
645
+ AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
646
+ parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
647
+
648
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
649
+ if (ggml_is_view(parent)) {
650
+ struct ggml_tensor * view_src = parent->view_src;
651
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
652
+ view_src_hn->n_views -= 1;
653
+ AT_PRINTF("view_src %s: %d children, %d views\n",
654
+ view_src->name, view_src_hn->n_children, view_src_hn->n_views);
655
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
656
+ ggml_gallocr_free_node(galloc, view_src);
657
+ }
658
+ }
659
+ else if (p_hn->allocated) {
660
+ ggml_gallocr_free_node(galloc, parent);
661
+ }
662
+ }
663
+ AT_PRINTF("\n");
664
+ }
665
+ }
666
+ }
667
+
668
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
669
+ size_t min_hash_size = graph->n_nodes + graph->n_leafs;
670
+ // add 25% margin to avoid hash collisions
671
+ min_hash_size += min_hash_size / 4;
672
+
673
+ // initialize hash table
674
+ if (galloc->hash_set.size < min_hash_size) {
675
+ ggml_hash_set_free(&galloc->hash_set);
676
+ galloc->hash_set = ggml_hash_set_new(min_hash_size);
677
+ GGML_ASSERT(galloc->hash_set.keys != NULL);
678
+
679
+ free(galloc->hash_values);
680
+ galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
681
+ GGML_ASSERT(galloc->hash_values != NULL);
682
+ }
683
+
684
+ // reset allocators
685
+ for (int i = 0; i < galloc->n_buffers; i++) {
686
+ ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
687
+ }
688
+
689
+ // allocate in hash table
690
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
691
+
692
+ // set the node_allocs from the hash table
693
+ if (galloc->n_nodes < graph->n_nodes) {
694
+ free(galloc->node_allocs);
695
+ galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
696
+ GGML_ASSERT(galloc->node_allocs != NULL);
697
+ }
698
+ galloc->n_nodes = graph->n_nodes;
699
+ for (int i = 0; i < graph->n_nodes; i++) {
700
+ struct ggml_tensor * node = graph->nodes[i];
701
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
702
+ if (node->view_src || node->data) {
703
+ node_alloc->dst.buffer_id = -1;
704
+ node_alloc->dst.offset = SIZE_MAX;
705
+ node_alloc->dst.size_max = 0;
706
+ } else {
707
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
708
+ node_alloc->dst.buffer_id = hn->buffer_id;
709
+ node_alloc->dst.offset = hn->offset;
710
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
711
+ }
712
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
713
+ struct ggml_tensor * src = node->src[j];
714
+ if (!src || src->view_src || src->data) {
715
+ node_alloc->src[j].buffer_id = -1;
716
+ node_alloc->src[j].offset = SIZE_MAX;
717
+ node_alloc->src[j].size_max = 0;
718
+ } else {
719
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
720
+ node_alloc->src[j].buffer_id = hn->buffer_id;
721
+ node_alloc->src[j].offset = hn->offset;
722
+ node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
723
+ }
724
+ }
725
+ }
726
+ if (galloc->n_leafs < graph->n_leafs) {
727
+ free(galloc->leaf_allocs);
728
+ galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
729
+ GGML_ASSERT(galloc->leaf_allocs != NULL);
730
+ }
731
+ galloc->n_leafs = graph->n_leafs;
732
+ for (int i = 0; i < graph->n_leafs; i++) {
733
+ struct ggml_tensor * leaf = graph->leafs[i];
734
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
735
+ if (leaf->view_src || leaf->data) {
736
+ galloc->leaf_allocs[i].leaf.buffer_id = -1;
737
+ galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
738
+ galloc->leaf_allocs[i].leaf.size_max = 0;
739
+ } else {
740
+ galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
741
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
742
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
743
+ }
744
+ }
745
+
746
+ // reallocate buffers if needed
747
+ for (int i = 0; i < galloc->n_buffers; i++) {
748
+ // if the buffer type is used multiple times, we reuse the same buffer
749
+ for (int j = 0; j < i; j++) {
750
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
751
+ galloc->buffers[i] = galloc->buffers[j];
752
+ break;
753
+ }
754
+ }
755
+
756
+ size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
757
+ size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
758
+
759
+ // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
760
+ if (new_size > cur_size || galloc->buffers[i] == NULL) {
761
+ #ifndef NDEBUG
762
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
763
+ #endif
764
+
765
+ ggml_backend_buffer_free(galloc->buffers[i]);
766
+ galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
767
+ if (galloc->buffers[i] == NULL) {
768
+ GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
769
+ return false;
770
+ }
771
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
772
+ }
773
+ }
774
+
775
+ return true;
776
+ }
777
+
778
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
779
+ return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
780
+ }
781
+
782
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
783
+ int buffer_id = tensor_alloc->buffer_id;
784
+ assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
785
+
786
+ if (tensor->view_src != NULL) {
787
+ if (tensor->buffer == NULL) {
788
+ assert(tensor_alloc->offset == SIZE_MAX);
789
+ if (tensor->view_src->buffer == NULL) {
790
+ // this tensor was allocated without ggml-backend
791
+ return;
792
+ }
793
+ ggml_backend_view_init(tensor);
794
+ }
795
+ } else {
796
+ if (tensor->data == NULL) {
797
+ assert(tensor_alloc->offset != SIZE_MAX);
798
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
799
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
800
+ void * addr = (char *)base + tensor_alloc->offset;
801
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
802
+ } else {
803
+ if (tensor->buffer == NULL) {
804
+ // this tensor was allocated without ggml-backend
805
+ return;
806
+ }
807
+ }
808
+ }
809
+ }
810
+
811
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
812
+ size_t node_size = 0;
813
+ if (!node->data && !node->view_src) {
814
+ GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
815
+ node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
816
+ }
817
+ return talloc->size_max >= node_size;
818
+ }
819
+
820
+ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
821
+ if (galloc->n_nodes != graph->n_nodes) {
822
+ #ifndef NDEBUG
823
+ GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
824
+ #endif
825
+ return true;
826
+ }
827
+
828
+ if (galloc->n_leafs != graph->n_leafs) {
829
+ #ifndef NDEBUG
830
+ GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
831
+ #endif
832
+ return true;
833
+ }
834
+
835
+ for (int i = 0; i < graph->n_nodes; i++) {
836
+ struct ggml_tensor * node = graph->nodes[i];
837
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
838
+
839
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
840
+ #ifndef NDEBUG
841
+ GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
842
+ #endif
843
+ return true;
844
+ }
845
+
846
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
847
+ struct ggml_tensor * src = node->src[j];
848
+ if (src == NULL) {
849
+ continue;
850
+ }
851
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
852
+ #ifndef NDEBUG
853
+ GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
854
+ #endif
855
+ return true;
856
+ }
857
+ }
858
+ }
859
+
860
+ return false;
861
+ }
862
+
863
+ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
864
+ if (ggml_gallocr_needs_realloc(galloc, graph)) {
865
+ if (galloc->n_buffers == 1) {
866
+ #ifndef NDEBUG
867
+ GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
868
+ #endif
869
+ if (!ggml_gallocr_reserve(galloc, graph)) {
870
+ return false;
871
+ }
872
+ } else {
873
+ #ifndef NDEBUG
874
+ GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
875
+ #endif
876
+ return false;
877
+ }
878
+ }
879
+
880
+ // reset buffers
881
+ for (int i = 0; i < galloc->n_buffers; i++) {
882
+ if (galloc->buffers[i] != NULL) {
883
+ ggml_backend_buffer_reset(galloc->buffers[i]);
884
+ }
885
+ }
886
+
887
+ // allocate the graph tensors from the previous assignments
888
+ // leafs
889
+ for (int i = 0; i < graph->n_leafs; i++) {
890
+ struct ggml_tensor * leaf = graph->leafs[i];
891
+ struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
892
+ ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
893
+ }
894
+ // nodes
895
+ for (int i = 0; i < graph->n_nodes; i++) {
896
+ struct ggml_tensor * node = graph->nodes[i];
897
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
898
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
899
+ struct ggml_tensor * src = node->src[j];
900
+ if (src == NULL) {
901
+ continue;
902
+ }
903
+ ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
904
+ }
905
+ ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
906
+ }
907
+
908
+ return true;
909
+ }
910
+
911
+ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
912
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
913
+
914
+ if (galloc->buffers[buffer_id] == NULL) {
915
+ return 0;
916
+ }
917
+
918
+ for (int i = 0; i < buffer_id; i++) {
919
+ if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
920
+ // this buffer is the same as a previous one due to the same buffer type being used multiple times
921
+ // only return the buffer size the first time it appears to avoid double counting
922
+ return 0;
923
+ }
924
+ }
925
+
926
+ return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
927
+ }
928
+
929
+ // utils
930
+
931
+ static bool alloc_tensor_range(struct ggml_context * ctx,
932
+ struct ggml_tensor * first, struct ggml_tensor * last,
933
+ ggml_backend_buffer_type_t buft, size_t size,
934
+ ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
935
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
936
+ if (buffer == NULL) {
937
+ #ifndef NDEBUG
938
+ GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
939
+ #endif
940
+ for (size_t i = 0; i < *n_buffers; i++) {
941
+ ggml_backend_buffer_free((*buffers)[i]);
942
+ }
943
+ free(*buffers);
944
+ return false;
945
+ }
946
+
947
+ struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
948
+
949
+ for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
950
+ if (t->data == NULL) {
951
+ if (t->view_src == NULL) {
952
+ ggml_tallocr_alloc(&tallocr, t);
953
+ } else if (t->buffer == NULL) {
954
+ ggml_backend_view_init(t);
955
+ }
956
+ } else {
957
+ if (t->view_src != NULL && t->buffer == NULL) {
958
+ // view of a pre-allocated tensor
959
+ ggml_backend_view_init(t);
960
+ }
961
+ }
962
+ }
963
+
964
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
965
+ (*buffers)[(*n_buffers)++] = buffer;
966
+
967
+ return true;
968
+ }
969
+
970
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
971
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
972
+
973
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
974
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
975
+
976
+ ggml_backend_buffer_t * buffers = NULL;
977
+ size_t n_buffers = 0;
978
+
979
+ size_t cur_buf_size = 0;
980
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
981
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
982
+ size_t this_size = 0;
983
+ if (t->data == NULL && t->view_src == NULL) {
984
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
985
+ }
986
+
987
+ if (this_size > max_size) {
988
+ GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
989
+ __func__, t->name,
990
+ ggml_backend_buft_name(buft),
991
+ this_size, max_size);
992
+ for (size_t i = 0; i < n_buffers; i++) {
993
+ ggml_backend_buffer_free(buffers[i]);
994
+ }
995
+ free(buffers);
996
+ return NULL;
997
+ }
998
+
999
+ if ((cur_buf_size + this_size) > max_size) {
1000
+ // allocate tensors in the current buffer
1001
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
1002
+ return NULL;
1003
+ }
1004
+ first = t;
1005
+ cur_buf_size = this_size;
1006
+ } else {
1007
+ cur_buf_size += this_size;
1008
+ }
1009
+ }
1010
+
1011
+ // allocate remaining tensors
1012
+ if (cur_buf_size > 0) {
1013
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
1014
+ return NULL;
1015
+ }
1016
+ }
1017
+
1018
+ if (n_buffers == 0) {
1019
+ #ifndef NDEBUG
1020
+ GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
1021
+ #endif
1022
+ return NULL;
1023
+ }
1024
+
1025
+ ggml_backend_buffer_t buffer;
1026
+ if (n_buffers == 1) {
1027
+ buffer = buffers[0];
1028
+ } else {
1029
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
1030
+ }
1031
+ free(buffers);
1032
+ return buffer;
1033
+ }
1034
+
1035
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
1036
+ return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
1037
+ }
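
The new `ggml-alloc.c` above defines the graph allocator (`ggml_gallocr_t`) that the vendored ggml code uses to plan tensor placement for a compute graph before it runs. Below is a minimal usage sketch built only from the functions introduced in this file plus a few core ggml calls (`ggml_init`, `ggml_new_tensor_1d`, `ggml_add`, `ggml_new_graph`, `ggml_build_forward_expand`) and the CPU buffer type declared in the vendored `ggml-backend.h`; the tiny example graph and the context sizing are illustrative assumptions, not part of this changeset.

```c
// Sketch: plan and allocate a small compute graph with the graph allocator
// from ggml-alloc.c. The one-op graph and CPU buffer type are assumptions
// made for illustration; they are not taken from this diff.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include <stdio.h>

int main(void) {
    // no_alloc = true: the context holds only tensor metadata; the graph
    // allocator decides where the actual data lives
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 16 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // reserve measures the worst-case buffer size, alloc_graph assigns the
    // actual offsets computed by the dynamic tensor allocator
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    if (!ggml_gallocr_reserve(galloc, gf) || !ggml_gallocr_alloc_graph(galloc, gf)) {
        fprintf(stderr, "graph allocation failed\n");
        return 1;
    }
    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    ggml_gallocr_free(galloc);
    ggml_free(ctx);
    return 0;
}
```

With a single buffer type, `ggml_gallocr_alloc_graph` can also grow the buffer on its own (see `ggml_gallocr_needs_realloc` above); calling `ggml_gallocr_reserve` explicitly simply makes the sizing step visible.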