llama_cpp 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9e38c82f6ce7404a78b3ecdbc9574ae860322e6945499f0c4a905956bcbd2be7
+  data.tar.gz: 4a5effb6fcf3182baad091717bc510176eb127ccd660342ce0cc46bf2d392b4a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c471bd6c6afee142945d03da1c4908355fe900a5f0c259583b7b65f97d495d07c5397d1b551da888a5970170944596959ddef73d2df803acf001b8d079d0affb
+  data.tar.gz: 99cbb2d978723f9814d8ac7163f03c642a1ac6cabbd6cf09d003f563c629563a920d909ab797729f1e233f30d5776bf9f70f4c473919e5bf101d3e3f5fd6e938
data/CHANGELOG.md
CHANGED
@@ -1,7 +1,19 @@
+## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
+
+- Bump bundled llama.cpp from b1198 to b1.
+- Add `n_ctx_train` method to Model and Context.
+- Add nvcc option to avoid link error ([#8](https://github.com/yoshoku/llama_cpp.rb/pull/8)).
+- Set encoding on output of `generate` module function to avoid encoding error ([#9](https://github.com/yoshoku/llama_cpp.rb/pull/9)).
+- Add `only_copy` option to ModelQuantizeParams.
+
+## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+- Bump bundled llama.cpp from b1140 to b1198.
+
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1060 to b1140.
 - Rename `token_to_str` method on Context to `token_to_piece` method.
 - Rename `token_to_str` method on Model to `token_to_piece` method.
 - Rename `type` method on Model to `desc` method.
@@ -10,7 +22,7 @@
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
-- Bump bundled llama.cpp from master-097e121 to
+- Bump bundled llama.cpp from master-097e121 to b1060.
 - Support new file format GGUF.
 - You should re-convert / re-quantize your model files.
 - Remove vocab methods.
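The `generate` encoding fix listed above lands in `data/lib/llama_cpp.rb`, whose hunk is not reproduced in this diff. A minimal sketch of what it means for callers, assuming the gem's README-style `Model`/`Context` construction and `generate(context, prompt)` module function (the model path is a placeholder):

```ruby
require 'llama_cpp'

# Sketch only: constructor arguments and the generate signature are assumed
# from the gem's README-style usage, not taken from this diff.
params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

output = LLaMACpp.generate(context, 'Hello, my name is')
puts output.encoding # 0.5.2 sets an explicit encoding on the returned string,
                     # so it no longer triggers encoding errors downstream
```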
data/examples/prompt_jp.txt
CHANGED
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -112,7 +112,7 @@ create_makefile('llama_cpp/llama_cpp')
 if with_config('cublas')
   File.open('Makefile', 'a') do |f|
     f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
-    f.puts "\tnvcc -arch=native -c -o $@ $<"
+    f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
   end
 end
 
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -692,6 +692,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
   }
 
 private:
@@ -752,6 +754,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
   }
+
+  // only_copy
+  static VALUE _llama_model_quantize_params_set_only_copy(VALUE self, VALUE only_copy) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.only_copy = RTEST(only_copy) ? true : false;
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_only_copy(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
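A short Ruby sketch of the new accessor pair; the `LLaMACpp.model_quantize` call and its keyword arguments are assumed from the gem's existing quantization API and are not part of this diff:

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.only_copy = true   # maps to llama_model_quantize_params.only_copy
puts params.only_copy     # => true

# Assumed existing API: with only_copy set, llama.cpp copies tensors into the
# output file as-is instead of re-quantizing them.
LLaMACpp.model_quantize(input_path: 'in.gguf', output_path: 'out.gguf', params: params)
```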
@@ -810,6 +824,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
@@ -971,6 +986,11 @@ private:
     return INT2NUM(llama_model_n_ctx(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_ctx_train(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_model_n_ctx_train(ptr->model));
+  }
+
   static VALUE _llama_model_get_model_n_embd(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_model_n_embd(ptr->model));
@@ -1341,6 +1361,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ctx_train", RUBY_METHOD_FUNC(_llama_context_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1733,6 +1754,15 @@ private:
     return INT2NUM(llama_n_ctx(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ctx_train(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_n_ctx_train(ptr->ctx));
+  }
+
   static VALUE _llama_context_n_embd(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
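Both bindings forward to the corresponding llama.cpp getters (`llama_model_n_ctx_train` and `llama_n_ctx_train`). A minimal usage sketch, with the `Model`/`Context` construction assumed from the gem's existing API and a placeholder model path:

```ruby
require 'llama_cpp'

# Sketch only: construction follows the gem's pre-existing API.
params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

puts model.n_ctx          # context length configured for this session
puts model.n_ctx_train    # context length the model was trained with (new in 0.5.2)
puts context.n_ctx_train  # same value; raises RuntimeError if the context is not initialized
```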
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -6,6 +6,26 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/types.h>
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,19 +119,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor)
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -135,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +202,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        return;
    }
 
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -281,17 +306,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-//
-
-
-
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data = */
-        /*.size = */
+        /*.data = */ base_addr,
+        /*.size = */ size,
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
@@ -311,6 +387,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -380,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
         }
 
         // if the node's data is external, then we cannot re-use it
-        if ((
-            (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+        if (ggml_allocr_is_own(alloc, parent) == false) {
             AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
             continue;
         }
@@ -415,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             AT_PRINTF("\n");
         }
 
-
         // update parents
         // update immediately if there is no parse_seq
         // update only at barriers if there is parse_seq
-        if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+        if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
            int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
            int update_end = alloc->parse_seq_len ? ind : ind + 1;
            for (int i = update_start; i < update_end; i++) {
@@ -521,12 +598,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                     view_src_hn->n_views -= 1;
                     AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-
+                        ggml_allocr_free_tensor(alloc, view_src);
                     }
                 }
                 else {
                     if (parent->data != node->data) {
-
+                        ggml_allocr_free_tensor(alloc, parent);
                     }
                 }
             }
@@ -543,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             for (int i = 0; outputs[g][i] != NULL; i++) {
                 struct ggml_tensor * output = outputs[g][i];
                 AT_PRINTF("output: %s\n", output->name);
-
+                ggml_allocr_free_tensor(alloc, output);
             }
         }
     }
@@ -552,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }