llama_cpp 0.5.0 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9e38c82f6ce7404a78b3ecdbc9574ae860322e6945499f0c4a905956bcbd2be7
+  data.tar.gz: 4a5effb6fcf3182baad091717bc510176eb127ccd660342ce0cc46bf2d392b4a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c471bd6c6afee142945d03da1c4908355fe900a5f0c259583b7b65f97d495d07c5397d1b551da888a5970170944596959ddef73d2df803acf001b8d079d0affb
+  data.tar.gz: 99cbb2d978723f9814d8ac7163f03c642a1ac6cabbd6cf09d003f563c629563a920d909ab797729f1e233f30d5776bf9f70f4c473919e5bf101d3e3f5fd6e938
data/CHANGELOG.md
CHANGED
@@ -1,7 +1,19 @@
+## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
+
+- Bump bundled llama.cpp from b1198 to b1.
+- Add `n_ctx_train` method to Model and Context.
+- Add nvcc option to avoid link error ([#8](https://github.com/yoshoku/llama_cpp.rb/pull/8)).
+- Set encoding on output of `generate` module function to avoid encoding error ([#9](https://github.com/yoshoku/llama_cpp.rb/pull/9)).
+- Add `only_copy` option to ModelQuantizeParams.
+
+## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+- Bump bundled llama.cpp from b1140 to b1198.
+
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1060 to b1140.
 - Rename `token_to_str` method on Context to `token_to_piece` method.
 - Rename `token_to_str` method on Model to `token_to_piece` method.
 - Rename `type` method on Model to `desc` method.
@@ -10,7 +22,7 @@
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
-- Bump bundled llama.cpp from master-097e121 to
+- Bump bundled llama.cpp from master-097e121 to b1060.
 - Support new file format GGUF.
   - You should re-convert / re-quantize your model files.
 - Remove vocab methods.
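For reference, a minimal Ruby sketch of the 0.5.2 additions listed in the changelog above. The model path is a placeholder, and the constructor keyword arguments reflect the gem's 0.5.x API rather than anything shown in this diff:

```ruby
require 'llama_cpp'

# Placeholder model path; constructor keywords follow the gem's 0.5.x API.
params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: 'models/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# New in 0.5.2: the context length the model was trained with,
# available on both Model and Context.
puts model.n_ctx_train
puts context.n_ctx_train

# The generate module function now sets an encoding on its output, so
# multi-byte prompts (see examples/prompt_jp.txt) no longer raise encoding errors.
puts LLaMACpp.generate(context, 'Hello, World.')
```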
data/examples/prompt_jp.txt
CHANGED
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -112,7 +112,7 @@ create_makefile('llama_cpp/llama_cpp')
 if with_config('cublas')
   File.open('Makefile', 'a') do |f|
     f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
-    f.puts "\tnvcc -arch=native -c -o $@ $<"
+    f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
   end
 end
 
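The rule above is only appended when the extension is configured with cuBLAS. A trimmed sketch of that extconf.rb pattern (not the full file), with a note on why the extra flags avoid the link error mentioned in the changelog:

```ruby
# Sketch of the extconf.rb pattern: when the gem is built with `--with-cublas`,
# an extra Makefile rule compiles ggml-cuda.cu with nvcc. `-Xcompiler -fPIC`
# forwards -fPIC to the host compiler so the object can be linked into the
# Ruby shared extension without relocation errors.
require 'mkmf'

create_makefile('llama_cpp/llama_cpp')

if with_config('cublas')
  File.open('Makefile', 'a') do |f|
    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
    f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
  end
end
```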
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -692,6 +692,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
   }
 
 private:
@@ -752,6 +754,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
   }
+
+  // only_copy
+  static VALUE _llama_model_quantize_params_set_only_copy(VALUE self, VALUE only_copy) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.only_copy = RTEST(only_copy) ? true : false;
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_only_copy(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -810,6 +824,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
@@ -971,6 +986,11 @@ private:
     return INT2NUM(llama_model_n_ctx(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_ctx_train(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_model_n_ctx_train(ptr->model));
+  }
+
   static VALUE _llama_model_get_model_n_embd(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_model_n_embd(ptr->model));
@@ -1341,6 +1361,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ctx_train", RUBY_METHOD_FUNC(_llama_context_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1733,6 +1754,15 @@ private:
     return INT2NUM(llama_n_ctx(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ctx_train(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_n_ctx_train(ptr->ctx));
+  }
+
   static VALUE _llama_context_n_embd(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
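The first two hunks above also expose `only_copy` on ModelQuantizeParams. A hedged Ruby sketch of using it during quantization; the file names are placeholders, and the module-level `model_quantize` call with its keyword arguments is an assumption about the gem's API rather than something shown in this diff:

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
# New in 0.5.2: when only_copy is true, tensors are copied to the output
# file as-is instead of being requantized (useful for repacking a model).
params.only_copy = true

# Assumed module-level API; input/output paths are placeholders.
LLaMACpp.model_quantize(input_path: 'in.gguf', output_path: 'out.gguf', params: params)
```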
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -6,6 +6,26 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/types.h>
+#include <sys/mman.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,19 +119,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor)
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -135,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +202,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
         return;
     }
 
-    size_t size =
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -281,17 +306,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-//
-
-
-
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data          = */
-        /*.size          = */
+        /*.data          = */ base_addr,
+        /*.size          = */ size,
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -311,6 +387,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -380,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                    }
 
                     // if the node's data is external, then we cannot re-use it
-                    if ((
-                        (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                    if (ggml_allocr_is_own(alloc, parent) == false) {
                         AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                         continue;
                     }
@@ -415,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
            AT_PRINTF("\n");
         }
 
-
         // update parents
         // update immediately if there is no parse_seq
         // update only at barriers if there is parse_seq
-        if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+        if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
            int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
            int update_end = alloc->parse_seq_len ? ind : ind + 1;
            for (int i = update_start; i < update_end; i++) {
@@ -521,12 +598,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                        view_src_hn->n_views -= 1;
                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-
+                            ggml_allocr_free_tensor(alloc, view_src);
                        }
                    }
                    else {
                        if (parent->data != node->data) {
-
+                            ggml_allocr_free_tensor(alloc, parent);
                        }
                    }
                }
@@ -543,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
            for (int i = 0; outputs[g][i] != NULL; i++) {
                struct ggml_tensor * output = outputs[g][i];
                AT_PRINTF("output: %s\n", output->name);
-
+                ggml_allocr_free_tensor(alloc, output);
            }
        }
    }
@@ -552,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }