llama_cpp 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +14 -8
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +307 -127
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +200 -94
- data/ext/llama_cpp/src/ggml-metal.metal +264 -82
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +1647 -865
- data/ext/llama_cpp/src/ggml.h +143 -52
- data/ext/llama_cpp/src/llama.cpp +1427 -635
- data/ext/llama_cpp/src/llama.h +308 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
@@ -77,7 +77,7 @@ struct free_block {
|
|
77
77
|
size_t size;
|
78
78
|
};
|
79
79
|
|
80
|
-
#define MAX_FREE_BLOCKS 128
|
80
|
+
#define MAX_FREE_BLOCKS 256
|
81
81
|
|
82
82
|
struct ggml_allocr {
|
83
83
|
void * data;
|
@@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
|
|
131
131
|
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
132
132
|
}
|
133
133
|
|
134
|
+
static bool ggml_is_view(struct ggml_tensor * t) {
|
135
|
+
return t->view_src != NULL;
|
136
|
+
}
|
137
|
+
|
134
138
|
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
135
139
|
#ifdef GGML_ALLOCATOR_DEBUG
|
136
140
|
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
@@ -183,6 +187,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
|
183
187
|
}
|
184
188
|
|
185
189
|
tensor->data = addr;
|
190
|
+
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
|
186
191
|
|
187
192
|
#ifdef GGML_ALLOCATOR_DEBUG
|
188
193
|
add_allocated_tensor(alloc, tensor);
|
@@ -214,7 +219,8 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
|
214
219
|
|
215
220
|
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
216
221
|
size = aligned_offset(NULL, size, alloc->alignment);
|
217
|
-
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
222
|
+
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
223
|
+
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
|
218
224
|
|
219
225
|
#ifdef GGML_ALLOCATOR_DEBUG
|
220
226
|
remove_allocated_tensor(alloc, tensor);
|
@@ -338,8 +344,8 @@ static void free_vmem(void * base_addr, size_t size) {
|
|
338
344
|
|
339
345
|
// allocate uncommitted virtual memory to measure the size of the graph
|
340
346
|
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
341
|
-
// 1TB for 64-bit, 1GB for 32-bit
|
342
|
-
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
|
347
|
+
// 128GB for 64-bit, 1GB for 32-bit
|
348
|
+
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
|
343
349
|
do {
|
344
350
|
*base_addr = alloc_vmem(*size);
|
345
351
|
if (*base_addr != NULL) {
|
@@ -399,10 +405,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
|
399
405
|
|
400
406
|
//////////// compute graph allocator
|
401
407
|
|
402
|
-
static bool ggml_is_view(struct ggml_tensor * t) {
|
403
|
-
return t->view_src != NULL;
|
404
|
-
}
|
405
|
-
|
406
408
|
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
407
409
|
if (a->type != b->type) {
|
408
410
|
return false;
|
@@ -631,3 +633,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
|
631
633
|
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
632
634
|
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
633
635
|
}
|
636
|
+
|
637
|
+
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
|
638
|
+
return alloc->max_size;
|
639
|
+
}
|
@@ -19,6 +19,7 @@ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
|
19
19
|
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
20
20
|
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
21
21
|
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
22
|
+
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
|
22
23
|
|
23
24
|
|
24
25
|
#ifdef __cplusplus
|