llama_cpp 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +14 -8
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +307 -127
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +200 -94
- data/ext/llama_cpp/src/ggml-metal.metal +264 -82
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +1647 -865
- data/ext/llama_cpp/src/ggml.h +143 -52
- data/ext/llama_cpp/src/llama.cpp +1427 -635
- data/ext/llama_cpp/src/llama.h +308 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/ggml-alloc.c

```diff
@@ -77,7 +77,7 @@ struct free_block {
     size_t size;
 };
 
-#define MAX_FREE_BLOCKS
+#define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
     void * data;
```
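The cap raised here bounds the allocator's free list: freed regions are merged into at most MAX_FREE_BLOCKS entries, so a heavily fragmented graph can run out of slots, and 0.6.0 bundles the upstream bump to 256. A minimal sketch of the bookkeeping involved (field names follow the diff; `allocr_sketch` is a simplified stand-in, not the real `struct ggml_allocr`):

```c
#include <stddef.h>

#define MAX_FREE_BLOCKS 256

// One reusable hole in the managed buffer.
struct free_block {
    void * addr;
    size_t size;
};

// Simplified stand-in: a fixed-capacity table of free blocks;
// allocation scans it, freeing inserts into (and merges) it.
struct allocr_sketch {
    void * data;                                    // base of the managed buffer
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    int n_free_blocks;                              // live entries in the table
};
```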
```diff
@@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
     return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
```
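`ggml_is_view` is not new logic: it is moved up from later in the file (the old copy is deleted in the `@@ -399` hunk below) so that the debug assert in `ggml_allocr_alloc` can see it, since C needs a definition or prototype before the call site. The invariant it protects: a view aliases (part of) its source tensor's buffer through `view_src`, so it must never be handed its own block. A hedged sketch of that guard, reusing the real `ggml_allocr_alloc` for the non-view case:

```c
// Sketch only: the library wires this differently, but the invariant is
// the same - views resolve to their source, non-views get fresh memory.
static void alloc_if_not_view(struct ggml_allocr * alloc, struct ggml_tensor * t) {
    if (t->view_src != NULL) {       // the ggml_is_view() test
        return;                      // data comes from t->view_src's buffer
    }
    ggml_allocr_alloc(alloc, t);     // only owning tensors receive a block
}
```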
```diff
@@ -183,6 +187,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     }
 
     tensor->data = addr;
+    AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
```
```diff
@@ -214,7 +219,8 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
 
     size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
-    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
```
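The extra `%p` arguments let a trace line be checked against the buffer bounds when hunting allocator bugs. `AT_PRINTF` itself is a compile-time-gated trace macro; its definition is not shown in this hunk, but the usual shape of such a macro is (an assumption, not copied from ggml-alloc.c):

```c
#ifdef GGML_ALLOCATOR_DEBUG
#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
#else
#define AT_PRINTF(...)   // expands to nothing in release builds
#endif
```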
```diff
@@ -338,8 +344,8 @@ static void free_vmem(void * base_addr, size_t size) {
 
 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    //
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<
+    // 128GB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
     do {
         *base_addr = alloc_vmem(*size);
         if (*base_addr != NULL) {
```
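The magic numbers decode as 1ULL<<30 = 1 GiB on 32-bit and 1ULL<<37 = 128 GiB on 64-bit, matching the new comment; the do/while loop retries until a reservation succeeds. Reserving that much is cheap because the memory is uncommitted: only address space is claimed, no physical pages. One standard POSIX way to do that (a sketch of the technique, not necessarily what `alloc_vmem` does):

```c
#include <stddef.h>
#include <sys/mman.h>

// Reserve a contiguous address range without committing RAM: PROT_NONE
// pages are never touched, so they cost only address space.
static void * reserve_vmem(size_t size) {
    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return ptr == MAP_FAILED ? NULL : ptr;
}
```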
```diff
@@ -399,10 +405,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 
 //////////// compute graph allocator
 
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;
```
```diff
@@ -631,3 +633,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
     return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
+
+size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
+    return alloc->max_size;
+}
```
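The new `ggml_allocr_max_size` exposes the allocator's high-water mark, which rounds out the measure-mode workflow: run the graph through a measuring allocator, read back the peak size, then back a real allocator with a buffer of exactly that size. A hedged usage sketch, assuming a `graph` built elsewhere (all calls are existing ggml-alloc API; the alignment value is illustrative):

```c
// Measure pass: no real memory is committed, only sizes are tracked.
struct ggml_allocr * measure = ggml_allocr_new_measure(/*alignment=*/32);
ggml_allocr_alloc_graph(measure, graph);
size_t buf_size = ggml_allocr_max_size(measure); // new in this release
ggml_allocr_free(measure);

// Real pass: back the allocator with a buffer of the measured size.
void * buf = malloc(buf_size);
struct ggml_allocr * alloc = ggml_allocr_new(buf, buf_size, /*alignment=*/32);
ggml_allocr_alloc_graph(alloc, graph);
```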
data/ext/llama_cpp/src/ggml-alloc.h

```diff
@@ -19,6 +19,7 @@ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
 
 
 #ifdef __cplusplus
```