llama_cpp 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b4e94e20f142572fb46cff141e109025e1c5b91b9cd6cabfbaeac163a920bb82
+  data.tar.gz: 10fcd2922c057a0c960cbde83a0fa22a56eae6e21a73e20931cfe082108dec26
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 570f5f257d947bad8489fcafe6c7cc0dc342d593132111d3479c0202e0998e444ca75c2140883dc8cf6a22f0f4f74fecd264a577cee6aabde4c168f5003b4c98
+  data.tar.gz: d105e41aac3bb39b2ee53c43eb320272ffa5290caa7e14e2f08bebbade94417aed065e01d23b8def9cce08fd6789576d33ba4082750647c0bbf9ff4c776db9ac
```
data/CHANGELOG.md
CHANGED

```diff
@@ -1,3 +1,9 @@
+## [[0.10.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.1...v0.10.2)] - 2023-12-23
+
+- Bump bundled llama.cpp from b1641 to b1686.
+- Add `LLAMA_FILE_MAGIC_GGLA` constant.
+- Add `n_batch` method to `Context`.
+
 ## [[0.10.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.0...v0.10.1)] - 2023-12-16
 
 - Bump bundled llama.cpp from b1620 to b1641.
```
data/ext/llama_cpp/llama_cpp.cpp
CHANGED

```diff
@@ -1949,6 +1949,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2201,7 +2202,16 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    return
+    return UINT2NUM(llama_n_ctx(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_batch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_batch(ptr->ctx));
   }
 
   static VALUE _llama_context_get_timings(VALUE self) {
@@ -3146,6 +3156,11 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
   std::stringstream ss_magic;
+  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
+
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
```
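The new `Context#n_batch` binding simply forwards to the bundled C API, mirroring the existing `n_ctx` method, and `LLaMACpp::LLAMA_FILE_MAGIC_GGLA` is the hex-string form of the `LLAMA_FILE_MAGIC_GGLA` value from `llama.h`. A minimal C sketch of what these bindings call underneath, assuming an already-initialized `llama_context` (model loading omitted); the `print_context_sizes` helper is illustrative, not part of the gem:

```c
#include <stdio.h>
#include "llama.h"

// Context#n_ctx and the new Context#n_batch map to llama_n_ctx/llama_n_batch;
// LLaMACpp::LLAMA_FILE_MAGIC_GGLA exposes LLAMA_FILE_MAGIC_GGLA as a hex string.
static void print_context_sizes(struct llama_context * ctx) {
    printf("n_ctx:   %u\n", (unsigned) llama_n_ctx(ctx));
    printf("n_batch: %u\n", (unsigned) llama_n_batch(ctx));
    printf("ggla:    %#x\n", (unsigned) LLAMA_FILE_MAGIC_GGLA);
}
```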
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED

```diff
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer  = alloc->buffer;
     view->data    = (char *)view->view_src->data + view->view_offs;
 
-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
     if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }
 
 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     }
 
     if (nbytes == 0) {
-
+        // all the tensors in the context are already allocated
         return NULL;
     }
 
@@ -789,6 +792,11 @@
             } else {
                 ggml_backend_view_init(buffer, t);
             }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
         }
     }
 
```
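The `ggml_allocr_free` change makes teardown paths simpler: freeing an allocator that was never created is now a no-op rather than a NULL dereference. A small sketch under that assumption (the `cleanup` helper is illustrative, not part of ggml):

```c
#include "ggml-alloc.h"

// illustrative teardown helper: alloc may be NULL if allocation never happened,
// which ggml_allocr_free now tolerates instead of dereferencing a NULL pointer
static void cleanup(ggml_allocr_t alloc) {
    ggml_allocr_free(alloc);
}
```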
data/ext/llama_cpp/src/ggml-backend-impl.h
CHANGED

```diff
@@ -20,6 +20,9 @@ extern "C" {
         size_t          (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t          (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool            (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool            (*is_host)         (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        void
+        void   (*free_buffer) (ggml_backend_buffer_t buffer);
         //void (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void *
-        void
-        void
-        void
+        void * (*get_base)    (ggml_backend_buffer_t buffer);
+        void   (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor)  (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void
-        void
+        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
     };
 
     struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
         void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        void (*synchronize)
+        void (*synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
```
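For backend authors, the two new callbacks are small. A hypothetical sketch of how a host-memory backend could fill them in, mirroring the CPU implementations added later in this diff (the `my_*` names are assumptions for illustration, not part of ggml):

```c
#include <string.h>
#include "ggml-backend-impl.h"

// report that tensor data for this buffer type lives in ordinary host memory
static bool my_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    (void) buft; // unused
    return true;
}

// fill the whole allocation with a byte value (used e.g. to zero a buffer);
// this example assumes buffer->context points at the raw allocation
static void my_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}
```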
data/ext/llama_cpp/src/ggml-backend.c
CHANGED

```diff
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return buft->iface.supports_backend(buft, backend);
 }
 
+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer
 
 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_UNUSED(buffer);
 }
 
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };
 
 // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
 
-    return &
+    return &ggml_backend_cpu_buffer_type;
 }
 
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface    = */ {
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context  = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
 // utils
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
```
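The HBM variant is compiled in only when the bundled ggml is built with `GGML_USE_CPU_HBM` (it depends on `hbwmalloc`). A small sketch, under that build-flag assumption, of choosing between the two CPU buffer types; the `pick_cpu_buffer_type` helper is illustrative:

```c
#include "ggml-backend.h"

// prefer the new hbwmalloc-backed CPU buffer type when it was compiled in,
// otherwise fall back to the regular CPU buffer type
static ggml_backend_buffer_type_t pick_cpu_buffer_type(void) {
#ifdef GGML_USE_CPU_HBM
    return ggml_backend_cpu_hbm_buffer_type();
#else
    return ggml_backend_cpu_buffer_type();
#endif
}
```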
data/ext/llama_cpp/src/ggml-backend.h
CHANGED

```diff
@@ -21,6 +21,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool   ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
 
     // buffer
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
 
     //
@@ -76,6 +79,10 @@ extern "C" {
 
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
     //
     // Backend registry
     //
```
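Taken together, the new public entry points let callers ask whether a buffer lives in host memory and wipe it in a single call. A minimal sketch against the bundled headers (the buffer size and the use of `ggml_backend_buft_alloc_buffer` are assumptions for illustration):

```c
#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1024);

    // both queries return true for the CPU buffer type
    printf("buft is host:   %d\n", ggml_backend_buft_is_host(buft));
    printf("buffer is host: %d\n", ggml_backend_buffer_is_host(buffer));

    // new in this version: fill the whole buffer with a byte value
    ggml_backend_buffer_clear(buffer, 0);

    ggml_backend_buffer_free(buffer);
    return 0;
}
```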