llama_cpp 0.10.1 → 0.10.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b4e94e20f142572fb46cff141e109025e1c5b91b9cd6cabfbaeac163a920bb82
+  data.tar.gz: 10fcd2922c057a0c960cbde83a0fa22a56eae6e21a73e20931cfe082108dec26
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 570f5f257d947bad8489fcafe6c7cc0dc342d593132111d3479c0202e0998e444ca75c2140883dc8cf6a22f0f4f74fecd264a577cee6aabde4c168f5003b4c98
+  data.tar.gz: d105e41aac3bb39b2ee53c43eb320272ffa5290caa7e14e2f08bebbade94417aed065e01d23b8def9cce08fd6789576d33ba4082750647c0bbf9ff4c776db9ac
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.10.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.1...v0.10.2)] - 2023-12-23
+
+- Bump bundled llama.cpp from b1641 to b1686.
+- Add `LLAMA_FILE_MAGIC_GGLA` constant.
+- Add `n_batch` method to `Context`.
+
 ## [[0.10.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.0...v0.10.1)] - 2023-12-16
 
 - Bump bundled llama.cpp from b1620 to b1641.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1949,6 +1949,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2201,7 +2202,16 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    return
+    return UINT2NUM(llama_n_ctx(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_batch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_batch(ptr->ctx));
   }

   static VALUE _llama_context_get_timings(VALUE self) {
@@ -3146,6 +3156,11 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));

   std::stringstream ss_magic;
+  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
+
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));

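The new `n_batch` binding simply forwards to `llama_n_batch` in the bundled llama.h, just as `n_ctx` forwards to `llama_n_ctx`. A minimal sketch against the bundled C API showing what the binding reports; it assumes a GGUF model path is passed as argv[1], and uses only llama.h calls present in the bundled b1686 sources:

#include <stdio.h>
#include "llama.h"   // bundled under data/ext/llama_cpp/src/

int main(int argc, char ** argv) {
    if (argc < 2) return 1;

    llama_backend_init(false);

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) return 1;

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_batch = 512; // the value Context#n_batch will report on the Ruby side
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // _llama_context_n_ctx / _llama_context_n_batch wrap exactly these calls
    printf("n_ctx = %u, n_batch = %u\n", llama_n_ctx(ctx), llama_n_batch(ctx));

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}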
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer = alloc->buffer;
     view->data = (char *)view->view_src->data + view->view_offs;

-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

     if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }

 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     }

     if (nbytes == 0) {
-
+        // all the tensors in the context are already allocated
         return NULL;
     }

@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             } else {
                 ggml_backend_view_init(buffer, t);
             }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
         }
     }

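The `ggml_allocr_free` hunk above makes freeing a never-created allocator a no-op. A minimal sketch of why that matters, assuming the bundled ggml-alloc.h; the `example_cleanup` helper is hypothetical, not part of the sources:

#include <stddef.h>
#include "ggml-alloc.h"   // bundled under data/ext/llama_cpp/src/

// Hypothetical cleanup helper: with the NULL check added above, error paths can
// call ggml_allocr_free unconditionally, even when the allocator was never created.
static void example_cleanup(ggml_allocr_t alloc) {
    ggml_allocr_free(alloc);   // safe no-op when alloc == NULL
}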
data/ext/llama_cpp/src/ggml-backend-impl.h
CHANGED
@@ -20,6 +20,9 @@ extern "C" {
         size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool (*is_host) (ggml_backend_buffer_type_t buft);
     };

     struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;

     struct ggml_backend_buffer_i {
-        void
+        void (*free_buffer) (ggml_backend_buffer_t buffer);
         //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void *
-        void
-        void
-        void
+        void * (*get_base) (ggml_backend_buffer_t buffer);
+        void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void
-        void
+        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
     };

     struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
         void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

-        void (*synchronize)
+        void (*synchronize)(ggml_backend_t backend);

         // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
data/ext/llama_cpp/src/ggml-backend.c
CHANGED
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
     return buft->iface.supports_backend(buft, backend);
 }

+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer

 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }

+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-    GGML_UNUSED(buffer);
 }

 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
     GGML_UNUSED(buffer);
 }

+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
 };

 // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
 };

 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
     GGML_UNUSED(buft);
 }

+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };

-    return &
+    return &ggml_backend_cpu_buffer_type;
 }

+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface = */ {
+            /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 // utils
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
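A minimal usage sketch of the two new public entry points defined above, ggml_backend_buffer_clear and ggml_backend_buffer_is_host; it assumes the bundled ggml-backend.h and its existing ggml_backend_buft_alloc_buffer / ggml_backend_buffer_free helpers:

#include <stdio.h>
#include "ggml-backend.h"   // bundled under data/ext/llama_cpp/src/

int main(void) {
    // allocate a small buffer from the CPU buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);

    // new in this bump: fill the whole buffer with a byte value ...
    ggml_backend_buffer_clear(buf, 0);

    // ... and ask whether the data lives in host memory (true for the CPU type)
    printf("is_host: %d\n", ggml_backend_buffer_is_host(buf));

    ggml_backend_buffer_free(buf);
    return 0;
}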
data/ext/llama_cpp/src/ggml-backend.h
CHANGED
@@ -21,6 +21,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

     // buffer
     GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
     GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

     //
@@ -76,6 +79,10 @@ extern "C" {

     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
     //
     // Backend registry
     //