llama_cpp 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9bd5dbea8695fdb41ea6d97e372c2cea452ee8ed070e26bd558a720d6c24fe27
- data.tar.gz: 8d7bfd02445df81644eebb3a6db49bb1ddf241a344fef76a0b949f1c12d0639d
+ metadata.gz: b4e94e20f142572fb46cff141e109025e1c5b91b9cd6cabfbaeac163a920bb82
+ data.tar.gz: 10fcd2922c057a0c960cbde83a0fa22a56eae6e21a73e20931cfe082108dec26
  SHA512:
- metadata.gz: 116423f4581f605ee379bcd690299e152087c03665be6171bc137205f0824be5d5e0ce6d3c0b548fc6d28193c601679c3cc27a101b6f5b58968c69388a70cbfc
- data.tar.gz: fd162b1c4e26732573d32ba7439fc44dd2ff09a9024c64ff5adce997e154a934d9556f0ff3cfd6b83dd7dcbfb8366cc6ce394b8324f39ef349d4cb5834ea43f1
+ metadata.gz: 570f5f257d947bad8489fcafe6c7cc0dc342d593132111d3479c0202e0998e444ca75c2140883dc8cf6a22f0f4f74fecd264a577cee6aabde4c168f5003b4c98
+ data.tar.gz: d105e41aac3bb39b2ee53c43eb320272ffa5290caa7e14e2f08bebbade94417aed065e01d23b8def9cce08fd6789576d33ba4082750647c0bbf9ff4c776db9ac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ ## [[0.10.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.1...v0.10.2)] - 2023-12-23
+
+ - Bump bundled llama.cpp from b1641 to b1686.
+ - Add `LLAMA_FILE_MAGIC_GGLA` constant.
+ - Add `n_batch` method to `Context`.
+
  ## [[0.10.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.0...v0.10.1)] - 2023-12-16
 
  - Bump bundled llama.cpp from b1620 to b1641.
@@ -1949,6 +1949,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+ rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2201,7 +2202,16 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- return INT2NUM(llama_n_ctx(ptr->ctx));
+ return UINT2NUM(llama_n_ctx(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_batch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_batch(ptr->ctx));
  }
 
  static VALUE _llama_context_get_timings(VALUE self) {
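The new `n_batch` binding above follows the same pattern as the existing getters: a zero-arity C function reads a value from the wrapped `llama_context` and converts it to a Ruby integer with `UINT2NUM` (the `n_ctx` getter switches from `INT2NUM` at the same time, presumably because llama.cpp reports these sizes as unsigned values). Below is a minimal, self-contained sketch of that Ruby C-extension pattern; the `Demo` class and the hard-coded return value are illustrative stand-ins, not part of the gem.

```cpp
#include <ruby.h>

// Hypothetical stand-in for a getter such as llama_n_batch(ctx); the real
// binding reads the value from the wrapped llama_context instead.
static VALUE demo_n_batch(VALUE self) {
    (void)self;
    return UINT2NUM(512u); // convert an unsigned C value to a Ruby Integer
}

extern "C" void Init_demo(void) {
    // register Demo#n_batch as a zero-arity method, mirroring how
    // rb_cLLaMAContext gains its n_batch method above
    VALUE cDemo = rb_define_class("Demo", rb_cObject);
    rb_define_method(cDemo, "n_batch", RUBY_METHOD_FUNC(demo_n_batch), 0);
}
```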
@@ -3146,6 +3156,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
  std::stringstream ss_magic;
+ ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
+
+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
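The new `LLAMA_FILE_MAGIC_GGLA` constant is exposed the same way as the existing `LLAMA_FILE_MAGIC_GGSN`: the 32-bit magic is formatted as a `0x`-prefixed hexadecimal string with a `std::stringstream`, which is then reset before the next constant is written. A minimal sketch of that formatting step is below; the literal used here is only an assumed stand-in for the actual `LLAMA_FILE_MAGIC_GGLA` value defined in llama.h.

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>

int main() {
    const uint32_t magic = 0x67676c61u; // assumed stand-in for LLAMA_FILE_MAGIC_GGLA

    std::stringstream ss;
    ss << std::showbase << std::hex << magic;
    std::cout << ss.str() << std::endl; // prints "0x67676c61"

    // reset the stream before reusing it for the next constant, as above
    ss.str("");
    ss.clear(std::stringstream::goodbit);
    return 0;
}
```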
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
  if (update_backend) {
  view->backend = view->view_src->backend;
  }
- view->buffer = view->view_src->buffer;
+ // views are initialized in the alloc buffer rather than the view_src buffer
+ view->buffer = alloc->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;
 
- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
  assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
  if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
  }
 
  void ggml_allocr_free(ggml_allocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
  ggml_gallocr_free(alloc->galloc);
  ggml_tallocr_free(alloc->talloc);
  free(alloc);
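With this guard, `ggml_allocr_free` becomes a no-op when given `NULL`, mirroring `free(NULL)` and simplifying cleanup paths where the allocator was never created. A minimal sketch, assuming `ggml-alloc.h` from the bundled llama.cpp sources is on the include path:

```cpp
#include <cstddef>
#include "ggml-alloc.h"

int main() {
    ggml_allocr_t alloc = NULL; // e.g. the allocation step was skipped or failed

    // previously this would have dereferenced a NULL pointer; it now returns immediately
    ggml_allocr_free(alloc);
    return 0;
}
```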
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }
 
  if (nbytes == 0) {
- fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+ // all the tensors in the context are already allocated
  return NULL;
  }
 
@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  } else {
  ggml_backend_view_init(buffer, t);
  }
+ } else {
+ if (t->view_src != NULL) {
+ // view of a pre-allocated tensor
+ ggml_backend_view_init(buffer, t);
+ }
  }
  }
 
@@ -20,6 +20,9 @@ extern "C" {
  size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
  size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
  bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ // check if tensor data is in host memory
+ // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+ bool (*is_host) (ggml_backend_buffer_type_t buft);
  };
 
  struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
  typedef void * ggml_backend_buffer_context_t;
 
  struct ggml_backend_buffer_i {
- void (*free_buffer)(ggml_backend_buffer_t buffer);
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
  //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ void * (*get_base) (ggml_backend_buffer_t buffer);
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
  // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
- void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
  };
 
  struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
  void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
  void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
 
- void (*synchronize) (ggml_backend_t backend);
+ void (*synchronize)(ggml_backend_t backend);
 
  // compute graph with a plan
  ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
  return buft->iface.supports_backend(buft, backend);
  }
 
+ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+ if (buft->iface.is_host) {
+ return buft->iface.is_host(buft);
+ }
+ return false;
+ }
+
  // backend buffer
 
  ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
  }
 
+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ buffer->iface.clear(buffer, value);
+ }
+
+ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+ }
+
  ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
  return buffer->buft;
  }
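Together with the declarations added to the public header later in this diff, these two helpers let callers fill an entire buffer with a byte value and check whether its memory is directly addressable by the host. A hedged usage sketch, assuming `ggml-backend.h` is on the include path and that `ggml_backend_buft_alloc_buffer` is available as in upstream ggml:

```cpp
#include "ggml-backend.h"

int main() {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);

    if (ggml_backend_buffer_is_host(buf)) {
        // host-visible memory (true for the CPU buffer type), so a plain
        // byte-fill of the whole buffer is meaningful
        ggml_backend_buffer_clear(buf, 0); // zero-initialize the buffer
    }

    ggml_backend_buffer_free(buf);
    return 0;
}
```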
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  free(buffer->context);
- GGML_UNUSED(buffer);
  }
 
  static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
  GGML_UNUSED(buffer);
  }
 
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };
 
  // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };
 
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
  GGML_UNUSED(buft);
  }
 
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
  ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
  /* .iface = */ {
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
  },
  /* .context = */ NULL,
  };
 
- return &ggml_backend_buffer_type_cpu;
+ return &ggml_backend_cpu_buffer_type;
  }
 
+ #ifdef GGML_USE_CPU_HBM
+
+ // buffer type HBM
+
+ #include <hbwmalloc.h>
+
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ hbw_free(buffer->context);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ //void * ptr = hbw_malloc(size);
+ void * ptr;
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+ if (result != 0) {
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+ return NULL;
+ }
+
+ // FIXME: this is a hack to avoid having to implement a new buffer type
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+ return buffer;
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type_hbm;
+ }
+ #endif
+
  struct ggml_backend_cpu_context {
  int n_threads;
  void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
  cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
- cpu_plan->cgraph = *cgraph;
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
  if (cpu_plan->cplan.work_size > 0) {
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
  // utils
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
- GGML_ASSERT(tensor->data == NULL);
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);
@@ -21,6 +21,7 @@ extern "C" {
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
  GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
  GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
 
  // buffer
  GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
  GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
  GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
  GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
  GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
 
  //
@@ -76,6 +79,10 @@ extern "C" {
 
  GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
+ #ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+ #endif
+
  //
  // Backend registry
  //
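When the bundled ggml is compiled with `GGML_USE_CPU_HBM`, the new `ggml_backend_cpu_hbm_buffer_type` allocates CPU buffers from high-bandwidth memory through the memkind `hbwmalloc` API instead of the regular allocator. A hedged sketch of picking it at compile time and falling back to the standard CPU buffer type otherwise:

```cpp
#include <cstdio>
#include "ggml-backend.h"

static ggml_backend_buffer_type_t pick_cpu_buffer_type(void) {
#ifdef GGML_USE_CPU_HBM
    return ggml_backend_cpu_hbm_buffer_type(); // high-bandwidth host memory (hbwmalloc)
#else
    return ggml_backend_cpu_buffer_type();     // regular host memory
#endif
}

int main() {
    ggml_backend_buffer_type_t buft = pick_cpu_buffer_type();
    // both variants are host buffer types, so is_host reports true
    std::printf("is_host: %d\n", ggml_backend_buft_is_host(buft));
    return 0;
}
```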