llama_cpp 0.10.0 → 0.10.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7f406c15621a7c247adaacf1d588ddf278225e6846466afd1184c00f1ee61768
- data.tar.gz: df73657c75a80cb44f41d34a3c1054676cf59a5d7d56cb1c2ce8a94264002293
+ metadata.gz: b4e94e20f142572fb46cff141e109025e1c5b91b9cd6cabfbaeac163a920bb82
+ data.tar.gz: 10fcd2922c057a0c960cbde83a0fa22a56eae6e21a73e20931cfe082108dec26
  SHA512:
- metadata.gz: acd08d5099f14bf2bd4c8f9bf016253f0e316179b79d72fbe7066b0d645ca31e9bab427fcc53d93874f8df74cb1746731e2cd21864bfecdecff91f9778919b42
- data.tar.gz: 5014a1bd545be90c56bebd48119a198cf7276513cb6c5f00d8322aa6eaa9a27442bc51bf06953a11c2fc04145f797c630cefee17b36589fe38f9226003416a09
+ metadata.gz: 570f5f257d947bad8489fcafe6c7cc0dc342d593132111d3479c0202e0998e444ca75c2140883dc8cf6a22f0f4f74fecd264a577cee6aabde4c168f5003b4c98
+ data.tar.gz: d105e41aac3bb39b2ee53c43eb320272ffa5290caa7e14e2f08bebbade94417aed065e01d23b8def9cce08fd6789576d33ba4082750647c0bbf9ff4c776db9ac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
+ ## [[0.10.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.1...v0.10.2)] - 2023-12-23
+
+ - Bump bundled llama.cpp from b1641 to b1686.
+ - Add `LLAMA_FILE_MAGIC_GGLA` constant.
+ - Add `n_batch` method to `Context`.
+
+ ## [[0.10.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.0...v0.10.1)] - 2023-12-16
+
+ - Bump bundled llama.cpp from b1620 to b1641.
+ - Add attribute reader for `params` to `Model`.
+ - Add `Batch` class; it had not been published before because the author forgot to call `rb_define_class`.
+
  ## [[0.10.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.5...v0.10.0)] - 2023-12-09

  - Bump bundled llama.cpp from b1593 to b1620.
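
For reference, a minimal Ruby sketch of the additions listed above. This is a hedged illustration, not part of the release: it assumes the gem's existing `ModelParams`/`ContextParams`/`Model`/`Context` constructor keywords, and the model path is a placeholder.

```ruby
require 'llama_cpp'

# Placeholder path; point this at a real GGUF model file.
model_params   = LLaMACpp::ModelParams.new
model          = LLaMACpp::Model.new(model_path: './models/ggml-model.gguf', params: model_params)
context_params = LLaMACpp::ContextParams.new
context        = LLaMACpp::Context.new(model: model, params: context_params)

model.params                     # 0.10.1: attribute reader for the params given to Model#initialize
defined?(LLaMACpp::Batch)        # 0.10.1: Batch is now registered via rb_define_class
context.n_batch                  # 0.10.2: batch size (n_batch) of the context as an Integer
LLaMACpp::LLAMA_FILE_MAGIC_GGLA  # 0.10.2: GGLA file magic exposed as a hex string constant
```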
@@ -1333,6 +1333,7 @@ public:
  static void define_class(VALUE outer) {
  rb_cLLaMAModel = rb_define_class_under(outer, "Model", rb_cObject);
  rb_define_alloc_func(rb_cLLaMAModel, llama_model_alloc);
+ rb_define_attr(rb_cLLaMAModel, "params", 1, 0);
  rb_define_method(rb_cLLaMAModel, "initialize", RUBY_METHOD_FUNC(_llama_model_initialize), -1);
  rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
@@ -1948,6 +1949,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+ rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2200,7 +2202,16 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- return INT2NUM(llama_n_ctx(ptr->ctx));
+ return UINT2NUM(llama_n_ctx(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_batch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_batch(ptr->ctx));
  }

  static VALUE _llama_context_get_timings(VALUE self) {
@@ -3071,6 +3082,7 @@ static VALUE rb_llama_max_devices(VALUE self) {
  extern "C" void Init_llama_cpp(void) {
  rb_mLLaMACpp = rb_define_module("LLaMACpp");

+ RbLLaMABatch::define_class(rb_mLLaMACpp);
  RbLLaMATokenData::define_class(rb_mLLaMACpp);
  RbLLaMATokenDataArray::define_class(rb_mLLaMACpp);
  RbLLaMAModel::define_class(rb_mLLaMACpp);
@@ -3144,6 +3156,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));

  std::stringstream ss_magic;
+ ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
+
+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));

@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
  if (update_backend) {
  view->backend = view->view_src->backend;
  }
- view->buffer = view->view_src->buffer;
+ // views are initialized in the alloc buffer rather than the view_src buffer
+ view->buffer = alloc->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;

- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
  assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

  if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
  }

  void ggml_allocr_free(ggml_allocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
  ggml_gallocr_free(alloc->galloc);
  ggml_tallocr_free(alloc->talloc);
  free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }

  if (nbytes == 0) {
- fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+ // all the tensors in the context are already allocated
  return NULL;
  }

@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  } else {
  ggml_backend_view_init(buffer, t);
  }
+ } else {
+ if (t->view_src != NULL) {
+ // view of a pre-allocated tensor
+ ggml_backend_view_init(buffer, t);
+ }
  }
  }

@@ -43,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
  // ggml-backend v2 API
  //

- // Seperate tensor and graph allocator objects
+ // Separate tensor and graph allocator objects
  // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
  // The original API is kept as a wrapper around the new API

@@ -20,6 +20,9 @@ extern "C" {
  size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
  size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
  bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ // check if tensor data is in host memory
+ // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+ bool (*is_host) (ggml_backend_buffer_type_t buft);
  };

  struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
  typedef void * ggml_backend_buffer_context_t;

  struct ggml_backend_buffer_i {
- void (*free_buffer)(ggml_backend_buffer_t buffer);
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
  //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ void * (*get_base) (ggml_backend_buffer_t buffer);
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
  // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
- void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
  };

  struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
  void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
  void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

- void (*synchronize) (ggml_backend_t backend);
+ void (*synchronize)(ggml_backend_t backend);

  // compute graph with a plan
  ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
  return buft->iface.supports_backend(buft, backend);
  }

+ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+ if (buft->iface.is_host) {
+ return buft->iface.is_host(buft);
+ }
+ return false;
+ }
+
  // backend buffer

  ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
  }

+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ buffer->iface.clear(buffer, value);
+ }
+
+ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+ }
+
  ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
  return buffer->buft;
  }
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  free(buffer->context);
- GGML_UNUSED(buffer);
  }

  static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
  GGML_UNUSED(buffer);
  }

+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };

  // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };

  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
  GGML_UNUSED(buft);
  }

+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
  ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
  /* .iface = */ {
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
  },
  /* .context = */ NULL,
  };

- return &ggml_backend_buffer_type_cpu;
+ return &ggml_backend_cpu_buffer_type;
  }

+ #ifdef GGML_USE_CPU_HBM
+
+ // buffer type HBM
+
+ #include <hbwmalloc.h>
+
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ hbw_free(buffer->context);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ //void * ptr = hbw_malloc(size);
+ void * ptr;
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+ if (result != 0) {
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+ return NULL;
+ }
+
+ // FIXME: this is a hack to avoid having to implement a new buffer type
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+ return buffer;
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type_hbm;
+ }
+ #endif
+
  struct ggml_backend_cpu_context {
  int n_threads;
  void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

  cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
- cpu_plan->cgraph = *cgraph;
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy

  if (cpu_plan->cplan.work_size > 0) {
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
  // utils
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
- GGML_ASSERT(tensor->data == NULL);
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);
@@ -21,6 +21,7 @@ extern "C" {
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
  GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
  GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

  // buffer
  GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
  GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
  GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
  GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
  GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

  //
@@ -76,6 +79,10 @@ extern "C" {

  GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+ #ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+ #endif
+

  //
  // Backend registry
  //