llama_cpp 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
 }
 
 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }
 
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
 
     assert(align && !(align & (align - 1))); // power of 2
 
-    *talloc = (struct ggml_tallocr) {
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.base = */ base,
         /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }
 
-void ggml_tallocr_free(ggml_tallocr_t talloc) {
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
 
@@ -354,12 +338,16 @@ struct hash_node {
     bool allocated;
 };
 
-//
 struct tensor_alloc {
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
     int buffer_id;
     struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
 
-    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
 };
 
@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
     // count number of children and views
-    // allocate all graph inputs and leafs first to avoid overwriting them
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
         }
     }
 
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;
 
     // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
     }
 
     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].offset = hn->offset;
-        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        galloc->leaf_allocs[i].leaf.offset = hn->offset;
+        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
     }
 
     // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-        if (new_size > cur_size) {
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
-    if (node->view_src != NULL) {
-        if (node->buffer == NULL) {
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
             assert(tensor_alloc->offset == SIZE_MAX);
-            if (node->view_src->buffer == NULL) {
+            if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
         }
     } else {
-        if (node->data == NULL) {
+        if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
         } else {
-            if (node->buffer == NULL) {
+            if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
 
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
         if (galloc->buffers[i] != NULL) {
             ggml_backend_buffer_reset(galloc->buffers[i]);
         }
     }
 
     // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
         }
         ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
     }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
-    }
 
     return true;
 }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }
 
-    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
             }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         }
     }
 
-    ggml_tallocr_free(tallocr);
-
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     (*buffers)[(*n_buffers)++] = buffer;
 
@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
-GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
 /*
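The header hunk above (from ggml-alloc.h) replaces the heap-allocated, opaque ggml_tallocr_t handle with a plain value type and removes ggml_tallocr_free. A minimal before/after sketch of caller code, where buffer (a ggml_backend_buffer_t) and tensor (a struct ggml_tensor *) are assumed to come from the surrounding application:

    // 0.13.0-era usage: opaque heap-allocated handle that had to be freed
    //   ggml_tallocr_t talloc = ggml_tallocr_new(buffer);
    //   ggml_tallocr_alloc(talloc, tensor);
    //   ggml_tallocr_free(talloc);

    // 0.14.1-era usage: the allocator is a stack value, passed by address, nothing to free
    struct ggml_tallocr talloc = ggml_tallocr_new(buffer);
    ggml_tallocr_alloc(&talloc, tensor);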
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
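ggml_gallocr_reserve_n gains a fourth parameter with per-leaf buffer ids, mirroring node_buffer_ids; as the ggml-alloc.c hunks above show, a NULL array falls back to buffer 0, and ggml_gallocr_reserve now forwards NULL for both. A hedged usage sketch, where galloc, graph, node_buffer_ids and leaf_buffer_ids are assumed to exist in the calling application:

    // one buffer id per graph node / leaf, indexing the buffer types the
    // gallocr was created with; NULL keeps everything in buffer 0
    if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids, leaf_buffer_ids)) {
        // buffer allocation failed
    }

    // the single-buffer helper is unchanged for existing callers and is
    // equivalent to ggml_gallocr_reserve_n(galloc, graph, NULL, NULL)
    ggml_gallocr_reserve(galloc, graph);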
@@ -86,31 +86,43 @@ extern "C" {
         // (optional) asynchronous tensor data access
         void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-        // compute graph with a plan
+        // compute graph with a plan (not used currently)
        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
        void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
-        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+        void (*GGML_CALL event_free) (ggml_backend_event_t event);
+        void (*GGML_CALL event_record) (ggml_backend_event_t event);
+        void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+        void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
 
     struct ggml_backend {
         ggml_guid_t guid;
 
         struct ggml_backend_i iface;
-
         ggml_backend_context_t context;
     };
 
+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
     //
     // Backend registry
     //
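In the backend interface hunk above (from ggml-backend-impl.h), graph_compute and graph_plan_compute now return enum ggml_status instead of bool, cpy_tensor_async receives both the source and destination backends, and optional event hooks plus struct ggml_backend_event are added; since the hooks are marked "(optional)", a backend that does not support events would presumably leave those members NULL. A hedged sketch of how a hypothetical backend's compute callback adapts; my_backend_compute_node is a placeholder, and the GGML_STATUS_* values are assumed to come from ggml.h of this version:

    static enum ggml_status GGML_CALL my_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
        for (int i = 0; i < cgraph->n_nodes; i++) {
            // dispatch one node on the hypothetical backend
            if (!my_backend_compute_node(backend, cgraph->nodes[i])) {
                return GGML_STATUS_FAILED;  // was: return false;
            }
        }
        return GGML_STATUS_SUCCESS;         // was: return true;
    }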