llama_cpp 0.13.0 → 0.14.1

@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
 }
 
 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }
 
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
 
     assert(align && !(align & (align - 1))); // power of 2
 
-    *talloc = (struct ggml_tallocr) {
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
         /*.buffer    = */ buffer,
         /*.base      = */ base,
         /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }
 
-void ggml_tallocr_free(ggml_tallocr_t talloc) {
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
 
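The ggml-alloc.c hunks above change ggml_tallocr from a heap-allocated opaque handle into a plain value type, and ggml_tallocr_free disappears along with it. A minimal sketch of the new calling pattern, assuming the caller already has a backend buffer and a tensor (the helper name place_tensor is illustrative, not part of the library):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch: place one tensor into an existing backend buffer using the
// value-type allocator from this release; buf and t come from the caller
static void place_tensor(ggml_backend_buffer_t buf, struct ggml_tensor * t) {
    struct ggml_tallocr talloc = ggml_tallocr_new(buf); // lives on the stack now
    ggml_tallocr_alloc(&talloc, t);                     // passed by address
    // no ggml_tallocr_free(): nothing was heap-allocated
}

The alloc_tensor_range hunks further down show the same pattern applied inside the library itself.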
@@ -354,12 +338,16 @@ struct hash_node {
     bool allocated;
 };
 
-//
 struct tensor_alloc {
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
     int buffer_id;
     struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
 
-    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
 };
 
@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
     // count number of children and views
-    // allocate all graph inputs and leafs first to avoid overwriting them
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         }
     }
 
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;
 
     // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
 
     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].offset = hn->offset;
-        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        galloc->leaf_allocs[i].leaf.offset = hn->offset;
+        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
     }
 
     // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-        if (new_size > cur_size) {
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
-    if (node->view_src != NULL) {
-        if (node->buffer == NULL) {
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
             assert(tensor_alloc->offset == SIZE_MAX);
-            if (node->view_src->buffer == NULL) {
+            if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
         }
     } else {
-        if (node->data == NULL) {
+        if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
         } else {
-            if (node->buffer == NULL) {
+            if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
 
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
         if (galloc->buffers[i] != NULL) {
             ggml_backend_buffer_reset(galloc->buffers[i]);
         }
     }
 
     // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
         }
         ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
     }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
-    }
 
     return true;
 }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }
 
-    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
            if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
            }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         }
     }
 
-    ggml_tallocr_free(tallocr);
-
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     (*buffers)[(*n_buffers)++] = buffer;
 
@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
-GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
 /*
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
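The ggml-alloc.h hunks above are the public side of the same change: struct ggml_tallocr is now defined in the header, ggml_tallocr_free is gone, and ggml_gallocr_reserve_n takes a second id array so leaf tensors (graph inputs and other tensors that are not nodes) can be assigned to a specific buffer instead of always landing in buffer 0. A hedged sketch of a caller, assuming an existing gallocr and graph; reserve_with_leaf_ids is an illustrative name, and passing NULL for either array keeps the default of buffer 0:

#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"

// sketch: reserve a graph with explicit per-node and per-leaf buffer ids;
// here every id is left at 0, which matches what NULL would do
static bool reserve_with_leaf_ids(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    int * node_ids = calloc(graph->n_nodes, sizeof(int));
    int * leaf_ids = calloc(graph->n_leafs, sizeof(int));
    bool ok = node_ids && leaf_ids && ggml_gallocr_reserve_n(galloc, graph, node_ids, leaf_ids);
    free(node_ids);
    free(leaf_ids);
    return ok;
}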
@@ -86,31 +86,43 @@ extern "C" {
         // (optional) asynchronous tensor data access
         void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-        // compute graph with a plan
+        // compute graph with a plan (not used currently)
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
-        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+        void (*GGML_CALL event_free) (ggml_backend_event_t event);
+        void (*GGML_CALL event_record) (ggml_backend_event_t event);
+        void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+        void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
 
     struct ggml_backend {
         ggml_guid_t guid;
 
         struct ggml_backend_i iface;
-
         ggml_backend_context_t context;
     };
 
+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
     //
     // Backend registry
     //
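In the ggml-backend-impl.h hunk above, graph_compute and graph_plan_compute now report an enum ggml_status instead of a bare bool, cpy_tensor_async receives both the source and the destination backend, and the interface grows an optional event API backed by the new struct ggml_backend_event. Through the public wrapper this surfaces as a status code; a small sketch, assuming the ggml_status values that ggml.h defines in this release (run_graph is an illustrative name):

#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// sketch: run a graph and check the status code introduced in this release;
// backend and graph are assumed to be set up by the caller
static int run_graph(ggml_backend_t backend, struct ggml_cgraph * graph) {
    enum ggml_status st = ggml_backend_graph_compute(backend, graph);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph_compute failed with status %d\n", (int) st);
        return 1;
    }
    return 0;
}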