llama_cpp 0.14.0 → 0.14.2

@@ -9,6 +9,7 @@ extern "C" {
 
  typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
  typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+ typedef struct ggml_backend_event * ggml_backend_event_t;
  typedef struct ggml_backend * ggml_backend_t;
  typedef void * ggml_backend_graph_plan_t;
 
@@ -72,11 +73,24 @@ extern "C" {
  GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
  GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
+ GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
  // tensor copy between different backends
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
- GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
+
+ // asynchronous copy
+ // the copy is performed after all the currently queued operations in backend_src
+ // backend_dst will wait for the copy to complete before performing other operations
+ // automatic fallback to sync copy if async is not supported
+ GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+ // events
+ GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
+ GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
 
  //
  // CPU backend
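
The event and asynchronous entry points added above can be combined to overlap work between backends. Below is a minimal sketch, not part of the diff: backend_gpu, backend_cpu, the graph gf, and the tensors t_gpu and t_cpu are assumed to have been created elsewhere with the existing ggml-backend API.

    // queue the graph on the GPU backend without blocking the calling thread
    ggml_backend_graph_compute_async(backend_gpu, gf);

    // enqueue a copy of the result to the CPU backend; falls back to a
    // synchronous copy if either backend does not support async transfers
    ggml_backend_tensor_copy_async(backend_gpu, backend_cpu, t_gpu, t_cpu);

    // record a point in backend_gpu's queue, then block the host until it is reached
    // (ggml_backend_event_wait(backend_cpu, event) would wait asynchronously instead)
    ggml_backend_event_t event = ggml_backend_event_new(backend_gpu);
    ggml_backend_event_record(event);
    ggml_backend_event_synchronize(event);
    ggml_backend_event_free(event);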
@@ -123,27 +137,31 @@ extern "C" {
  /*
  Example usage:
 
- sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
- // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+ // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
+ // preferably to run on the same backend as the buffer
+ ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
- // initialize buffers from a measure graph
- measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+ sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
 
- // in build_graph:
- build_graph(...) {
-     // manually assign nodes to a backend (optional, should not be needed in most cases)
-     struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-     ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
- }
+ // initialize buffers from a max size graph (optional)
+ reserve_graph = build_graph(sched, max_batch_size);
 
- // allocate backend buffers from measure graph
- ggml_backend_sched_init_measure(sched, measure_graph);
+ // manually assign nodes to a backend (optional, should not be needed in most cases)
+ struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+ ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
 
- // the scheduler is now ready to compute graphs
+ ggml_backend_sched_reserve(sched, reserve_graph);
 
  // compute
  graph = build_graph(sched);
  ggml_backend_sched_graph_compute(sched, graph);
+
+ // if there are graph inputs:
+ ggml_backend_sched_reset(sched);
+ ggml_backend_sched_alloc_graph(sched, graph);
+ ggml_backend_tensor_set(input_tensor, ...);
+ ggml_backend_sched_graph_compute(sched, graph);
+ }
 
  */
 
  struct ggml_backend_sched;
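
The example-usage comment above is pseudo-code (brace-initializer syntax, elided arguments). A compilable approximation of the same flow, assuming backend_gpu, backend_cpu, a build_graph() helper, max_batch_size, n_batch, input_tensor and input_data are provided by the caller, could look like this:

    ggml_backend_t backends[] = { backend_gpu, backend_cpu };

    // NULL buffer types selects each backend's default buffer type; parallel = false
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, false);

    // reserve backend buffers once from a worst-case graph (optional)
    struct ggml_cgraph * reserve_graph = build_graph(sched, max_batch_size);
    ggml_backend_sched_reserve(sched, reserve_graph);

    // per iteration: reset, allocate, set inputs, compute
    struct ggml_cgraph * graph = build_graph(sched, n_batch);
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_alloc_graph(sched, graph);
    ggml_backend_tensor_set(input_tensor, input_data, 0, ggml_nbytes(input_tensor));
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);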
@@ -158,20 +176,26 @@ extern "C" {
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
  // Initialize a backend scheduler
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
  // Initialize backend buffers from a measure graph
  GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
  // Get the number of splits of the last graph
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+ GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
  GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
- GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
- GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+ GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+ GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
  // Allocate and compute graph on the backend scheduler
+ GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
  GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
 
  // Reset all assignments and allocators - must be called before changing the node backends
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
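
The new asynchronous scheduler path pairs ggml_backend_sched_graph_compute_async with ggml_backend_sched_synchronize. A short sketch, assuming sched and graph were set up as in the previous example:

    enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);
    if (status == GGML_STATUS_SUCCESS) {
        // host-side work can overlap here while the backends run the graph

        // block until every backend managed by the scheduler has finished
        ggml_backend_sched_synchronize(sched);

        // diagnostics for the last computed graph
        printf("splits: %d, copies: %d\n",
               ggml_backend_sched_get_n_splits(sched),
               ggml_backend_sched_get_n_copies(sched));
    }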