llama_cpp 0.13.0 → 0.14.1 (vendored ggml-backend.h)

@@ -9,6 +9,7 @@ extern "C" {
 
  typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
  typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+ typedef struct ggml_backend_event * ggml_backend_event_t;
  typedef struct ggml_backend * ggml_backend_t;
  typedef void * ggml_backend_graph_plan_t;
 
@@ -66,16 +67,30 @@ extern "C" {
 
  GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
- GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
- GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
+ GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
  // tensor copy between different backends
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
- GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
+
+ // asynchronous copy
+ // the copy is performed after all the currently queued operations in backend_src
+ // backend_dst will wait for the copy to complete before performing other operations
+ // automatic fallback to sync copy if async is not supported
+ GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+ // events
+ GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
+ GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
 
  //
  // CPU backend
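
The compute entry points now report an enum ggml_status instead of a bool, cross-backend copies take an explicit source and destination backend, and the new event API lets one backend wait on another. A minimal sketch of how the pieces fit together; the backends, graph, and tensors are assumed to be created and allocated elsewhere, and all names below are illustrative rather than part of the header:

#include "ggml-backend.h"

// hedged sketch, not from the header: backend_src/backend_dst, the graph and
// the src/dst tensors are assumed to be set up and allocated beforehand
static void compute_and_copy(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                             struct ggml_cgraph * graph,
                             struct ggml_tensor * src, struct ggml_tensor * dst) {
    // queue the graph on the source backend without blocking the host
    // (the synchronous ggml_backend_graph_compute now returns enum ggml_status)
    if (!ggml_backend_graph_compute_async(backend_src, graph)) {
        return;
    }

    // queued after the work already submitted to backend_src; backend_dst waits
    // for the copy before other operations, with a fallback to a sync copy
    ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);

    // events give explicit control over the same kind of cross-backend ordering
    ggml_backend_event_t event = ggml_backend_event_new(backend_src);
    ggml_backend_event_record(event);            // mark this point in backend_src's queue
    ggml_backend_event_wait(backend_dst, event); // async wait on backend_dst
    ggml_backend_event_synchronize(event);       // block the host until the event fires
    ggml_backend_event_free(event);
}
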
@@ -122,27 +137,31 @@ extern "C" {
  /*
  Example usage:
 
- sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
- // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+ // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
+ // preferably to run on the same backend as the buffer
+ ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
- // initialize buffers from a measure graph
- measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+ sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
 
- // in build_graph:
- build_graph(...) {
- // manually assign nodes to a backend (optional, should not be needed in most cases)
- struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
- ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
- }
+ // initialize buffers from a max size graph (optional)
+ reserve_graph = build_graph(sched, max_batch_size);
 
- // allocate backend buffers from measure graph
- ggml_backend_sched_init_measure(sched, measure_graph);
+ // manually assign nodes to a backend (optional, should not be needed in most cases)
+ struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+ ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
 
- // the scheduler is now ready to compute graphs
+ ggml_backend_sched_reserve(sched, reserve_graph);
 
  // compute
  graph = build_graph(sched);
  ggml_backend_sched_graph_compute(sched, graph);
+
+ // if there are graph inputs:
+ ggml_backend_sched_reset(sched);
+ ggml_backend_sched_alloc_graph(sched, graph);
+ ggml_backend_tensor_set(input_tensor, ...);
+ ggml_backend_sched_graph_compute(sched, graph);
+ }
 
  */
 
  struct ggml_backend_sched;
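
Outside of a comment, the brace-literal in the usage example above becomes a plain array, and the two new parameters fix the maximum graph size and opt in to parallel execution of splits. A minimal sketch, assuming backend_gpu and backend_cpu were created earlier (the names are assumptions, not part of the header):

// illustrative setup for the new five-argument constructor
ggml_backend_t backends[] = { backend_gpu, backend_cpu };

ggml_backend_sched_t sched = ggml_backend_sched_new(
    backends,
    NULL,                    // bufts: NULL selects each backend's default buffer type
    2,                       // n_backends
    GGML_DEFAULT_GRAPH_SIZE, // largest graph the scheduler must accommodate
    false);                  // parallel: off, splits run sequentially
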
@@ -157,26 +176,32 @@ extern "C" {
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
  // Initialize a backend scheduler
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
- GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+ GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
  // Initialize backend buffers from a measure graph
- GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+ GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
  // Get the number of splits of the last graph
- GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+ GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+ GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
- GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
- GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+ GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+ GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
  // Allocate and compute graph on the backend scheduler
- GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
 
  // Reset all assignments and allocators - must be called before changing the node backends
- GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
+ GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
  // Set a callback to be called for each resulting node during graph compute
- GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+ GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
  //
  // Utils
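
Allocation is now split out of compute, which is what makes the "graph inputs" pattern from the usage comment work: reset, allocate, write inputs, then compute; the async variant pairs with the new synchronize call. A hedged sketch of that per-iteration flow, with sched, graph, input, and data assumed from earlier setup:

// per-iteration flow when the graph has input tensors (names assumed)
ggml_backend_sched_reset(sched);              // clear previous assignments/allocations
ggml_backend_sched_alloc_graph(sched, graph); // allocate before writing the inputs
ggml_backend_tensor_set(input, data, 0, ggml_nbytes(input));

// the async variant also returns enum ggml_status;
// synchronize waits for all backends to finish
enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);
if (status == GGML_STATUS_SUCCESS) {
    ggml_backend_sched_synchronize(sched);
}
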