llama_cpp 0.12.7 → 0.14.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/sig/llama_cpp.rbs
CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
|
|
27
27
|
LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
|
28
28
|
LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
|
29
29
|
LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
|
30
|
-
|
30
|
+
LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
|
31
31
|
LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
|
32
32
|
LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
|
33
33
|
LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
35
|
+
LLAMA_KV_OVERRIDE_TYPE_INT: Integer
|
36
|
+
LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
|
37
|
+
LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
|
38
38
|
|
39
39
|
LLAMA_GRETYPE_END: Integer
|
40
40
|
LLAMA_GRETYPE_ALT: Integer
|
@@ -44,19 +44,20 @@ module LLaMACpp
|
|
44
44
|
LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
|
45
45
|
LLAMA_GRETYPE_CHAR_ALT: Integer
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
47
|
+
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
|
48
|
+
LLAMA_ROPE_SCALING_TYPE_NONE: Integer
|
49
|
+
LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
|
50
|
+
LLAMA_ROPE_SCALING_TYPE_YARN: Integer
|
51
|
+
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
|
52
52
|
|
53
|
-
|
54
|
-
|
55
|
-
|
53
|
+
LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
|
54
|
+
LLAMA_POOLING_TYPE_NONE: Integer
|
55
|
+
LLAMA_POOLING_TYPE_MEAN: Integer
|
56
|
+
LLAMA_POOLING_TYPE_CLS: Integer
|
56
57
|
|
57
|
-
|
58
|
-
|
59
|
-
|
58
|
+
LLAMA_SPLIT_MODE_NONE: Integer
|
59
|
+
LLAMA_SPLIT_MODE_LAYER: Integer
|
60
|
+
LLAMA_SPLIT_MODE_ROW: Integer
|
60
61
|
|
61
62
|
def self?.backend_init: () -> void
|
62
63
|
def self?.backend_free: () -> void
|
@@ -68,8 +69,6 @@ module LLaMACpp
|
|
68
69
|
?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
|
69
70
|
def self?.print_system_info: () -> void
|
70
71
|
def self?.time_us: () -> Integer
|
71
|
-
def self?.mmap_supported?: () -> bool
|
72
|
-
def self?.mlock_supported?: () -> bool
|
73
72
|
def self?.max_devices: () -> Integer
|
74
73
|
def self?.supports_mmap?: () -> bool
|
75
74
|
def self?.supports_mlock?: () -> bool
|
@@ -103,7 +102,8 @@ module LLaMACpp
|
|
103
102
|
def empty?: () -> bool
|
104
103
|
def free: () -> void
|
105
104
|
def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
|
106
|
-
def
|
105
|
+
def vocab_type: () -> Integer
|
106
|
+
def rope_type: () -> Integer
|
107
107
|
def n_vocab: () -> Integer
|
108
108
|
def n_ctx_train: () -> Integer
|
109
109
|
def n_embd: () -> Integer
|
@@ -202,8 +202,7 @@ module LLaMACpp
|
|
202
202
|
def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
|
203
203
|
def embeddings: () -> Array[Float]
|
204
204
|
def embeddings_ith: (Integer) -> Array[Float]
|
205
|
-
def
|
206
|
-
def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
|
205
|
+
def embeddings_seq: (Integer) -> Array[Float]
|
207
206
|
def decode: (::LLaMACpp::Batch) -> void
|
208
207
|
def logits: () -> Array[Float]
|
209
208
|
def n_ctx: () -> Integer
|
@@ -216,14 +215,16 @@ module LLaMACpp
|
|
216
215
|
def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
|
217
216
|
def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
|
218
217
|
def kv_cache_seq_keep: (Integer) -> void
|
219
|
-
def
|
218
|
+
def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
|
220
219
|
def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
|
220
|
+
def kv_cache_seq_pos_max: (Integer) -> Integer
|
221
|
+
def kv_cache_defrag: () -> void
|
222
|
+
def kv_cache_update: () -> void
|
221
223
|
def set_rng_seed: (Integer) -> void
|
222
224
|
def load_session_file: (session_path: String) -> void
|
223
225
|
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
224
226
|
def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
|
225
227
|
def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
|
226
|
-
def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
|
227
228
|
def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
|
228
229
|
def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
|
229
230
|
def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
@@ -232,7 +233,6 @@ module LLaMACpp
|
|
232
233
|
def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
233
234
|
def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
|
234
235
|
def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
|
235
|
-
def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
|
236
236
|
def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
|
237
237
|
def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
|
238
238
|
def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
|
@@ -256,6 +256,8 @@ module LLaMACpp
|
|
256
256
|
def n_threads_batch=: (Integer) -> Integer
|
257
257
|
def rope_scaling_type=: (Integer) -> Integer
|
258
258
|
def rope_scaling_type: () -> Integer
|
259
|
+
def pooling_type=: (Integer) -> Integer
|
260
|
+
def pooling_type: () -> Integer
|
259
261
|
def rope_freq_base=: (Float) -> Float
|
260
262
|
def rope_freq_base: () -> Float
|
261
263
|
def rope_freq_scale=: (Float) -> Float
|
@@ -270,20 +272,18 @@ module LLaMACpp
|
|
270
272
|
def yarn_beta_slow: () -> Float
|
271
273
|
def yarn_orig_ctx=: (Integer) -> Integer
|
272
274
|
def yarn_orig_ctx: () -> Integer
|
275
|
+
def defrag_thold=: (Float) -> Float
|
276
|
+
def defrag_thold: () -> Float
|
273
277
|
def type_k=: (Integer) -> Integer
|
274
278
|
def type_k: () -> Integer
|
275
279
|
def type_v=: (Integer) -> Integer
|
276
280
|
def type_v: () -> Integer
|
277
|
-
def mul_mat_q: () -> bool
|
278
|
-
def mul_mat_q=: (bool) -> bool
|
279
281
|
def logits_all: () -> bool
|
280
282
|
def logits_all=: (bool) -> bool
|
281
|
-
def
|
282
|
-
def
|
283
|
+
def embeddings: () -> bool
|
284
|
+
def embeddings=: (bool) -> bool
|
283
285
|
def offload_kqv: () -> bool
|
284
286
|
def offload_kqv=: (bool) -> bool
|
285
|
-
def do_pooling: () -> bool
|
286
|
-
def do_pooling=: (bool) -> bool
|
287
287
|
end
|
288
288
|
|
289
289
|
class ModelQuantizeParams
|
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
|
|
383
383
|
endif # LLAMA_BLIS
|
384
384
|
|
385
385
|
ifdef LLAMA_CUBLAS
|
386
|
-
|
387
|
-
|
386
|
+
ifneq ('', '$(wildcard /opt/cuda)')
|
387
|
+
CUDA_PATH ?= /opt/cuda
|
388
|
+
else
|
389
|
+
CUDA_PATH ?= /usr/local/cuda
|
390
|
+
endif
|
391
|
+
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
392
|
+
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
388
393
|
OBJS += ggml-cuda.o
|
389
394
|
MK_NVCCFLAGS += -use_fast_math
|
390
395
|
ifdef LLAMA_FATAL_WARNINGS
|
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
|
599
604
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
600
605
|
ifdef LLAMA_CUBLAS
|
601
606
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
602
|
-
CUDA_VERSION := $(shell
|
607
|
+
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
603
608
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
604
609
|
ifndef CUDA_DOCKER_ARCH
|
605
610
|
ifndef CUDA_POWER_ARCH
|
@@ -724,10 +729,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
724
729
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
725
730
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
726
731
|
|
727
|
-
server: examples/server/server.cpp examples/server/
|
732
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
728
733
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
729
|
-
$(CXX) $(CXXFLAGS) -
|
730
|
-
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
734
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
731
735
|
|
732
736
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
733
737
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
@@ -91,19 +91,22 @@ extern "C" {
|
|
91
91
|
// (optional) complete all pending operations
|
92
92
|
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
93
93
|
|
94
|
-
//
|
94
|
+
// create a plan for ggml_cgraph and free it
|
95
95
|
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
96
96
|
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
97
|
-
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
98
97
|
|
98
|
+
// compute graph with a plan
|
99
|
+
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
99
100
|
// compute graph without a plan (async)
|
100
|
-
|
101
|
+
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
101
102
|
|
102
103
|
// check if the backend supports an operation
|
103
104
|
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
104
105
|
};
|
105
106
|
|
106
107
|
struct ggml_backend {
|
108
|
+
ggml_guid_t guid;
|
109
|
+
|
107
110
|
struct ggml_backend_i iface;
|
108
111
|
|
109
112
|
ggml_backend_context_t context;
|
@@ -12,7 +12,6 @@
|
|
12
12
|
|
13
13
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
14
14
|
|
15
|
-
|
16
15
|
// backend buffer type
|
17
16
|
|
18
17
|
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
@@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml
|
|
159
158
|
|
160
159
|
// backend
|
161
160
|
|
161
|
+
ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
|
162
|
+
if (backend == NULL) {
|
163
|
+
return NULL;
|
164
|
+
}
|
165
|
+
return backend->guid;
|
166
|
+
}
|
167
|
+
|
162
168
|
const char * ggml_backend_name(ggml_backend_t backend) {
|
163
169
|
if (backend == NULL) {
|
164
170
|
return "NULL";
|
@@ -256,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
|
|
256
262
|
backend->iface.graph_plan_free(backend, plan);
|
257
263
|
}
|
258
264
|
|
259
|
-
|
260
|
-
backend->iface.graph_plan_compute(backend, plan);
|
265
|
+
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
266
|
+
return backend->iface.graph_plan_compute(backend, plan);
|
261
267
|
}
|
262
268
|
|
263
|
-
|
269
|
+
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
264
270
|
return backend->iface.graph_compute(backend, cgraph);
|
265
271
|
}
|
266
272
|
|
@@ -726,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
|
|
726
732
|
GGML_UNUSED(backend);
|
727
733
|
}
|
728
734
|
|
729
|
-
GGML_CALL static
|
735
|
+
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
730
736
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
731
737
|
|
732
|
-
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
738
|
+
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
733
739
|
|
734
740
|
GGML_UNUSED(backend);
|
735
741
|
}
|
736
742
|
|
737
|
-
GGML_CALL static
|
743
|
+
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
738
744
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
739
745
|
|
740
746
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
@@ -749,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
|
|
749
755
|
cplan.abort_callback = cpu_ctx->abort_callback;
|
750
756
|
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
751
757
|
|
752
|
-
ggml_graph_compute(cgraph, &cplan);
|
753
|
-
return true;
|
758
|
+
return ggml_graph_compute(cgraph, &cplan);
|
754
759
|
}
|
755
760
|
|
756
761
|
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
@@ -781,6 +786,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
781
786
|
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
782
787
|
};
|
783
788
|
|
789
|
+
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
790
|
+
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
791
|
+
return &guid;
|
792
|
+
}
|
793
|
+
|
784
794
|
ggml_backend_t ggml_backend_cpu_init(void) {
|
785
795
|
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
786
796
|
if (ctx == NULL) {
|
@@ -800,6 +810,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
800
810
|
}
|
801
811
|
|
802
812
|
*cpu_backend = (struct ggml_backend) {
|
813
|
+
/* .guid = */ ggml_backend_cpu_guid(),
|
803
814
|
/* .interface = */ cpu_backend_i,
|
804
815
|
/* .context = */ ctx
|
805
816
|
};
|
@@ -807,7 +818,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
807
818
|
}
|
808
819
|
|
809
820
|
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
810
|
-
return backend && backend->
|
821
|
+
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
811
822
|
}
|
812
823
|
|
813
824
|
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
@@ -1425,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
1425
1436
|
return true;
|
1426
1437
|
}
|
1427
1438
|
|
1428
|
-
static
|
1439
|
+
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
1429
1440
|
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
|
1430
1441
|
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
|
1431
1442
|
|
@@ -1460,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
|
1460
1471
|
|
1461
1472
|
uint64_t compute_start_us = ggml_time_us();
|
1462
1473
|
if (!sched->callback_eval) {
|
1463
|
-
|
1464
|
-
|
1474
|
+
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
|
1475
|
+
if (ec != GGML_STATUS_SUCCESS) {
|
1476
|
+
return ec;
|
1465
1477
|
}
|
1466
1478
|
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
1467
1479
|
} else {
|
@@ -1482,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
|
1482
1494
|
|
1483
1495
|
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
1484
1496
|
|
1485
|
-
|
1486
|
-
|
1497
|
+
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
|
1498
|
+
if (ec != GGML_STATUS_SUCCESS) {
|
1499
|
+
return ec;
|
1487
1500
|
}
|
1488
1501
|
|
1489
1502
|
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
@@ -1507,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
|
1507
1520
|
}
|
1508
1521
|
#endif
|
1509
1522
|
|
1510
|
-
return
|
1523
|
+
return GGML_STATUS_SUCCESS;
|
1511
1524
|
}
|
1512
1525
|
|
1513
1526
|
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
@@ -1569,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
|
1569
1582
|
return true;
|
1570
1583
|
}
|
1571
1584
|
|
1572
|
-
|
1585
|
+
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
1573
1586
|
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
1574
1587
|
|
1575
1588
|
if (!sched->is_reset) {
|
@@ -1578,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
|
|
1578
1591
|
|
1579
1592
|
ggml_backend_sched_split_graph(sched, graph);
|
1580
1593
|
if (!ggml_backend_sched_alloc_splits(sched)) {
|
1581
|
-
return
|
1594
|
+
return GGML_STATUS_ALLOC_FAILED;
|
1582
1595
|
}
|
1583
1596
|
|
1584
|
-
|
1585
|
-
return false;
|
1586
|
-
}
|
1587
|
-
|
1588
|
-
return true;
|
1597
|
+
return ggml_backend_sched_compute_splits(sched);
|
1589
1598
|
}
|
1590
1599
|
|
1591
1600
|
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
@@ -49,7 +49,7 @@ extern "C" {
|
|
49
49
|
// Backend
|
50
50
|
//
|
51
51
|
|
52
|
-
|
52
|
+
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
|
53
53
|
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
54
54
|
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
55
55
|
|
@@ -66,12 +66,13 @@ extern "C" {
|
|
66
66
|
|
67
67
|
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
68
68
|
|
69
|
-
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create
|
69
|
+
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
70
|
+
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
71
|
+
|
72
|
+
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
73
|
+
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
70
74
|
|
71
|
-
GGML_API
|
72
|
-
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
73
|
-
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
74
|
-
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
75
|
+
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
75
76
|
|
76
77
|
// tensor copy between different backends
|
77
78
|
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
@@ -157,26 +158,26 @@ extern "C" {
|
|
157
158
|
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
158
159
|
|
159
160
|
// Initialize a backend scheduler
|
160
|
-
GGML_API ggml_backend_sched_t
|
161
|
-
GGML_API void
|
161
|
+
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
162
|
+
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
162
163
|
// Initialize backend buffers from a measure graph
|
163
|
-
GGML_API bool
|
164
|
+
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
164
165
|
// Get the number of splits of the last graph
|
165
|
-
GGML_API int
|
166
|
+
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
166
167
|
|
167
|
-
GGML_API size_t
|
168
|
+
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
168
169
|
|
169
|
-
GGML_API void
|
170
|
-
GGML_API ggml_backend_t
|
170
|
+
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
171
|
+
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
171
172
|
|
172
173
|
// Allocate and compute graph on the backend scheduler
|
173
|
-
GGML_API
|
174
|
+
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
174
175
|
|
175
176
|
// Reset all assignments and allocators - must be called before changing the node backends
|
176
|
-
GGML_API void
|
177
|
+
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
177
178
|
|
178
179
|
// Set a callback to be called for each resulting node during graph compute
|
179
|
-
GGML_API void
|
180
|
+
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
|
180
181
|
|
181
182
|
//
|
182
183
|
// Utils
|