llama_cpp 0.12.7 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/sig/llama_cpp.rbs
CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
-  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

-  LLAMA_KV_OVERRIDE_INT: Integer
-  LLAMA_KV_OVERRIDE_FLOAT: Integer
-  LLAMA_KV_OVERRIDE_BOOL: Integer
+  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+  LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer

   LLAMA_GRETYPE_END: Integer
   LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,20 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer

-  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
-  LLAMA_ROPE_SCALING_NONE: Integer
-  LLAMA_ROPE_SCALING_LINEAR: Integer
-  LLAMA_ROPE_SCALING_YARN: Integer
-  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+  LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+  LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+  LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+  LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

-  LLAMA_POOLING_NONE: Integer
-  LLAMA_POOLING_MEAN: Integer
-  LLAMA_POOLING_CLS: Integer
+  LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
+  LLAMA_POOLING_TYPE_NONE: Integer
+  LLAMA_POOLING_TYPE_MEAN: Integer
+  LLAMA_POOLING_TYPE_CLS: Integer

-  LLAMA_SPLIT_NONE: Integer
-  LLAMA_SPLIT_LAYER: Integer
-  LLAMA_SPLIT_ROW: Integer
+  LLAMA_SPLIT_MODE_NONE: Integer
+  LLAMA_SPLIT_MODE_LAYER: Integer
+  LLAMA_SPLIT_MODE_ROW: Integer

   def self?.backend_init: () -> void
   def self?.backend_free: () -> void
@@ -68,8 +69,6 @@ module LLaMACpp
   ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.time_us: () -> Integer
-  def self?.mmap_supported?: () -> bool
-  def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
   def self?.supports_mmap?: () -> bool
   def self?.supports_mlock?: () -> bool
@@ -103,7 +102,8 @@ module LLaMACpp
   def empty?: () -> bool
   def free: () -> void
   def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
-  def
+  def vocab_type: () -> Integer
+  def rope_type: () -> Integer
   def n_vocab: () -> Integer
   def n_ctx_train: () -> Integer
   def n_embd: () -> Integer
@@ -202,8 +202,7 @@ module LLaMACpp
   def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
   def embeddings: () -> Array[Float]
   def embeddings_ith: (Integer) -> Array[Float]
-  def
-  def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
+  def embeddings_seq: (Integer) -> Array[Float]
   def decode: (::LLaMACpp::Batch) -> void
   def logits: () -> Array[Float]
   def n_ctx: () -> Integer
@@ -216,14 +215,16 @@ module LLaMACpp
   def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
   def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
   def kv_cache_seq_keep: (Integer) -> void
-  def
+  def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
   def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+  def kv_cache_seq_pos_max: (Integer) -> Integer
+  def kv_cache_defrag: () -> void
+  def kv_cache_update: () -> void
   def set_rng_seed: (Integer) -> void
   def load_session_file: (session_path: String) -> void
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
   def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
   def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
-  def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
   def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
   def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
   def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -232,7 +233,6 @@ module LLaMACpp
   def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
   def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
   def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
-  def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
   def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
   def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
   def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -256,6 +256,8 @@ module LLaMACpp
   def n_threads_batch=: (Integer) -> Integer
   def rope_scaling_type=: (Integer) -> Integer
   def rope_scaling_type: () -> Integer
+  def pooling_type=: (Integer) -> Integer
+  def pooling_type: () -> Integer
   def rope_freq_base=: (Float) -> Float
   def rope_freq_base: () -> Float
   def rope_freq_scale=: (Float) -> Float
@@ -270,20 +272,18 @@ module LLaMACpp
   def yarn_beta_slow: () -> Float
   def yarn_orig_ctx=: (Integer) -> Integer
   def yarn_orig_ctx: () -> Integer
+  def defrag_thold=: (Float) -> Float
+  def defrag_thold: () -> Float
   def type_k=: (Integer) -> Integer
   def type_k: () -> Integer
   def type_v=: (Integer) -> Integer
   def type_v: () -> Integer
-  def mul_mat_q: () -> bool
-  def mul_mat_q=: (bool) -> bool
   def logits_all: () -> bool
   def logits_all=: (bool) -> bool
-  def
-  def
+  def embeddings: () -> bool
+  def embeddings=: (bool) -> bool
   def offload_kqv: () -> bool
   def offload_kqv=: (bool) -> bool
-  def do_pooling: () -> bool
-  def do_pooling=: (bool) -> bool
 end

 class ModelQuantizeParams
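The signature changes above track this release's vendored llama.h: the KV-cache helpers gain `kv_cache_seq_add`, `kv_cache_seq_pos_max`, `kv_cache_defrag`, and `kv_cache_update`, which bind the C functions of the same names. A minimal sketch of those underlying calls, assuming only what the new rbs signatures show (the `shift_and_defrag` helper and the position values are illustrative; error handling is elided):

```c
#include "llama.h"

// Hypothetical helper showing the llama.h calls behind the new
// kv_cache_* signatures in the rbs diff above.
static void shift_and_defrag(struct llama_context * ctx, llama_seq_id seq) {
    // shift the cached positions [0, 32) of sequence `seq` forward by 4
    llama_kv_cache_seq_add(ctx, seq, 0, 32, 4);

    // highest position currently stored for `seq`
    llama_pos max_pos = llama_kv_cache_seq_pos_max(ctx, seq);
    (void) max_pos;

    // request a defragmentation, then apply pending cache operations
    llama_kv_cache_defrag(ctx);
    llama_kv_cache_update(ctx);
}
```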
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-
-
+	ifneq ('', '$(wildcard /opt/cuda)')
+		CUDA_PATH ?= /opt/cuda
+	else
+		CUDA_PATH ?= /usr/local/cuda
+	endif
+	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
-CUDA_VERSION := $(shell
+CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
@@ -724,10 +729,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -91,19 +91,22 @@ extern "C" {
         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);

-        //
+        // create a plan for ggml_cgraph and free it
        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
        void (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        // compute graph without a plan (async)
-
+        enum ggml_status (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
    };

    struct ggml_backend {
+        ggml_guid_t guid;
+
        struct ggml_backend_i iface;

        ggml_backend_context_t context;
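With the interface change above, a backend's compute hooks report an `enum ggml_status` (introduced in this release's ggml.h) instead of returning nothing or a bare bool. A minimal sketch of a conforming `graph_compute` hook for a hypothetical CPU-like backend (the `my_backend_*` names are illustrative, not part of the library):

```c
#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"
#include "ggml-backend-impl.h"

struct my_backend_context { int n_threads; };

// Illustrative graph_compute hook: plan the graph, allocate the work
// buffer, and forward the status from ggml_graph_compute to the caller.
GGML_CALL static enum ggml_status my_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct my_backend_context * ctx = (struct my_backend_context *) backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, ctx->n_threads);

    uint8_t * work_data = NULL;
    if (cplan.work_size > 0) {
        work_data = malloc(cplan.work_size);
        if (work_data == NULL) {
            return GGML_STATUS_ALLOC_FAILED;
        }
        cplan.work_data = work_data;
    }

    enum ggml_status status = ggml_graph_compute(cgraph, &cplan);
    free(work_data);
    return status;
}
```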
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -12,7 +12,6 @@

 #define MAX(a, b) ((a) > (b) ? (a) : (b))

-
 // backend buffer type

 const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml

 // backend

+ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return NULL;
+    }
+    return backend->guid;
+}
+
 const char * ggml_backend_name(ggml_backend_t backend) {
     if (backend == NULL) {
         return "NULL";
@@ -256,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
     backend->iface.graph_plan_free(backend, plan);
 }

-void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    backend->iface.graph_plan_compute(backend, plan);
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    return backend->iface.graph_plan_compute(backend, plan);
 }

-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }

@@ -726,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
     GGML_UNUSED(backend);
 }

-GGML_CALL static
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

     GGML_UNUSED(backend);
 }

-GGML_CALL static
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -749,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
     cplan.abort_callback      = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;

-    ggml_graph_compute(cgraph, &cplan);
-    return true;
+    return ggml_graph_compute(cgraph, &cplan);
 }

 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
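The CPU backend change above works because `ggml_graph_compute` itself now returns `enum ggml_status` (part of the ggml.h changes in this diff), so the status is forwarded instead of discarded. Since the CPU backend also wires through an abort callback (the `cplan.abort_callback` lines above), a caller can now tell a deliberate abort apart from a failure. A hedged sketch, assuming the `ggml_backend_cpu_set_abort_callback` setter from this vendored ggml-backend.h; `stop_requested` is a hypothetical flag:

```c
#include <stdbool.h>
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical abort callback: returning true stops the computation,
// which now surfaces as GGML_STATUS_ABORTED at the call site.
static bool stop_cb(void * data) {
    return *(bool *) data;
}

enum ggml_status compute_with_abort(ggml_backend_t cpu_backend, struct ggml_cgraph * graph, bool * stop_requested) {
    ggml_backend_cpu_set_abort_callback(cpu_backend, stop_cb, stop_requested);

    enum ggml_status status = ggml_backend_graph_compute(cpu_backend, graph);
    if (status == GGML_STATUS_ABORTED) {
        // stopped by the callback, not a failure in the graph itself
    }
    return status;
}
```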
@@ -781,6 +786,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .supports_op = */ ggml_backend_cpu_supports_op,
 };

+static ggml_guid_t ggml_backend_cpu_guid(void) {
+    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+    return &guid;
+}
+
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
     if (ctx == NULL) {
@@ -800,6 +810,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     }

     *cpu_backend = (struct ggml_backend) {
+        /* .guid      = */ ggml_backend_cpu_guid(),
         /* .interface = */ cpu_backend_i,
         /* .context   = */ ctx
     };
@@ -807,7 +818,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
 }

 GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend && backend->
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
 }

 void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
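The guid machinery above gives each backend type a stable 16-byte identity: `ggml_backend_is_cpu` now compares guids via `ggml_guid_matches` (added to ggml.h in this release) instead of poking at the backend struct. The same pattern for a hypothetical out-of-tree backend, using the new public `ggml_backend_guid` accessor from this diff (the `my_backend_*` names and the guid bytes are made up for the example):

```c
#include <stdbool.h>
#include <stddef.h>
#include "ggml.h"
#include "ggml-backend.h"

// Any fixed 16-byte value can identify a backend type; this one is
// invented for the sketch.
static ggml_guid_t my_backend_guid(void) {
    static ggml_guid guid = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
                              0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe };
    return &guid;
}

bool my_backend_is_mine(ggml_backend_t backend) {
    // identity check by guid, mirroring ggml_backend_is_cpu above
    return backend != NULL && ggml_guid_matches(ggml_backend_guid(backend), my_backend_guid());
}
```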
@@ -1425,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }

-static
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
     uint64_t compute_us[GGML_MAX_BACKENDS] = {0};

@@ -1460,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-
-
+            enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return ec;
             }
             //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
@@ -1482,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

-
-
+                enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
+                if (ec != GGML_STATUS_SUCCESS) {
+                    return ec;
                 }

                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@@ -1507,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     }
 #endif

-    return
+    return GGML_STATUS_SUCCESS;
 }

 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1569,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
     return true;
 }

-
+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

     if (!sched->is_reset) {
@@ -1578,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg

     ggml_backend_sched_split_graph(sched, graph);
     if (!ggml_backend_sched_alloc_splits(sched)) {
-        return
+        return GGML_STATUS_ALLOC_FAILED;
     }

-
-        return false;
-    }
-
-    return true;
+    return ggml_backend_sched_compute_splits(sched);
 }

 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -49,7 +49,7 @@ extern "C" {
     // Backend
     //

-
+    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
     GGML_API void         ggml_backend_free(ggml_backend_t backend);

@@ -66,12 +66,13 @@ extern "C" {

     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);

-    GGML_API
-    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@@ -157,26 +158,26 @@ extern "C" {
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

     // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t
-    GGML_API void
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
     // Initialize backend buffers from a measure graph
-    GGML_API bool
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
     // Get the number of splits of the last graph
-    GGML_API int
+    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

-    GGML_API size_t
+    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

-    GGML_API void
-    GGML_API ggml_backend_t
+    GGML_API void                 ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

     // Allocate and compute graph on the backend scheduler
-    GGML_API
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

     // Reset all assignments and allocators - must be called before changing the node backends
-    GGML_API void
+    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);

     // Set a callback to be called for each resulting node during graph compute
-    GGML_API void
+    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

     //
     // Utils