llama_cpp 0.12.6 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/ext/llama_cpp/llama_cpp.cpp +90 -269
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +28 -23
- data/vendor/tmp/llama.cpp/Makefile +51 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -11
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +191 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +2472 -862
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +3176 -667
- data/vendor/tmp/llama.cpp/ggml-quants.h +77 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +373 -424
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +186 -102
- data/vendor/tmp/llama.cpp/ggml.c +1266 -699
- data/vendor/tmp/llama.cpp/ggml.h +59 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1517 -717
- data/vendor/tmp/llama.cpp/llama.h +87 -63
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/sig/llama_cpp.rbs
CHANGED
|
@@ -27,12 +27,14 @@ module LLaMACpp
|
|
|
27
27
|
LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
|
|
28
28
|
LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
|
|
29
29
|
LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
|
|
30
|
-
|
|
30
|
+
LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
|
|
31
31
|
LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
|
|
32
|
+
LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
|
|
33
|
+
LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
|
|
32
34
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
35
|
+
LLAMA_KV_OVERRIDE_TYPE_INT: Integer
|
|
36
|
+
LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
|
|
37
|
+
LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
|
|
36
38
|
|
|
37
39
|
LLAMA_GRETYPE_END: Integer
|
|
38
40
|
LLAMA_GRETYPE_ALT: Integer
|
|
@@ -42,18 +44,23 @@ module LLaMACpp
|
|
|
42
44
|
LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
|
|
43
45
|
LLAMA_GRETYPE_CHAR_ALT: Integer
|
|
44
46
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
47
|
+
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
|
|
48
|
+
LLAMA_ROPE_SCALING_TYPE_NONE: Integer
|
|
49
|
+
LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
|
|
50
|
+
LLAMA_ROPE_SCALING_TYPE_YARN: Integer
|
|
51
|
+
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
|
|
50
52
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
53
|
+
LLAMA_POOLING_TYPE_NONE: Integer
|
|
54
|
+
LLAMA_POOLING_TYPE_MEAN: Integer
|
|
55
|
+
LLAMA_POOLING_TYPE_CLS: Integer
|
|
54
56
|
|
|
55
|
-
|
|
57
|
+
LLAMA_SPLIT_MODE_NONE: Integer
|
|
58
|
+
LLAMA_SPLIT_MODE_LAYER: Integer
|
|
59
|
+
LLAMA_SPLIT_MODE_ROW: Integer
|
|
60
|
+
|
|
61
|
+
def self?.backend_init: () -> void
|
|
56
62
|
def self?.backend_free: () -> void
|
|
63
|
+
def self?.numa_init: (Integer) -> void
|
|
57
64
|
def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
|
|
58
65
|
def self?.generate: (::LLaMACpp::Context, String,
|
|
59
66
|
?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
|
|
@@ -61,8 +68,6 @@ module LLaMACpp
|
|
|
61
68
|
?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
|
|
62
69
|
def self?.print_system_info: () -> void
|
|
63
70
|
def self?.time_us: () -> Integer
|
|
64
|
-
def self?.mmap_supported?: () -> bool
|
|
65
|
-
def self?.mlock_supported?: () -> bool
|
|
66
71
|
def self?.max_devices: () -> Integer
|
|
67
72
|
def self?.supports_mmap?: () -> bool
|
|
68
73
|
def self?.supports_mlock?: () -> bool
|
|
@@ -96,7 +101,8 @@ module LLaMACpp
|
|
|
96
101
|
def empty?: () -> bool
|
|
97
102
|
def free: () -> void
|
|
98
103
|
def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
|
|
99
|
-
def
|
|
104
|
+
def vocab_type: () -> Integer
|
|
105
|
+
def rope_type: () -> Integer
|
|
100
106
|
def n_vocab: () -> Integer
|
|
101
107
|
def n_ctx_train: () -> Integer
|
|
102
108
|
def n_embd: () -> Integer
|
|
@@ -195,8 +201,6 @@ module LLaMACpp
|
|
|
195
201
|
def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
|
|
196
202
|
def embeddings: () -> Array[Float]
|
|
197
203
|
def embeddings_ith: (Integer) -> Array[Float]
|
|
198
|
-
def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
|
|
199
|
-
def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
|
|
200
204
|
def decode: (::LLaMACpp::Batch) -> void
|
|
201
205
|
def logits: () -> Array[Float]
|
|
202
206
|
def n_ctx: () -> Integer
|
|
@@ -209,14 +213,16 @@ module LLaMACpp
|
|
|
209
213
|
def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
|
|
210
214
|
def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
|
|
211
215
|
def kv_cache_seq_keep: (Integer) -> void
|
|
212
|
-
def
|
|
216
|
+
def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
|
|
213
217
|
def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
|
|
218
|
+
def kv_cache_seq_pos_max: (Integer) -> Integer
|
|
219
|
+
def kv_cache_defrag: () -> void
|
|
220
|
+
def kv_cache_update: () -> void
|
|
214
221
|
def set_rng_seed: (Integer) -> void
|
|
215
222
|
def load_session_file: (session_path: String) -> void
|
|
216
223
|
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
|
217
224
|
def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
|
|
218
225
|
def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
|
|
219
|
-
def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
|
|
220
226
|
def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
|
|
221
227
|
def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
|
|
222
228
|
def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
|
@@ -225,7 +231,6 @@ module LLaMACpp
|
|
|
225
231
|
def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
|
226
232
|
def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
|
|
227
233
|
def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
|
|
228
|
-
def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
|
|
229
234
|
def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
|
|
230
235
|
def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
|
|
231
236
|
def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
|
|
@@ -263,12 +268,12 @@ module LLaMACpp
|
|
|
263
268
|
def yarn_beta_slow: () -> Float
|
|
264
269
|
def yarn_orig_ctx=: (Integer) -> Integer
|
|
265
270
|
def yarn_orig_ctx: () -> Integer
|
|
271
|
+
def defrag_thold=: (Float) -> Float
|
|
272
|
+
def defrag_thold: () -> Float
|
|
266
273
|
def type_k=: (Integer) -> Integer
|
|
267
274
|
def type_k: () -> Integer
|
|
268
275
|
def type_v=: (Integer) -> Integer
|
|
269
276
|
def type_v: () -> Integer
|
|
270
|
-
def mul_mat_q: () -> bool
|
|
271
|
-
def mul_mat_q=: (bool) -> bool
|
|
272
277
|
def logits_all: () -> bool
|
|
273
278
|
def logits_all=: (bool) -> bool
|
|
274
279
|
def embedding: () -> bool
|
|
@@ -97,9 +97,10 @@ endif
|
|
|
97
97
|
#
|
|
98
98
|
|
|
99
99
|
# keep standard at C11 and C++11
|
|
100
|
-
MK_CPPFLAGS
|
|
101
|
-
MK_CFLAGS
|
|
102
|
-
MK_CXXFLAGS
|
|
100
|
+
MK_CPPFLAGS = -I. -Icommon
|
|
101
|
+
MK_CFLAGS = -std=c11 -fPIC
|
|
102
|
+
MK_CXXFLAGS = -std=c++11 -fPIC
|
|
103
|
+
MK_NVCCFLAGS = -std=c++11
|
|
103
104
|
|
|
104
105
|
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
|
105
106
|
ifdef LLAMA_FAST
|
|
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
|
|
|
172
173
|
MK_LDFLAGS += -g
|
|
173
174
|
|
|
174
175
|
ifeq ($(UNAME_S),Linux)
|
|
175
|
-
|
|
176
|
+
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
|
|
176
177
|
endif
|
|
177
178
|
else
|
|
178
179
|
MK_CPPFLAGS += -DNDEBUG
|
|
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
|
|
|
215
216
|
-Werror=implicit-function-declaration
|
|
216
217
|
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
|
|
217
218
|
|
|
219
|
+
ifeq ($(LLAMA_FATAL_WARNINGS),1)
|
|
220
|
+
MK_CFLAGS += -Werror
|
|
221
|
+
MK_CXXFLAGS += -Werror
|
|
222
|
+
endif
|
|
223
|
+
|
|
218
224
|
# this version of Apple ld64 is buggy
|
|
219
225
|
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
|
|
220
226
|
MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
|
|
@@ -377,10 +383,18 @@ ifdef LLAMA_BLIS
|
|
|
377
383
|
endif # LLAMA_BLIS
|
|
378
384
|
|
|
379
385
|
ifdef LLAMA_CUBLAS
|
|
380
|
-
|
|
381
|
-
|
|
386
|
+
ifneq ('', '$(wildcard /opt/cuda)')
|
|
387
|
+
CUDA_PATH ?= /opt/cuda
|
|
388
|
+
else
|
|
389
|
+
CUDA_PATH ?= /usr/local/cuda
|
|
390
|
+
endif
|
|
391
|
+
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
|
392
|
+
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
|
382
393
|
OBJS += ggml-cuda.o
|
|
383
394
|
MK_NVCCFLAGS += -use_fast_math
|
|
395
|
+
ifdef LLAMA_FATAL_WARNINGS
|
|
396
|
+
MK_NVCCFLAGS += -Werror all-warnings
|
|
397
|
+
endif # LLAMA_FATAL_WARNINGS
|
|
384
398
|
ifndef JETSON_EOL_MODULE_DETECT
|
|
385
399
|
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
|
|
386
400
|
endif # JETSON_EOL_MODULE_DETECT
|
|
@@ -439,9 +453,9 @@ ifdef LLAMA_CUDA_CCBIN
|
|
|
439
453
|
endif
|
|
440
454
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
|
441
455
|
ifdef JETSON_EOL_MODULE_DETECT
|
|
442
|
-
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
|
456
|
+
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
|
443
457
|
else
|
|
444
|
-
$(NVCC) $(
|
|
458
|
+
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
|
445
459
|
endif # JETSON_EOL_MODULE_DETECT
|
|
446
460
|
endif # LLAMA_CUBLAS
|
|
447
461
|
|
|
@@ -526,11 +540,29 @@ ifdef LLAMA_METAL
|
|
|
526
540
|
ifdef LLAMA_METAL_NDEBUG
|
|
527
541
|
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
|
528
542
|
endif
|
|
543
|
+
ifdef LLAMA_METAL_EMBED_LIBRARY
|
|
544
|
+
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
|
|
545
|
+
OBJS += ggml-metal-embed.o
|
|
546
|
+
endif
|
|
529
547
|
endif # LLAMA_METAL
|
|
530
548
|
|
|
531
549
|
ifdef LLAMA_METAL
|
|
532
550
|
ggml-metal.o: ggml-metal.m ggml-metal.h
|
|
533
551
|
$(CC) $(CFLAGS) -c $< -o $@
|
|
552
|
+
|
|
553
|
+
ifdef LLAMA_METAL_EMBED_LIBRARY
|
|
554
|
+
ggml-metal-embed.o: ggml-metal.metal
|
|
555
|
+
@echo "Embedding Metal library"
|
|
556
|
+
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
|
557
|
+
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
|
558
|
+
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
|
559
|
+
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
|
560
|
+
@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
|
|
561
|
+
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
|
562
|
+
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
|
563
|
+
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
|
564
|
+
@rm -f ${TEMP_ASSEMBLY}
|
|
565
|
+
endif
|
|
534
566
|
endif # LLAMA_METAL
|
|
535
567
|
|
|
536
568
|
ifdef LLAMA_MPI
|
|
@@ -542,9 +574,10 @@ GF_CC := $(CC)
|
|
|
542
574
|
include scripts/get-flags.mk
|
|
543
575
|
|
|
544
576
|
# combine build flags with cmdline overrides
|
|
545
|
-
override
|
|
546
|
-
|
|
547
|
-
|
|
577
|
+
override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
|
|
578
|
+
override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
|
|
579
|
+
BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
|
|
580
|
+
override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
|
|
548
581
|
override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
|
|
549
582
|
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
|
550
583
|
|
|
@@ -552,7 +585,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
|
|
552
585
|
ifdef LLAMA_CUBLAS
|
|
553
586
|
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
|
|
554
587
|
include scripts/get-flags.mk
|
|
555
|
-
CUDA_CXXFLAGS := $(GF_CXXFLAGS)
|
|
588
|
+
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
|
556
589
|
endif
|
|
557
590
|
|
|
558
591
|
#
|
|
@@ -571,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
|
|
571
604
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
|
572
605
|
ifdef LLAMA_CUBLAS
|
|
573
606
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
|
574
|
-
CUDA_VERSION := $(shell
|
|
607
|
+
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
|
575
608
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
|
576
609
|
ifndef CUDA_DOCKER_ARCH
|
|
577
610
|
ifndef CUDA_POWER_ARCH
|
|
@@ -633,7 +666,6 @@ lib: llama.o ggml.o $(OBJS)
|
|
|
633
666
|
|
|
634
667
|
clean:
|
|
635
668
|
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
|
636
|
-
# find examples pocs -type f -name "*.o" -delete
|
|
637
669
|
|
|
638
670
|
#
|
|
639
671
|
# Examples
|
|
@@ -697,7 +729,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
|
697
729
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
|
698
730
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
|
699
731
|
|
|
700
|
-
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
|
732
|
+
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
|
701
733
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
|
702
734
|
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
|
703
735
|
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
|
@@ -868,3 +900,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
|
|
|
868
900
|
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
|
869
901
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
|
870
902
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
|
903
|
+
|
|
904
|
+
tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
|
905
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
|
906
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
|
@@ -377,6 +377,9 @@ struct ggml_gallocr {
|
|
|
377
377
|
|
|
378
378
|
struct node_alloc * node_allocs; // [n_nodes]
|
|
379
379
|
int n_nodes;
|
|
380
|
+
|
|
381
|
+
struct tensor_alloc * leaf_allocs; // [n_leafs]
|
|
382
|
+
int n_leafs;
|
|
380
383
|
};
|
|
381
384
|
|
|
382
385
|
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
|
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|
|
427
430
|
free(galloc->buffers);
|
|
428
431
|
free(galloc->buf_tallocs);
|
|
429
432
|
free(galloc->node_allocs);
|
|
433
|
+
free(galloc->leaf_allocs);
|
|
430
434
|
free(galloc);
|
|
431
435
|
}
|
|
432
436
|
|
|
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
|
|
464
468
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
465
469
|
struct ggml_tensor * parent = node->src[i];
|
|
466
470
|
if (parent == NULL) {
|
|
467
|
-
|
|
471
|
+
continue;
|
|
468
472
|
}
|
|
469
473
|
|
|
470
474
|
// if the node's data is external, then we cannot re-use it
|
|
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
544
548
|
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
|
|
545
549
|
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
|
|
546
550
|
|
|
547
|
-
// allocate all graph inputs first to avoid overwriting them
|
|
548
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
|
549
|
-
if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
550
|
-
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
|
|
551
|
-
}
|
|
552
|
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
553
|
-
if (graph->nodes[i]->src[j] == NULL) {
|
|
554
|
-
break;
|
|
555
|
-
}
|
|
556
|
-
if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
557
|
-
ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
|
|
558
|
-
}
|
|
559
|
-
}
|
|
560
|
-
}
|
|
561
|
-
|
|
562
551
|
// count number of children and views
|
|
552
|
+
// allocate all graph inputs and leafs first to avoid overwriting them
|
|
563
553
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
564
554
|
struct ggml_tensor * node = graph->nodes[i];
|
|
565
555
|
|
|
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
568
558
|
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
|
|
569
559
|
}
|
|
570
560
|
|
|
561
|
+
if (node->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
562
|
+
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
|
|
563
|
+
}
|
|
564
|
+
|
|
571
565
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
572
|
-
struct ggml_tensor *
|
|
573
|
-
if (
|
|
574
|
-
|
|
566
|
+
struct ggml_tensor * src = node->src[j];
|
|
567
|
+
if (src == NULL) {
|
|
568
|
+
continue;
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
ggml_gallocr_hash_get(galloc, src)->n_children += 1;
|
|
572
|
+
|
|
573
|
+
// allocate explicit inputs and leafs
|
|
574
|
+
if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
|
|
575
|
+
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
|
|
575
576
|
}
|
|
576
|
-
ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
|
|
577
577
|
}
|
|
578
|
-
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
// allocate the remaining leafs that are unused on the graph
|
|
581
|
+
// these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
|
|
582
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
|
583
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
|
584
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
|
585
|
+
|
|
586
|
+
if (hn->n_children == 0) {
|
|
587
|
+
assert(!hn->allocated);
|
|
588
|
+
// since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
|
|
589
|
+
ggml_gallocr_allocate_node(galloc, leaf, 0);
|
|
590
|
+
}
|
|
591
|
+
}
|
|
579
592
|
|
|
580
593
|
// allocate tensors
|
|
581
594
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
586
599
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
587
600
|
struct ggml_tensor * parent = node->src[j];
|
|
588
601
|
if (parent == NULL) {
|
|
589
|
-
|
|
602
|
+
continue;
|
|
590
603
|
}
|
|
591
604
|
ggml_gallocr_allocate_node(galloc, parent, buffer_id);
|
|
592
605
|
}
|
|
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
598
611
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
599
612
|
struct ggml_tensor * parent = node->src[j];
|
|
600
613
|
if (parent == NULL) {
|
|
601
|
-
|
|
614
|
+
continue;
|
|
602
615
|
}
|
|
603
616
|
AT_PRINTF("%s", parent->name);
|
|
604
617
|
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
|
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
611
624
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
612
625
|
struct ggml_tensor * parent = node->src[j];
|
|
613
626
|
if (parent == NULL) {
|
|
614
|
-
|
|
627
|
+
continue;
|
|
615
628
|
}
|
|
616
629
|
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
|
|
617
630
|
p_hn->n_children -= 1;
|
|
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
696
709
|
}
|
|
697
710
|
}
|
|
698
711
|
}
|
|
712
|
+
if (galloc->n_leafs < graph->n_leafs) {
|
|
713
|
+
free(galloc->leaf_allocs);
|
|
714
|
+
galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
|
|
715
|
+
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
|
716
|
+
}
|
|
717
|
+
galloc->n_leafs = graph->n_leafs;
|
|
718
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
|
719
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
|
720
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
|
721
|
+
galloc->leaf_allocs[i].offset = hn->offset;
|
|
722
|
+
galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
|
723
|
+
}
|
|
699
724
|
|
|
700
725
|
// reallocate buffers if needed
|
|
701
726
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
|
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
|
|
722
747
|
return ggml_gallocr_reserve_n(galloc, graph, NULL);
|
|
723
748
|
}
|
|
724
749
|
|
|
725
|
-
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node,
|
|
726
|
-
assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[
|
|
750
|
+
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
|
|
751
|
+
assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
|
|
727
752
|
|
|
728
753
|
if (node->view_src != NULL) {
|
|
729
754
|
if (node->buffer == NULL) {
|
|
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|
|
732
757
|
// this tensor was allocated without ggml-backend
|
|
733
758
|
return;
|
|
734
759
|
}
|
|
735
|
-
ggml_backend_view_init(galloc->buffers[
|
|
760
|
+
ggml_backend_view_init(galloc->buffers[buffer_id], node);
|
|
736
761
|
}
|
|
737
762
|
} else {
|
|
738
763
|
if (node->data == NULL) {
|
|
739
764
|
assert(tensor_alloc->offset != SIZE_MAX);
|
|
740
|
-
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[
|
|
741
|
-
void * base = ggml_backend_buffer_get_base(galloc->buffers[
|
|
765
|
+
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
|
|
766
|
+
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
|
|
742
767
|
void * addr = (char *)base + tensor_alloc->offset;
|
|
743
|
-
ggml_backend_tensor_alloc(galloc->buffers[
|
|
768
|
+
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
|
|
744
769
|
} else {
|
|
745
770
|
if (node->buffer == NULL) {
|
|
746
771
|
// this tensor was allocated without ggml-backend
|
|
747
772
|
return;
|
|
748
773
|
}
|
|
749
|
-
|
|
750
|
-
#ifndef NDEBUG
|
|
751
|
-
size_t offset =
|
|
752
|
-
(char *)node->data -
|
|
753
|
-
(char *)ggml_backend_buffer_get_base(node->buffer);
|
|
754
|
-
size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
|
|
755
|
-
assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
|
|
756
|
-
assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
|
|
757
|
-
#endif
|
|
758
774
|
}
|
|
759
775
|
}
|
|
760
776
|
}
|
|
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
773
789
|
return true;
|
|
774
790
|
}
|
|
775
791
|
|
|
792
|
+
if (galloc->n_leafs != graph->n_leafs) {
|
|
793
|
+
#ifndef NDEBUG
|
|
794
|
+
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
|
|
795
|
+
#endif
|
|
796
|
+
return true;
|
|
797
|
+
}
|
|
798
|
+
|
|
776
799
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
777
800
|
struct ggml_tensor * node = graph->nodes[i];
|
|
778
801
|
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
|
@@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
787
810
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
788
811
|
struct ggml_tensor * src = node->src[j];
|
|
789
812
|
if (src == NULL) {
|
|
790
|
-
|
|
813
|
+
continue;
|
|
791
814
|
}
|
|
792
815
|
if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
|
|
793
816
|
#ifndef NDEBUG
|
|
@@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
|
827
850
|
}
|
|
828
851
|
|
|
829
852
|
// allocate the graph tensors from the previous assignments
|
|
853
|
+
// nodes
|
|
830
854
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
831
855
|
struct ggml_tensor * node = graph->nodes[i];
|
|
832
856
|
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
|
833
857
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
834
858
|
struct ggml_tensor * src = node->src[j];
|
|
835
859
|
if (src == NULL) {
|
|
836
|
-
|
|
860
|
+
continue;
|
|
837
861
|
}
|
|
838
|
-
ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
|
|
862
|
+
ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
|
|
839
863
|
}
|
|
840
|
-
ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
|
|
864
|
+
ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
|
|
865
|
+
}
|
|
866
|
+
// leafs
|
|
867
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
|
868
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
|
869
|
+
struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
|
870
|
+
ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
|
|
841
871
|
}
|
|
842
872
|
|
|
843
873
|
return true;
|
|
@@ -12,7 +12,6 @@
|
|
|
12
12
|
|
|
13
13
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
14
14
|
|
|
15
|
-
|
|
16
15
|
// backend buffer type
|
|
17
16
|
|
|
18
17
|
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
|
@@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml
|
|
|
159
158
|
|
|
160
159
|
// backend
|
|
161
160
|
|
|
161
|
+
ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
|
|
162
|
+
if (backend == NULL) {
|
|
163
|
+
return NULL;
|
|
164
|
+
}
|
|
165
|
+
return backend->guid;
|
|
166
|
+
}
|
|
167
|
+
|
|
162
168
|
const char * ggml_backend_name(ggml_backend_t backend) {
|
|
163
169
|
if (backend == NULL) {
|
|
164
170
|
return "NULL";
|
|
@@ -219,6 +225,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
|
|
|
219
225
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
220
226
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
221
227
|
|
|
228
|
+
if (!size) {
|
|
229
|
+
return;
|
|
230
|
+
}
|
|
231
|
+
|
|
222
232
|
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
|
|
223
233
|
}
|
|
224
234
|
|
|
@@ -229,6 +239,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
|
|
|
229
239
|
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
|
|
230
240
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
231
241
|
|
|
242
|
+
if (!size) {
|
|
243
|
+
return;
|
|
244
|
+
}
|
|
245
|
+
|
|
232
246
|
tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
|
|
233
247
|
}
|
|
234
248
|
|
|
@@ -748,7 +762,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
|
|
|
748
762
|
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
749
763
|
switch (op->op) {
|
|
750
764
|
case GGML_OP_CPY:
|
|
751
|
-
return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
|
|
765
|
+
return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
|
|
752
766
|
case GGML_OP_MUL_MAT:
|
|
753
767
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
|
754
768
|
default:
|
|
@@ -773,6 +787,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
|
773
787
|
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
|
774
788
|
};
|
|
775
789
|
|
|
790
|
+
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
791
|
+
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
|
792
|
+
return &guid;
|
|
793
|
+
}
|
|
794
|
+
|
|
776
795
|
ggml_backend_t ggml_backend_cpu_init(void) {
|
|
777
796
|
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
|
778
797
|
if (ctx == NULL) {
|
|
@@ -792,6 +811,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
|
792
811
|
}
|
|
793
812
|
|
|
794
813
|
*cpu_backend = (struct ggml_backend) {
|
|
814
|
+
/* .guid = */ ggml_backend_cpu_guid(),
|
|
795
815
|
/* .interface = */ cpu_backend_i,
|
|
796
816
|
/* .context = */ ctx
|
|
797
817
|
};
|
|
@@ -799,7 +819,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
|
799
819
|
}
|
|
800
820
|
|
|
801
821
|
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
|
802
|
-
return backend && backend->
|
|
822
|
+
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
|
803
823
|
}
|
|
804
824
|
|
|
805
825
|
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
@@ -998,6 +1018,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
|
|
|
998
1018
|
}
|
|
999
1019
|
}
|
|
1000
1020
|
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
|
1021
|
+
return -1; // silence warning
|
|
1001
1022
|
}
|
|
1002
1023
|
|
|
1003
1024
|
#if 0
|
|
@@ -1032,7 +1053,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1032
1053
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
1033
1054
|
const struct ggml_tensor * src = tensor->src[i];
|
|
1034
1055
|
if (src == NULL) {
|
|
1035
|
-
|
|
1056
|
+
continue;
|
|
1036
1057
|
}
|
|
1037
1058
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
|
1038
1059
|
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
|
|
@@ -1079,7 +1100,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|
|
1079
1100
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1080
1101
|
struct ggml_tensor * src = node->src[j];
|
|
1081
1102
|
if (src == NULL) {
|
|
1082
|
-
|
|
1103
|
+
continue;
|
|
1083
1104
|
}
|
|
1084
1105
|
ggml_backend_t src_backend = tensor_backend(src);
|
|
1085
1106
|
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
|
@@ -1135,7 +1156,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1135
1156
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1136
1157
|
struct ggml_tensor * src = node->src[j];
|
|
1137
1158
|
if (src == NULL) {
|
|
1138
|
-
|
|
1159
|
+
continue;
|
|
1139
1160
|
}
|
|
1140
1161
|
if (tensor_backend_id(src) == -1) {
|
|
1141
1162
|
tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
|
|
@@ -1247,7 +1268,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1247
1268
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1248
1269
|
struct ggml_tensor * src = node->src[j];
|
|
1249
1270
|
if (src == NULL) {
|
|
1250
|
-
|
|
1271
|
+
continue;
|
|
1251
1272
|
}
|
|
1252
1273
|
int src_backend_id = tensor_backend_id(src);
|
|
1253
1274
|
if (src_backend_id == -1) {
|
|
@@ -1306,7 +1327,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1306
1327
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1307
1328
|
struct ggml_tensor * src = node->src[j];
|
|
1308
1329
|
if (src == NULL) {
|
|
1309
|
-
|
|
1330
|
+
continue;
|
|
1310
1331
|
}
|
|
1311
1332
|
int src_backend_id = tensor_backend_id(src);
|
|
1312
1333
|
assert(src_backend_id != -1); // all inputs should be assigned by now
|
|
@@ -1353,7 +1374,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1353
1374
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1354
1375
|
struct ggml_tensor * src = node->src[j];
|
|
1355
1376
|
if (src == NULL) {
|
|
1356
|
-
|
|
1377
|
+
continue;
|
|
1357
1378
|
}
|
|
1358
1379
|
ggml_backend_t src_backend = tensor_backend(src);
|
|
1359
1380
|
if (src_backend != tensor_backend /* && src_backend != NULL */) {
|
|
@@ -1659,7 +1680,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
|
|
|
1659
1680
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
1660
1681
|
struct ggml_tensor * s = src->src[i];
|
|
1661
1682
|
if (s == NULL) {
|
|
1662
|
-
|
|
1683
|
+
continue;
|
|
1663
1684
|
}
|
|
1664
1685
|
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
|
1665
1686
|
}
|
|
@@ -1688,7 +1709,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|
|
1688
1709
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
1689
1710
|
struct ggml_tensor * s = src->src[i];
|
|
1690
1711
|
if (s == NULL) {
|
|
1691
|
-
|
|
1712
|
+
continue;
|
|
1692
1713
|
}
|
|
1693
1714
|
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
|
1694
1715
|
}
|