llama_cpp 0.12.6 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/sig/llama_cpp.rbs CHANGED
@@ -27,12 +27,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
- LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
+ LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

- LLAMA_KV_OVERRIDE_INT: Integer
- LLAMA_KV_OVERRIDE_FLOAT: Integer
- LLAMA_KV_OVERRIDE_BOOL: Integer
+ LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer

  LLAMA_GRETYPE_END: Integer
  LLAMA_GRETYPE_ALT: Integer
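0.13.0 drops LLAMA_FTYPE_MOSTLY_Q3_K_XS in favour of LLAMA_FTYPE_MOSTLY_IQ3_XS, adds the IQ1_S and IQ4_NL quantization types, and gives the KV-override constants a `_TYPE_` infix. A minimal sketch of quantizing to one of the new ftypes; the `ftype=` accessor on `ModelQuantizeParams` follows earlier releases and is an assumption here:

```ruby
require 'llama_cpp'

# a minimal sketch; ModelQuantizeParams#ftype= mirrors earlier releases (assumption)
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_NL # new in 0.13.0
LLaMACpp.model_quantize(input_path: 'model-f16.gguf', output_path: 'model-iq4_nl.gguf', params: params)
```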
@@ -42,18 +44,23 @@ module LLaMACpp
  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
  LLAMA_GRETYPE_CHAR_ALT: Integer

- LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
- LLAMA_ROPE_SCALING_NONE: Integer
- LLAMA_ROPE_SCALING_LINEAR: Integer
- LLAMA_ROPE_SCALING_YARN: Integer
- LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+ LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+ LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+ LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

- LLAMA_SPLIT_NONE: Integer
- LLAMA_SPLIT_LAYER: Integer
- LLAMA_SPLIT_ROW: Integer
+ LLAMA_POOLING_TYPE_NONE: Integer
+ LLAMA_POOLING_TYPE_MEAN: Integer
+ LLAMA_POOLING_TYPE_CLS: Integer

- def self?.backend_init: (?numa: bool) -> void
+ LLAMA_SPLIT_MODE_NONE: Integer
+ LLAMA_SPLIT_MODE_LAYER: Integer
+ LLAMA_SPLIT_MODE_ROW: Integer
+
+ def self?.backend_init: () -> void
  def self?.backend_free: () -> void
+ def self?.numa_init: (Integer) -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String,
  ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
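`backend_init` no longer takes a `numa:` keyword; NUMA setup moves into the separate `numa_init`, which receives an Integer strategy. A hedged before/after sketch; the strategy value is illustrative, since named strategy constants are not part of this diff:

```ruby
# 0.12.x:
#   LLaMACpp.backend_init(numa: true)

# 0.13.0:
LLaMACpp.backend_init
LLaMACpp.numa_init(1) # Integer NUMA strategy; the value 1 is illustrative only
# ... work with the library ...
LLaMACpp.backend_free
```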
@@ -61,8 +68,6 @@ module LLaMACpp
  ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.time_us: () -> Integer
- def self?.mmap_supported?: () -> bool
- def self?.mlock_supported?: () -> bool
  def self?.max_devices: () -> Integer
  def self?.supports_mmap?: () -> bool
  def self?.supports_mlock?: () -> bool
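The deprecated `mmap_supported?`/`mlock_supported?` predicates are removed; only the `supports_*` forms remain:

```ruby
LLaMACpp.supports_mmap?  # => true or false (was mmap_supported? in 0.12.x)
LLaMACpp.supports_mlock? # => true or false (was mlock_supported? in 0.12.x)
```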
@@ -96,7 +101,8 @@ module LLaMACpp
  def empty?: () -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
- def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
+ def vocab_type: () -> Integer
+ def rope_type: () -> Integer
  def n_vocab: () -> Integer
  def n_ctx_train: () -> Integer
  def n_embd: () -> Integer
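`Model` loses `apply_lora_from_file` and gains two introspection methods. A sketch, assuming the `Model` constructor keeps its 0.12.x keyword form:

```ruby
model = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
model.vocab_type # => Integer vocabulary type id
model.rope_type  # => Integer RoPE type id
```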
@@ -195,8 +201,6 @@ module LLaMACpp
  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
  def embeddings: () -> Array[Float]
  def embeddings_ith: (Integer) -> Array[Float]
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
- def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
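With `eval` and `eval_embd` removed, token evaluation goes exclusively through `decode` with a `::LLaMACpp::Batch`. A rough sketch; the `Batch.new` keywords follow the 0.12.x binding and are an assumption, since `Batch` itself is untouched by this diff:

```ruby
# rough sketch; batch construction/filling keywords are assumed from 0.12.x
tokens = [1, 2, 3] # token ids produced by your tokenizer
batch = LLaMACpp::Batch.new(max_n_token: tokens.size, n_embd: 0, max_n_seq: 1)
# ... fill the batch with token ids, positions, sequence ids, and a logits flag ...
context.decode(batch) # context: an existing ::LLaMACpp::Context
logits = context.logits
```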
@@ -209,14 +213,16 @@ module LLaMACpp
  def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
  def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_keep: (Integer) -> void
- def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_pos_max: (Integer) -> Integer
+ def kv_cache_defrag: () -> void
+ def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
  def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
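`kv_cache_seq_shift` is renamed to `kv_cache_seq_add` (the last argument is a position delta), and the cache gains explicit defragmentation hooks. A sketch of dropping the first `n` positions of sequence 0 and compacting, following the llama.cpp C API semantics where `p0`/`p1` bound a half-open position range and -1 means unbounded (that reading is an assumption; only the Integer signatures appear in this diff):

```ruby
n = 32                                  # number of leading positions to drop (example value)
context.kv_cache_seq_rm(0, 0, n)        # remove positions [0, n) of sequence 0
context.kv_cache_seq_add(0, n, -1, -n)  # shift remaining positions back by n (was kv_cache_seq_shift)
context.kv_cache_defrag                 # mark the cache for defragmentation
context.kv_cache_update                 # apply pending shifts/defragmentation
```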
@@ -225,7 +231,6 @@ module LLaMACpp
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
  def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
  def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -263,12 +268,12 @@ module LLaMACpp
  def yarn_beta_slow: () -> Float
  def yarn_orig_ctx=: (Integer) -> Integer
  def yarn_orig_ctx: () -> Integer
+ def defrag_thold=: (Float) -> Float
+ def defrag_thold: () -> Float
  def type_k=: (Integer) -> Integer
  def type_k: () -> Integer
  def type_v=: (Integer) -> Integer
  def type_v: () -> Integer
- def mul_mat_q: () -> bool
- def mul_mat_q=: (bool) -> bool
  def logits_all: () -> bool
  def logits_all=: (bool) -> bool
  def embedding: () -> bool
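`ContextParams` drops the obsolete `mul_mat_q` toggle and gains `defrag_thold`, which controls automatic KV-cache defragmentation. A sketch; the threshold semantics follow the upstream llama.cpp default, where a negative value disables automatic defragmentation (an assumption, not stated in this diff):

```ruby
params = LLaMACpp::ContextParams.new
params.defrag_thold = 0.1 # defragment when fragmentation exceeds ~10%; negative disables (assumed from llama.cpp)
```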
Makefile CHANGED
@@ -97,9 +97,10 @@ endif
  #

  # keep standard at C11 and C++11
- MK_CPPFLAGS = -I. -Icommon
- MK_CFLAGS = -std=c11 -fPIC
- MK_CXXFLAGS = -std=c++11 -fPIC
+ MK_CPPFLAGS = -I. -Icommon
+ MK_CFLAGS = -std=c11 -fPIC
+ MK_CXXFLAGS = -std=c++11 -fPIC
+ MK_NVCCFLAGS = -std=c++11

  # -Ofast tends to produce faster code, but may not be available for some compilers.
  ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
  MK_LDFLAGS += -g

  ifeq ($(UNAME_S),Linux)
- MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+ MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
  endif
  else
  MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
  -Werror=implicit-function-declaration
  MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

+ ifeq ($(LLAMA_FATAL_WARNINGS),1)
+ MK_CFLAGS += -Werror
+ MK_CXXFLAGS += -Werror
+ endif
+
  # this version of Apple ld64 is buggy
  ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
  MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -377,10 +383,18 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+ ifneq ('', '$(wildcard /opt/cuda)')
+ CUDA_PATH ?= /opt/cuda
+ else
+ CUDA_PATH ?= /usr/local/cuda
+ endif
+ MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  MK_NVCCFLAGS += -use_fast_math
+ ifdef LLAMA_FATAL_WARNINGS
+ MK_NVCCFLAGS += -Werror all-warnings
+ endif # LLAMA_FATAL_WARNINGS
  ifndef JETSON_EOL_MODULE_DETECT
  MK_NVCCFLAGS += --forward-unknown-to-host-compiler
  endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +453,9 @@ ifdef LLAMA_CUDA_CCBIN
  endif
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
  ifdef JETSON_EOL_MODULE_DETECT
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
  else
- $(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
  endif # JETSON_EOL_MODULE_DETECT
  endif # LLAMA_CUBLAS

@@ -526,11 +540,29 @@ ifdef LLAMA_METAL
  ifdef LLAMA_METAL_NDEBUG
  MK_CPPFLAGS += -DGGML_METAL_NDEBUG
  endif
+ ifdef LLAMA_METAL_EMBED_LIBRARY
+ MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+ OBJS += ggml-metal-embed.o
+ endif
  endif # LLAMA_METAL

  ifdef LLAMA_METAL
  ggml-metal.o: ggml-metal.m ggml-metal.h
  $(CC) $(CFLAGS) -c $< -o $@
+
+ ifdef LLAMA_METAL_EMBED_LIBRARY
+ ggml-metal-embed.o: ggml-metal.metal
+ @echo "Embedding Metal library"
+ $(eval TEMP_ASSEMBLY=$(shell mktemp))
+ @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+ @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+ @$(AS) $(TEMP_ASSEMBLY) -o $@
+ @rm -f ${TEMP_ASSEMBLY}
+ endif
  endif # LLAMA_METAL

  ifdef LLAMA_MPI
@@ -542,9 +574,10 @@ GF_CC := $(CC)
  include scripts/get-flags.mk

  # combine build flags with cmdline overrides
- override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
- BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
- override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+ override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+ override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+ BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+ override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
  override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

@@ -552,7 +585,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
  ifdef LLAMA_CUBLAS
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
  include scripts/get-flags.mk
- CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+ CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
  endif

  #
@@ -571,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
  ifdef LLAMA_CUBLAS
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
- CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH
@@ -633,7 +666,6 @@ lib: llama.o ggml.o $(OBJS)

  clean:
  rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
- # find examples pocs -type f -name "*.o" -delete

  #
  # Examples
@@ -697,7 +729,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
  $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -868,3 +900,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
  tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+ tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
ggml-alloc.c CHANGED
@@ -377,6 +377,9 @@ struct ggml_gallocr {

  struct node_alloc * node_allocs; // [n_nodes]
  int n_nodes;
+
+ struct tensor_alloc * leaf_allocs; // [n_leafs]
+ int n_leafs;
  };

  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
  free(galloc->buffers);
  free(galloc->buf_tallocs);
  free(galloc->node_allocs);
+ free(galloc->leaf_allocs);
  free(galloc);
  }

@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  struct ggml_tensor * parent = node->src[i];
  if (parent == NULL) {
- break;
+ continue;
  }

  // if the node's data is external, then we cannot re-use it
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
  memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));

- // allocate all graph inputs first to avoid overwriting them
- for (int i = 0; i < graph->n_nodes; i++) {
- if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
- }
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- if (graph->nodes[i]->src[j] == NULL) {
- break;
- }
- if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
- }
- }
- }
-
  // count number of children and views
+ // allocate all graph inputs and leafs first to avoid overwriting them
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];

@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
  }

+ if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+ }
+
  for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+
+ ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+ // allocate explicit inputs and leafs
+ if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+ ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
  }
- ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
  }
- }
+ }
+
+ // allocate the remaining leafs that are unused on the graph
+ // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+ if (hn->n_children == 0) {
+ assert(!hn->allocated);
+ // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+ ggml_gallocr_allocate_node(galloc, leaf, 0);
+ }
+ }

  // allocate tensors
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * parent = node->src[j];
  if (parent == NULL) {
- break;
+ continue;
  }
  ggml_gallocr_allocate_node(galloc, parent, buffer_id);
  }
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * parent = node->src[j];
  if (parent == NULL) {
- break;
+ continue;
  }
  AT_PRINTF("%s", parent->name);
  if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * parent = node->src[j];
  if (parent == NULL) {
- break;
+ continue;
  }
  struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
  p_hn->n_children -= 1;
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }
  }
  }
+ if (galloc->n_leafs < graph->n_leafs) {
+ free(galloc->leaf_allocs);
+ galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+ GGML_ASSERT(galloc->leaf_allocs != NULL);
+ }
+ galloc->n_leafs = graph->n_leafs;
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+ galloc->leaf_allocs[i].offset = hn->offset;
+ galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ }

  // reallocate buffers if needed
  for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
  return ggml_gallocr_reserve_n(galloc, graph, NULL);
  }

- static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
- assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);

  if (node->view_src != NULL) {
  if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
  // this tensor was allocated without ggml-backend
  return;
  }
- ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+ ggml_backend_view_init(galloc->buffers[buffer_id], node);
  }
  } else {
  if (node->data == NULL) {
  assert(tensor_alloc->offset != SIZE_MAX);
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
- void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
  void * addr = (char *)base + tensor_alloc->offset;
- ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
  } else {
  if (node->buffer == NULL) {
  // this tensor was allocated without ggml-backend
  return;
  }
-
- #ifndef NDEBUG
- size_t offset =
- (char *)node->data -
- (char *)ggml_backend_buffer_get_base(node->buffer);
- size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
- assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
- assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
- #endif
  }
  }
  }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
  return true;
  }

+ if (galloc->n_leafs != graph->n_leafs) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+ #endif
+ return true;
+ }
+
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
  if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
  #ifndef NDEBUG
@@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
  }

  // allocate the graph tensors from the previous assignments
+ // nodes
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
- ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+ ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
  }
- ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+ ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+ }
+ // leafs
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+ ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
  }

  return true;
ggml-backend-impl.h CHANGED
@@ -104,6 +104,8 @@ extern "C" {
  };

  struct ggml_backend {
+ ggml_guid_t guid;
+
  struct ggml_backend_i iface;

  ggml_backend_context_t context;
ggml-backend.c CHANGED
@@ -12,7 +12,6 @@

  #define MAX(a, b) ((a) > (b) ? (a) : (b))

-
  // backend buffer type

  const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml

  // backend

+ ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
+ if (backend == NULL) {
+ return NULL;
+ }
+ return backend->guid;
+ }
+
  const char * ggml_backend_name(ggml_backend_t backend) {
  if (backend == NULL) {
  return "NULL";
@@ -219,6 +225,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

+ if (!size) {
+ return;
+ }
+
  tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
  }

@@ -229,6 +239,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

+ if (!size) {
+ return;
+ }
+
  tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
  }

@@ -748,7 +762,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_CPY:
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+ return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
  case GGML_OP_MUL_MAT:
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
  default:
@@ -773,6 +787,11 @@ static struct ggml_backend_i cpu_backend_i = {
  /* .supports_op = */ ggml_backend_cpu_supports_op,
  };

+ static ggml_guid_t ggml_backend_cpu_guid(void) {
+ static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+ return &guid;
+ }
+
  ggml_backend_t ggml_backend_cpu_init(void) {
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
  if (ctx == NULL) {
@@ -792,6 +811,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
  }

  *cpu_backend = (struct ggml_backend) {
+ /* .guid = */ ggml_backend_cpu_guid(),
  /* .interface = */ cpu_backend_i,
  /* .context = */ ctx
  };
@@ -799,7 +819,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
  }

  GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
- return backend && backend->iface.get_name == ggml_backend_cpu_name;
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
  }

  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
@@ -998,6 +1018,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
  }
  }
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
+ return -1; // silence warning
  }

  #if 0
@@ -1032,7 +1053,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  const struct ggml_tensor * src = tensor->src[i];
  if (src == NULL) {
- break;
+ continue;
  }
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
@@ -1079,7 +1100,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
  ggml_backend_t src_backend = tensor_backend(src);
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
@@ -1135,7 +1156,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
  if (tensor_backend_id(src) == -1) {
  tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
@@ -1247,7 +1268,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
  int src_backend_id = tensor_backend_id(src);
  if (src_backend_id == -1) {
@@ -1306,7 +1327,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
  int src_backend_id = tensor_backend_id(src);
  assert(src_backend_id != -1); // all inputs should be assigned by now
@@ -1353,7 +1374,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
  ggml_backend_t src_backend = tensor_backend(src);
  if (src_backend != tensor_backend /* && src_backend != NULL */) {
@@ -1659,7 +1680,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  struct ggml_tensor * s = src->src[i];
  if (s == NULL) {
- break;
+ continue;
  }
  dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
  }
@@ -1688,7 +1709,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  struct ggml_tensor * s = src->src[i];
  if (s == NULL) {
- break;
+ continue;
  }
  graph_copy_init_tensor(hash_set, node_copies, node_init, s);
  }