llama_cpp 0.12.6 → 0.12.7

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 296b29b7d20c7bfd66f69749ccd41e63d6998589af0d3514db8f6c08011d545f
-  data.tar.gz: 48f8787a63759a95049bbc515f4b35c74d07b356f1635d751d8d9d852e386c5a
+  metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
+  data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
 SHA512:
-  metadata.gz: 5cd4c284a31fcdd36565b481c2456545eaf3fe19fda3778121f26f529ca01d18a894ba73739d966dc29f5aa239f8784ed56801bac5db3d21ae13e5b5aa2b4012
-  data.tar.gz: 7d03f1d081d097913fe3489a0432a5869a13e0a0371458c6c4d6cdea7296422a5af51c13ae05ea0d752e068865cc99e52ee0c4f3d67de892003c76e9126d5940
+  metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
+  data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
+
+- Bump bundled llama.cpp from b2143 to b2249.
+- Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
+- Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
+- Add `numa_init` module function to `LLaMACpp`.
+- Remove unnecessary argument from `backend_init`.
+
+Implementation of the `llama_chat_apply_template` binding has been postponed for the time being.
+
 ## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
 
 - Bump bundled llama.cpp from b2106 to b2143.
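For reference, a minimal C sketch of the upstream API change behind the last two bullets, written against the bundled llama.cpp headers; the `GGML_NUMA_STRATEGY_DISABLED` value and the surrounding `main` are illustrative, not part of this gem:

```c
#include "llama.h"

int main(void) {
    // Up to b2143 this took a bool: llama_backend_init(bool numa).
    // From b2249 the NUMA flag is gone and NUMA setup is a separate call.
    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED); // enum ggml_numa_strategy from ggml.h

    // ... load a model and run inference here ...

    llama_backend_free();
    return 0;
}
```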
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -3243,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
-static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
-  VALUE kw_args = Qnil;
-  ID kw_table[1] = { rb_intern("numa") };
-  VALUE kw_values[1] = { Qundef };
-  rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
-
-  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
-  llama_backend_init(numa);
+static VALUE rb_llama_llama_backend_init(VALUE self) {
+  llama_backend_init();
 
   return Qnil;
 }
@@ -3262,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
   return Qnil;
 }
 
+static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
+  if (!RB_INTEGER_TYPE_P(strategy)) {
+    rb_raise(rb_eArgError, "strategy must be an integer");
+    return Qnil;
+  }
+
+  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
+
+  return Qnil;
+}
+
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
   ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
@@ -3345,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
   RbLLaMAGrammar::define_class(rb_mLLaMACpp);
 
-  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
+  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
   rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
+  rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
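An aside on the arity values above: in the CRuby C API, the final argument of `rb_define_module_function` selects the calling convention, which is why dropping the keyword argument lets `backend_init` move from -1 to 0 while `numa_init` registers with 1. A minimal self-contained sketch with a hypothetical `Demo` module:

```c
#include <ruby.h>

static VALUE demo_no_arg(VALUE self) { return Qnil; }            // arity  0
static VALUE demo_one_arg(VALUE self, VALUE arg) { return arg; } // arity  1
static VALUE demo_var_args(int argc, VALUE* argv, VALUE self) {  // arity -1
    return INT2NUM(argc); // receives raw argc/argv, parsed manually
}

void Init_demo(void) {
    VALUE mDemo = rb_define_module("Demo");
    rb_define_module_function(mDemo, "no_arg", demo_no_arg, 0);
    rb_define_module_function(mDemo, "one_arg", demo_one_arg, 1);
    rb_define_module_function(mDemo, "var_args", demo_var_args, -1);
}
```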
@@ -3391,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3412,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.6'
+  VERSION = '0.12.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2143'
+  LLAMA_CPP_VERSION = 'b2249'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -29,6 +29,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -48,12 +50,17 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer
 
+  LLAMA_POOLING_NONE: Integer
+  LLAMA_POOLING_MEAN: Integer
+  LLAMA_POOLING_CLS: Integer
+
   LLAMA_SPLIT_NONE: Integer
   LLAMA_SPLIT_LAYER: Integer
   LLAMA_SPLIT_ROW: Integer
 
-  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_init: () -> void
   def self?.backend_free: () -> void
+  def self?.numa_init: (Integer) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
data/ext/llama_cpp/src/Makefile CHANGED
@@ -97,9 +97,10 @@ endif
 #
 
 # keep standard at C11 and C++11
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CPPFLAGS  = -I. -Icommon
+MK_CFLAGS    = -std=c11   -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
 MK_LDFLAGS += -g
 
 ifeq ($(UNAME_S),Linux)
-MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 endif
 else
 MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
               -Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+	MK_CFLAGS   += -Werror
+	MK_CXXFLAGS += -Werror
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
 MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 OBJS += ggml-cuda.o
 MK_NVCCFLAGS += -use_fast_math
+ifdef LLAMA_FATAL_WARNINGS
+	MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
 
@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
+ifdef LLAMA_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+	OBJS        += ggml-metal-embed.o
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef LLAMA_METAL_EMBED_LIBRARY
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
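The recipe above wraps the raw `.metal` source between `_ggml_metallib_start` and `_ggml_metallib_end` symbols via `.incbin`. A hedged sketch of how such symbols are typically consumed from C (Mach-O prepends the underscore, so plain identifiers are used; the consumer function is hypothetical):

```c
#include <stddef.h>

// Symbols defined by the generated assembly above.
extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[];

static void load_embedded_metallib(void) {
    const char * source = ggml_metallib_start;
    size_t size = (size_t)(ggml_metallib_end - ggml_metallib_start);
    // hand source/size to the Metal shader compiler instead of reading
    // ggml-metal.metal from disk at runtime
    (void)source; (void)size;
}
```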
@@ -542,9 +569,10 @@ GF_CC := $(CC)
 include scripts/get-flags.mk
 
 # combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 
@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
 #
@@ -633,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)
 
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-	# find examples pocs -type f -name "*.o" -delete
 
 #
 # Examples
@@ -697,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -868,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/ext/llama_cpp/src/ggml-alloc.c CHANGED
@@ -377,6 +377,9 @@ struct ggml_gallocr {
 
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }
 
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * parent = node->src[i];
         if (parent == NULL) {
-            break;
+            continue;
         }
 
         // if the node's data is external, then we cannot re-use it
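This `break`-to-`continue` change recurs throughout ggml-alloc.c and ggml-backend.c below: a tensor's `src` array may contain NULL gaps between valid entries, so stopping at the first NULL would silently skip later sources. A self-contained sketch of the difference (the `count_sources` helper is hypothetical):

```c
#include <stddef.h>

#define MAX_SRC 4

static int count_sources(void * const src[MAX_SRC]) {
    int n = 0;
    for (int j = 0; j < MAX_SRC; j++) {
        if (src[j] == NULL) {
            continue; // treat NULL as a gap; later entries may still be set
        }
        n++;
    }
    return n;
}

// For {a, NULL, b, NULL} this returns 2; a `break` in place of the
// `continue` would stop at the first gap and return 1.
```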
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
-    // allocate all graph inputs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == NULL) {
-                break;
-            }
-            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-
     // count number of children and views
+    // allocate all graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
 
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                break;
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
             }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            }
         }
     }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }
 
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             ggml_gallocr_allocate_node(galloc, parent, buffer_id);
         }
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             AT_PRINTF("%s", parent->name);
             if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
             p_hn->n_children -= 1;
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
 
     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
         }
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-
-#ifndef NDEBUG
-            size_t offset =
-                (char *)node->data -
-                (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }
 
+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
 #ifndef NDEBUG
@@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }
 
     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
     }
 
     return true;
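For orientation, a usage sketch of the allocator API these hunks extend, assuming `gf` is a graph already built in a ggml context (construction omitted); with this release, `ggml_gallocr_alloc_graph` also places leaf tensors:

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

void allocate_graph(struct ggml_cgraph * gf) {
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_reserve(galloc, gf);     // size and (re)allocate backend buffers
    ggml_gallocr_alloc_graph(galloc, gf); // assign addresses to nodes and leafs
    // ... run the graph ...
    ggml_gallocr_free(galloc);            // also frees the new leaf_allocs array
}
```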
data/ext/llama_cpp/src/ggml-backend.c CHANGED
@@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
@@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
     GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
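Both guards turn zero-byte transfers into explicit no-ops before the backend's `set_tensor`/`get_tensor` callbacks run. A hedged sketch of the calling pattern this makes safe, assuming `t` is a tensor living in a backend buffer:

```c
float buf[1] = { 0.0f };
ggml_backend_tensor_set(t, buf, /*offset=*/0, /*size=*/0); // returns immediately
ggml_backend_tensor_get(t, buf, /*offset=*/0, /*size=*/0); // returns immediately
```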
@@ -748,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -998,6 +1006,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
+    return -1; // silence warning
 }
 
 #if 0
@@ -1032,7 +1041,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
-            break;
+            continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
@@ -1079,7 +1088,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
@@ -1135,7 +1144,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (tensor_backend_id(src) == -1) {
                 tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
@@ -1247,7 +1256,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             int src_backend_id = tensor_backend_id(src);
             if (src_backend_id == -1) {
@@ -1306,7 +1315,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
@@ -1353,7 +1362,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
@@ -1659,7 +1668,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
     }
@@ -1688,7 +1697,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         graph_copy_init_tensor(hash_set, node_copies, node_init, s);
     }