llama_cpp 0.12.6 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 296b29b7d20c7bfd66f69749ccd41e63d6998589af0d3514db8f6c08011d545f
4
- data.tar.gz: 48f8787a63759a95049bbc515f4b35c74d07b356f1635d751d8d9d852e386c5a
3
+ metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
4
+ data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
5
5
  SHA512:
6
- metadata.gz: 5cd4c284a31fcdd36565b481c2456545eaf3fe19fda3778121f26f529ca01d18a894ba73739d966dc29f5aa239f8784ed56801bac5db3d21ae13e5b5aa2b4012
7
- data.tar.gz: 7d03f1d081d097913fe3489a0432a5869a13e0a0371458c6c4d6cdea7296422a5af51c13ae05ea0d752e068865cc99e52ee0c4f3d67de892003c76e9126d5940
6
+ metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
7
+ data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
2
+
3
+ - Bump bundled llama.cpp from b2143 to b2249.
4
+ - Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
5
+ - Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
6
+ - Add `numa_init` module function to `LLaMACpp`.
7
+ - Remove unnecessary argument from `backend_init`.
8
+
9
+ Implementation of llama_chat_apply_template binding has been postponed for the time being.
10
+
1
11
  ## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
2
12
 
3
13
  - Bump bundled llama.cpp from b2106 to b2143.
@@ -3243,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
3243
3243
 
3244
3244
  // module functions
3245
3245
 
3246
- static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
3247
- VALUE kw_args = Qnil;
3248
- ID kw_table[1] = { rb_intern("numa") };
3249
- VALUE kw_values[1] = { Qundef };
3250
- rb_scan_args(argc, argv, ":", &kw_args);
3251
- rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
3252
-
3253
- const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
3254
- llama_backend_init(numa);
3246
+ static VALUE rb_llama_llama_backend_init(VALUE self) {
3247
+ llama_backend_init();
3255
3248
 
3256
3249
  return Qnil;
3257
3250
  }
@@ -3262,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
3262
3255
  return Qnil;
3263
3256
  }
3264
3257
 
3258
+ static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
3259
+ if (!RB_INTEGER_TYPE_P(strategy)) {
3260
+ rb_raise(rb_eArgError, "strategy must be an integer");
3261
+ return Qnil;
3262
+ }
3263
+
3264
+ llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
3265
+
3266
+ return Qnil;
3267
+ }
3268
+
3265
3269
  static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
3266
3270
  VALUE kw_args = Qnil;
3267
3271
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
@@ -3345,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
3345
3349
  RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
3346
3350
  RbLLaMAGrammar::define_class(rb_mLLaMACpp);
3347
3351
 
3348
- rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
3352
+ rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
3349
3353
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
3354
+ rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
3350
3355
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
3351
3356
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
3352
3357
  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
@@ -3391,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
3391
3396
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
3392
3397
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
3393
3398
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
3399
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
3400
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
3394
3401
 
3395
3402
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
3396
3403
 
@@ -3412,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
3412
3419
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
3413
3420
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
3414
3421
 
3422
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
3423
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
3424
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
3425
+
3415
3426
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
3416
3427
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
3417
3428
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.12.6'
6
+ VERSION = '0.12.7'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b2143'
9
+ LLAMA_CPP_VERSION = 'b2249'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -29,6 +29,8 @@ module LLaMACpp
29
29
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
30
30
  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
31
31
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
32
+ LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
33
+ LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
32
34
 
33
35
  LLAMA_KV_OVERRIDE_INT: Integer
34
36
  LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -48,12 +50,17 @@ module LLaMACpp
48
50
  LLAMA_ROPE_SCALING_YARN: Integer
49
51
  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
50
52
 
53
+ LLAMA_POOLING_NONE: Integer
54
+ LLAMA_POOLING_MEAN: Integer
55
+ LLAMA_POOLING_CLS: Integer
56
+
51
57
  LLAMA_SPLIT_NONE: Integer
52
58
  LLAMA_SPLIT_LAYER: Integer
53
59
  LLAMA_SPLIT_ROW: Integer
54
60
 
55
- def self?.backend_init: (?numa: bool) -> void
61
+ def self?.backend_init: () -> void
56
62
  def self?.backend_free: () -> void
63
+ def self?.numa_init: (Integer) -> void
57
64
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
58
65
  def self?.generate: (::LLaMACpp::Context, String,
59
66
  ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -97,9 +97,10 @@ endif
97
97
  #
98
98
 
99
99
  # keep standard at C11 and C++11
100
- MK_CPPFLAGS = -I. -Icommon
101
- MK_CFLAGS = -std=c11 -fPIC
102
- MK_CXXFLAGS = -std=c++11 -fPIC
100
+ MK_CPPFLAGS = -I. -Icommon
101
+ MK_CFLAGS = -std=c11 -fPIC
102
+ MK_CXXFLAGS = -std=c++11 -fPIC
103
+ MK_NVCCFLAGS = -std=c++11
103
104
 
104
105
  # -Ofast tends to produce faster code, but may not be available for some compilers.
105
106
  ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
172
173
  MK_LDFLAGS += -g
173
174
 
174
175
  ifeq ($(UNAME_S),Linux)
175
- MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
176
+ MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
176
177
  endif
177
178
  else
178
179
  MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
215
216
  -Werror=implicit-function-declaration
216
217
  MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
217
218
 
219
+ ifeq ($(LLAMA_FATAL_WARNINGS),1)
220
+ MK_CFLAGS += -Werror
221
+ MK_CXXFLAGS += -Werror
222
+ endif
223
+
218
224
  # this version of Apple ld64 is buggy
219
225
  ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
220
226
  MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
381
387
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
382
388
  OBJS += ggml-cuda.o
383
389
  MK_NVCCFLAGS += -use_fast_math
390
+ ifdef LLAMA_FATAL_WARNINGS
391
+ MK_NVCCFLAGS += -Werror all-warnings
392
+ endif # LLAMA_FATAL_WARNINGS
384
393
  ifndef JETSON_EOL_MODULE_DETECT
385
394
  MK_NVCCFLAGS += --forward-unknown-to-host-compiler
386
395
  endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
439
448
  endif
440
449
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
441
450
  ifdef JETSON_EOL_MODULE_DETECT
442
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
451
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
443
452
  else
444
- $(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
453
+ $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
445
454
  endif # JETSON_EOL_MODULE_DETECT
446
455
  endif # LLAMA_CUBLAS
447
456
 
@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
526
535
  ifdef LLAMA_METAL_NDEBUG
527
536
  MK_CPPFLAGS += -DGGML_METAL_NDEBUG
528
537
  endif
538
+ ifdef LLAMA_METAL_EMBED_LIBRARY
539
+ MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
540
+ OBJS += ggml-metal-embed.o
541
+ endif
529
542
  endif # LLAMA_METAL
530
543
 
531
544
  ifdef LLAMA_METAL
532
545
  ggml-metal.o: ggml-metal.m ggml-metal.h
533
546
  $(CC) $(CFLAGS) -c $< -o $@
547
+
548
+ ifdef LLAMA_METAL_EMBED_LIBRARY
549
+ ggml-metal-embed.o: ggml-metal.metal
550
+ @echo "Embedding Metal library"
551
+ $(eval TEMP_ASSEMBLY=$(shell mktemp))
552
+ @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
553
+ @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
554
+ @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
555
+ @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
556
+ @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
557
+ @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
558
+ @$(AS) $(TEMP_ASSEMBLY) -o $@
559
+ @rm -f ${TEMP_ASSEMBLY}
560
+ endif
534
561
  endif # LLAMA_METAL
535
562
 
536
563
  ifdef LLAMA_MPI
@@ -542,9 +569,10 @@ GF_CC := $(CC)
542
569
  include scripts/get-flags.mk
543
570
 
544
571
  # combine build flags with cmdline overrides
545
- override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
546
- BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
547
- override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
572
+ override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
573
+ override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
574
+ BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
575
+ override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
548
576
  override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
549
577
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
550
578
 
@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
552
580
  ifdef LLAMA_CUBLAS
553
581
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
554
582
  include scripts/get-flags.mk
555
- CUDA_CXXFLAGS := $(GF_CXXFLAGS)
583
+ CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
556
584
  endif
557
585
 
558
586
  #
@@ -633,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)
633
661
 
634
662
  clean:
635
663
  rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
636
- # find examples pocs -type f -name "*.o" -delete
637
664
 
638
665
  #
639
666
  # Examples
@@ -697,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
697
724
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
698
725
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
699
726
 
700
- server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
727
+ server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
701
728
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
702
729
  $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
703
730
  $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -868,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
868
895
  tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
869
896
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
870
897
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
898
+
899
+ tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
900
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
901
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -377,6 +377,9 @@ struct ggml_gallocr {
377
377
 
378
378
  struct node_alloc * node_allocs; // [n_nodes]
379
379
  int n_nodes;
380
+
381
+ struct tensor_alloc * leaf_allocs; // [n_leafs]
382
+ int n_leafs;
380
383
  };
381
384
 
382
385
  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
427
430
  free(galloc->buffers);
428
431
  free(galloc->buf_tallocs);
429
432
  free(galloc->node_allocs);
433
+ free(galloc->leaf_allocs);
430
434
  free(galloc);
431
435
  }
432
436
 
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
464
468
  for (int i = 0; i < GGML_MAX_SRC; i++) {
465
469
  struct ggml_tensor * parent = node->src[i];
466
470
  if (parent == NULL) {
467
- break;
471
+ continue;
468
472
  }
469
473
 
470
474
  // if the node's data is external, then we cannot re-use it
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
544
548
  memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
545
549
  memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
546
550
 
547
- // allocate all graph inputs first to avoid overwriting them
548
- for (int i = 0; i < graph->n_nodes; i++) {
549
- if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
550
- ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
551
- }
552
- for (int j = 0; j < GGML_MAX_SRC; j++) {
553
- if (graph->nodes[i]->src[j] == NULL) {
554
- break;
555
- }
556
- if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
557
- ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
558
- }
559
- }
560
- }
561
-
562
551
  // count number of children and views
552
+ // allocate all graph inputs and leafs first to avoid overwriting them
563
553
  for (int i = 0; i < graph->n_nodes; i++) {
564
554
  struct ggml_tensor * node = graph->nodes[i];
565
555
 
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
568
558
  ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
569
559
  }
570
560
 
561
+ if (node->flags & GGML_TENSOR_FLAG_INPUT) {
562
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
563
+ }
564
+
571
565
  for (int j = 0; j < GGML_MAX_SRC; j++) {
572
- struct ggml_tensor * parent = node->src[j];
573
- if (parent == NULL) {
574
- break;
566
+ struct ggml_tensor * src = node->src[j];
567
+ if (src == NULL) {
568
+ continue;
569
+ }
570
+
571
+ ggml_gallocr_hash_get(galloc, src)->n_children += 1;
572
+
573
+ // allocate explicit inputs and leafs
574
+ if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
575
+ ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
575
576
  }
576
- ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
577
577
  }
578
- }
578
+ }
579
+
580
+ // allocate the remaining leafs that are unused on the graph
581
+ // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
582
+ for (int i = 0; i < graph->n_leafs; i++) {
583
+ struct ggml_tensor * leaf = graph->leafs[i];
584
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
585
+
586
+ if (hn->n_children == 0) {
587
+ assert(!hn->allocated);
588
+ // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
589
+ ggml_gallocr_allocate_node(galloc, leaf, 0);
590
+ }
591
+ }
579
592
 
580
593
  // allocate tensors
581
594
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
586
599
  for (int j = 0; j < GGML_MAX_SRC; j++) {
587
600
  struct ggml_tensor * parent = node->src[j];
588
601
  if (parent == NULL) {
589
- break;
602
+ continue;
590
603
  }
591
604
  ggml_gallocr_allocate_node(galloc, parent, buffer_id);
592
605
  }
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
598
611
  for (int j = 0; j < GGML_MAX_SRC; j++) {
599
612
  struct ggml_tensor * parent = node->src[j];
600
613
  if (parent == NULL) {
601
- break;
614
+ continue;
602
615
  }
603
616
  AT_PRINTF("%s", parent->name);
604
617
  if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
611
624
  for (int j = 0; j < GGML_MAX_SRC; j++) {
612
625
  struct ggml_tensor * parent = node->src[j];
613
626
  if (parent == NULL) {
614
- break;
627
+ continue;
615
628
  }
616
629
  struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
617
630
  p_hn->n_children -= 1;
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
696
709
  }
697
710
  }
698
711
  }
712
+ if (galloc->n_leafs < graph->n_leafs) {
713
+ free(galloc->leaf_allocs);
714
+ galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
715
+ GGML_ASSERT(galloc->leaf_allocs != NULL);
716
+ }
717
+ galloc->n_leafs = graph->n_leafs;
718
+ for (int i = 0; i < graph->n_leafs; i++) {
719
+ struct ggml_tensor * leaf = graph->leafs[i];
720
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
721
+ galloc->leaf_allocs[i].offset = hn->offset;
722
+ galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
723
+ }
699
724
 
700
725
  // reallocate buffers if needed
701
726
  for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
722
747
  return ggml_gallocr_reserve_n(galloc, graph, NULL);
723
748
  }
724
749
 
725
- static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
726
- assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
750
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
751
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
727
752
 
728
753
  if (node->view_src != NULL) {
729
754
  if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
732
757
  // this tensor was allocated without ggml-backend
733
758
  return;
734
759
  }
735
- ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
760
+ ggml_backend_view_init(galloc->buffers[buffer_id], node);
736
761
  }
737
762
  } else {
738
763
  if (node->data == NULL) {
739
764
  assert(tensor_alloc->offset != SIZE_MAX);
740
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
741
- void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
765
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
766
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
742
767
  void * addr = (char *)base + tensor_alloc->offset;
743
- ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
768
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
744
769
  } else {
745
770
  if (node->buffer == NULL) {
746
771
  // this tensor was allocated without ggml-backend
747
772
  return;
748
773
  }
749
-
750
- #ifndef NDEBUG
751
- size_t offset =
752
- (char *)node->data -
753
- (char *)ggml_backend_buffer_get_base(node->buffer);
754
- size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
755
- assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
756
- assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
757
- #endif
758
774
  }
759
775
  }
760
776
  }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
773
789
  return true;
774
790
  }
775
791
 
792
+ if (galloc->n_leafs != graph->n_leafs) {
793
+ #ifndef NDEBUG
794
+ fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
795
+ #endif
796
+ return true;
797
+ }
798
+
776
799
  for (int i = 0; i < graph->n_nodes; i++) {
777
800
  struct ggml_tensor * node = graph->nodes[i];
778
801
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
787
810
  for (int j = 0; j < GGML_MAX_SRC; j++) {
788
811
  struct ggml_tensor * src = node->src[j];
789
812
  if (src == NULL) {
790
- break;
813
+ continue;
791
814
  }
792
815
  if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
793
816
  #ifndef NDEBUG
@@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
827
850
  }
828
851
 
829
852
  // allocate the graph tensors from the previous assignments
853
+ // nodes
830
854
  for (int i = 0; i < graph->n_nodes; i++) {
831
855
  struct ggml_tensor * node = graph->nodes[i];
832
856
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
833
857
  for (int j = 0; j < GGML_MAX_SRC; j++) {
834
858
  struct ggml_tensor * src = node->src[j];
835
859
  if (src == NULL) {
836
- break;
860
+ continue;
837
861
  }
838
- ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
862
+ ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
839
863
  }
840
- ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
864
+ ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
865
+ }
866
+ // leafs
867
+ for (int i = 0; i < graph->n_leafs; i++) {
868
+ struct ggml_tensor * leaf = graph->leafs[i];
869
+ struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
870
+ ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
841
871
  }
842
872
 
843
873
  return true;
@@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
219
219
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
220
220
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
221
221
 
222
+ if (!size) {
223
+ return;
224
+ }
225
+
222
226
  tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
223
227
  }
224
228
 
@@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
229
233
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
230
234
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
231
235
 
236
+ if (!size) {
237
+ return;
238
+ }
239
+
232
240
  tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
233
241
  }
234
242
 
@@ -748,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
748
756
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
749
757
  switch (op->op) {
750
758
  case GGML_OP_CPY:
751
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
759
+ return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
752
760
  case GGML_OP_MUL_MAT:
753
761
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
754
762
  default:
@@ -998,6 +1006,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
998
1006
  }
999
1007
  }
1000
1008
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
1009
+ return -1; // silence warning
1001
1010
  }
1002
1011
 
1003
1012
  #if 0
@@ -1032,7 +1041,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1032
1041
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1033
1042
  const struct ggml_tensor * src = tensor->src[i];
1034
1043
  if (src == NULL) {
1035
- break;
1044
+ continue;
1036
1045
  }
1037
1046
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1038
1047
  int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
@@ -1079,7 +1088,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
1079
1088
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1080
1089
  struct ggml_tensor * src = node->src[j];
1081
1090
  if (src == NULL) {
1082
- break;
1091
+ continue;
1083
1092
  }
1084
1093
  ggml_backend_t src_backend = tensor_backend(src);
1085
1094
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
@@ -1135,7 +1144,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1135
1144
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1136
1145
  struct ggml_tensor * src = node->src[j];
1137
1146
  if (src == NULL) {
1138
- break;
1147
+ continue;
1139
1148
  }
1140
1149
  if (tensor_backend_id(src) == -1) {
1141
1150
  tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
@@ -1247,7 +1256,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1247
1256
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1248
1257
  struct ggml_tensor * src = node->src[j];
1249
1258
  if (src == NULL) {
1250
- break;
1259
+ continue;
1251
1260
  }
1252
1261
  int src_backend_id = tensor_backend_id(src);
1253
1262
  if (src_backend_id == -1) {
@@ -1306,7 +1315,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1306
1315
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1307
1316
  struct ggml_tensor * src = node->src[j];
1308
1317
  if (src == NULL) {
1309
- break;
1318
+ continue;
1310
1319
  }
1311
1320
  int src_backend_id = tensor_backend_id(src);
1312
1321
  assert(src_backend_id != -1); // all inputs should be assigned by now
@@ -1353,7 +1362,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1353
1362
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1354
1363
  struct ggml_tensor * src = node->src[j];
1355
1364
  if (src == NULL) {
1356
- break;
1365
+ continue;
1357
1366
  }
1358
1367
  ggml_backend_t src_backend = tensor_backend(src);
1359
1368
  if (src_backend != tensor_backend /* && src_backend != NULL */) {
@@ -1659,7 +1668,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
1659
1668
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1660
1669
  struct ggml_tensor * s = src->src[i];
1661
1670
  if (s == NULL) {
1662
- break;
1671
+ continue;
1663
1672
  }
1664
1673
  dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1665
1674
  }
@@ -1688,7 +1697,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
1688
1697
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1689
1698
  struct ggml_tensor * s = src->src[i];
1690
1699
  if (s == NULL) {
1691
- break;
1700
+ continue;
1692
1701
  }
1693
1702
  graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1694
1703
  }