llama_cpp 0.14.5 → 0.14.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
- data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
+ metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
+ data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
  SHA512:
- metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
- data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
+ metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
+ data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
+ ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+ - Bump llama.cpp from b2698 to b2740.
+ - Add `keep_split` accessor to `ModelQuantizeParams`.
+ - Add `pooling_type` method to `Context`.
+ - Add `token_is_eog?` method to `Model`.
+
+ Implementation binding for llama_sample_token_with_rng has been skipped.
+
+ ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
+
+ - Bump llama.cpp from b2658 to b2698.
+
  ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
 
  - Bump llama.cpp from b2608 to b2658.
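
For orientation, here is a minimal usage sketch of the Ruby-side additions listed in the 0.14.7 entry above (`Context#pooling_type` and `Model#token_is_eog?`). The model path and the surrounding setup calls are illustrative assumptions in the style of the gem's README, not content taken from this diff.

```ruby
require 'llama_cpp'

# Hypothetical GGUF path; replace with a real model file.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# New in 0.14.7: the pooling mode the context was created with,
# returned as one of the LLAMA_POOLING_TYPE_* integer constants.
puts context.pooling_type

# New in 0.14.7: true for any end-of-generation token (EOS, EOT, ...),
# which generalizes comparing a sampled token against a single EOS id.
eos = model.token_eos
puts model.token_is_eog?(eos)
```
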
@@ -1321,6 +1321,8 @@ public:
  rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
  }
 
  private:
@@ -1405,6 +1407,18 @@ private:
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
  return ptr->params.pure ? Qtrue : Qfalse;
  }
+
+ // keep_split
+ static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ ptr->params.keep_split = RTEST(keep_split) ? true : false;
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
  };
 
  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -1487,6 +1501,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
  rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
  rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+ rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
  }
 
  private:
@@ -1634,10 +1649,10 @@ private:
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -1789,6 +1804,16 @@ private:
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_token_eot(ptr->model));
  }
+
+ static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+ if (!RB_INTEGER_TYPE_P(token_)) {
+ rb_raise(rb_eArgError, "token must be an integer");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+ }
  };
 
  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2102,6 +2127,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
  rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
  rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+ rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
  }
 
  private:
@@ -3225,6 +3251,15 @@ private:
 
  return Qnil;
  }
+
+ static VALUE _llama_context_pooling_type(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+ }
  };
 
  const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.5'
+ VERSION = '0.14.7'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2658'
+ LLAMA_CPP_VERSION = 'b2740'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
  def token_middle: () -> Integer
  def token_suffix: () -> Integer
  def token_eot: () -> Integer
+ def token_is_eog?: (Integer) -> bool
  end
 
  class Timings
@@ -260,6 +261,7 @@ module LLaMACpp
  def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
  def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+ def pooling_type: () -> Integer
  end
 
  class ContextParams
@@ -328,6 +330,8 @@ module LLaMACpp
  def only_copy=: (bool) -> bool
  def pure: () -> bool
  def pure=: (bool) -> bool
+ def keep_split: () -> bool
+ def keep_split=: (bool) -> bool
  end
 
  class Params = ContextParams
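
The `keep_split` flag added above mirrors llama.cpp's `--keep-split` quantization option, which writes the quantized output as split GGUF files matching the input shards. A hedged sketch of how the new accessor could be used from Ruby follows; the `model_quantize` keyword names, the ftype constant, and the file names are assumptions for illustration, not taken from this diff.

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M # assumed constant name
params.keep_split = true # new in 0.14.7: keep the output sharded like the input

LLaMACpp.model_quantize(
  input_path:  'model-f16-00001-of-00003.gguf', # hypothetical shard name
  output_path: 'model-q4_k_m.gguf',
  params: params
)
```
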
@@ -386,6 +386,11 @@ ifdef LLAMA_OPENBLAS
  MK_LDFLAGS += $(shell pkg-config --libs openblas)
  endif # LLAMA_OPENBLAS
 
+ ifndef LLAMA_NO_LLAMAFILE
+ MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+ OBJS += sgemm.o
+ endif
+
  ifdef LLAMA_BLIS
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
  MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -482,11 +487,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
  $(NVCC_COMPILE)
-
  endif # LLAMA_CUDA
 
  ifdef LLAMA_CLBLAST
-
  MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
  MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
  MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -605,6 +608,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
  $(CC) $(CFLAGS) -c $< -o $@
  endif # LLAMA_MPI
 
+ ifndef LLAMA_NO_LLAMAFILE
+ sgemm.o: sgemm.cpp sgemm.h ggml.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+ endif
+
  GF_CC := $(CC)
  include scripts/get-flags.mk
 
@@ -689,8 +697,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
 
- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
- COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+ COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
  common.o: common/common.cpp $(COMMON_H_DEPS)
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -724,7 +732,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^
 
  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o
 
  #
@@ -761,7 +769,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
- batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
+ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -793,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
+ # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+ examples/server/%.hpp: examples/server/public/% Makefile
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+ echo "unsigned char $${NAME}[] = {" && \
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+ echo "};" && \
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+ ) > $@
+
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -371,16 +371,16 @@ struct ggml_gallocr {
  };
 
  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
- ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
  GGML_ASSERT(galloc != NULL);
 
- galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+ galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
  GGML_ASSERT(galloc->bufts != NULL);
 
- galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
  GGML_ASSERT(galloc->buffers != NULL);
 
- galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+ galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
  GGML_ASSERT(galloc->buf_tallocs != NULL);
 
  for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  free(galloc->hash_set.keys);
  free(galloc->hash_values);
  galloc->hash_set.size = hash_size;
- galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
- galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+ galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+ galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
  GGML_ASSERT(galloc->hash_set.keys != NULL);
  GGML_ASSERT(galloc->hash_values != NULL);
  } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  // set the node_allocs from the hash table
  if (galloc->n_nodes < graph->n_nodes) {
  free(galloc->node_allocs);
- galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+ galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
  GGML_ASSERT(galloc->node_allocs != NULL);
  }
  galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }
  if (galloc->n_leafs < graph->n_leafs) {
  free(galloc->leaf_allocs);
- galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+ galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
  GGML_ASSERT(galloc->leaf_allocs != NULL);
  }
  galloc->n_leafs = graph->n_leafs;
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_CPY:
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
+ return
+ op->type != GGML_TYPE_IQ2_XXS &&
+ op->type != GGML_TYPE_IQ2_XS &&
+ op->type != GGML_TYPE_IQ1_S &&
+ op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
  case GGML_OP_MUL_MAT:
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
  default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
  GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
- struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+ struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
  // initialize hash table
  sched->hash_set = ggml_hash_set_new(graph_size);
- sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
- sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+ sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+ sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
 
  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
 
  sched->n_backends = n_backends;
 
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
  const int initial_splits_capacity = 16;
- sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+ sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
  sched->splits_capacity = initial_splits_capacity;
 
  for (int b = 0; b < n_backends; b++) {
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
  struct ggml_hash_set hash_set = {
  /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+ /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
  };
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
- bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+ struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+ bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
  struct ggml_init_params params = {
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -1231,7 +1231,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
  if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
- ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
+ ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
  if (src0->type != GGML_TYPE_F16) {
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
  GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
  }
  const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
 
- ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
+ ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
  if (src1->type != GGML_TYPE_F16) {
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
  GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
  to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
  }
  const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
- ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
+ ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
 
  const half alpha_f16 = 1.0f;
  const half beta_f16 = 0.0f;
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
  } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
  // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
- } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+ } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
  } else if (use_dequantize_mul_mat_vec) {
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
  }
  }
 
+ struct mmid_row_mapping {
+ int32_t i1;
+ int32_t i2;
+ };
+
+ static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
+ int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
+ const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+ int64_t ne11, int64_t ne10,
+ size_t nb11, size_t nb12) {
+ int32_t iid1 = blockIdx.x;
+ int32_t id = blockIdx.y;
+
+ const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+ if (row_id_i != i02) {
+ return;
+ }
+
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = iid1;
+
+ __shared__ int src1_row;
+ if (threadIdx.x == 0) {
+ src1_row = atomicAdd(cur_src1_row, 1);
+ row_mapping[src1_row] = {id, iid1};
+ }
+ __syncthreads();
+
+ const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+ float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+ for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
+ src1_row_contiguous[i] = src1_row_original[i];
+ }
+ }
+
+ static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
+ const mmid_row_mapping * __restrict__ row_mapping,
+ int64_t ne0,
+ size_t nb1, size_t nb2) {
+ int32_t i = blockIdx.x;
+
+ const int32_t i1 = row_mapping[i].i1;
+ const int32_t i2 = row_mapping[i].i2;
+
+ const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+ float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+ for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
+ dst_row_original[j] = dst_row_contiguous[j];
+ }
+ }
+
  static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  const ggml_tensor * src0 = dst->src[0];
  const ggml_tensor * src1 = dst->src[1];
  const ggml_tensor * ids = dst->src[2];
 
+ GGML_TENSOR_BINARY_OP_LOCALS
+
  GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
 
  cudaStream_t stream = ctx.stream();
 
- const size_t nb11 = src1->nb[1];
- const size_t nb1 = dst->nb[1];
-
- const int32_t id = ((int32_t *) dst->op_params)[0];
- const int32_t n_as = src0->ne[2];
+ const int64_t n_as = ne02;
+ const int64_t n_ids = ids->ne[0];
 
  std::vector<char> ids_host(ggml_nbytes(ids));
  const char * ids_dev = (const char *) ids->data;
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
  ggml_tensor src0_row = *src0;
  ggml_tensor src1_row = *src1;
- ggml_tensor dst_row = *dst;
+ ggml_tensor dst_row = *dst;
 
  char * src0_original = (char *) src0->data;
  char * src1_original = (char *) src1->data;
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
  src0_row.ne[2] = 1;
  src0_row.ne[3] = 1;
- src0_row.nb[3] = src0->nb[2];
+ src0_row.nb[3] = nb02;
 
- if (src1->ne[1] == 1) {
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+ src1_row.ne[1] = 1;
+ src1_row.ne[2] = 1;
+ src1_row.ne[3] = 1;
+ src1_row.nb[2] = nb11;
+ src1_row.nb[3] = nb11;
 
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ dst_row.ne[1] = 1;
+ dst_row.ne[2] = 1;
+ dst_row.ne[3] = 1;
+ dst_row.nb[2] = nb1;
+ dst_row.nb[3] = nb1;
 
- src0_row.data = src0_original + row_id*src0->nb[2];
- src1_row.data = src1_original + i01*src1->nb[1];
- dst_row.data = dst_original + i01*dst->nb[1];
+ if (ne12 == 1) {
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
- ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = iid1;
+
+ const int64_t i1 = id;
+ const int64_t i2 = i12;
+
+ src0_row.data = src0_original + i02*nb02;
+ src1_row.data = src1_original + i11*nb11 + i12*nb12;
+ dst_row.data = dst_original + i1*nb1 + i2*nb2;
+
+ ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+ }
  }
  } else {
  ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
  src1_row.data = src1_contiguous.get();
  dst_row.data = dst_contiguous.get();
 
- for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
  int64_t num_src1_rows = 0;
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
- if (row_id_i != row_id) {
- continue;
- }
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
 
- CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
- nb11, cudaMemcpyDeviceToDevice, stream));
- num_src1_rows++;
+ if (row_id_i != i02) {
+ continue;
+ }
+
+ num_src1_rows++;
+ }
  }
 
  if (num_src1_rows == 0) {
  continue;
  }
 
- src0_row.data = src0_original + row_id*src0->nb[2];
+ ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+ ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+ CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
 
- src1_row.ne[1] = num_src1_rows;
- dst_row.ne[1] = num_src1_rows;
+ {
+ dim3 block_dims(std::min((unsigned int)ne10, 768u));
+ dim3 grid_dims(ids->ne[1], n_ids);
+ k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+ src1_original, src1_contiguous.get(),
+ dev_cur_src1_row.get(), dev_row_mapping.get(),
+ ids_dev, i02, ids->nb[1], ids->nb[0],
+ ne11, ne10,
+ nb11, nb12);
+ CUDA_CHECK(cudaGetLastError());
+ }
+
+ src0_row.data = src0_original + i02*nb02;
 
+ GGML_ASSERT(nb11 == sizeof(float)*ne10);
+ GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+ src1_row.ne[1] = num_src1_rows;
  src1_row.nb[1] = nb11;
  src1_row.nb[2] = num_src1_rows*nb11;
  src1_row.nb[3] = num_src1_rows*nb11;
 
+ dst_row.ne[1] = num_src1_rows;
  dst_row.nb[1] = nb1;
  dst_row.nb[2] = num_src1_rows*nb1;
  dst_row.nb[3] = num_src1_rows*nb1;
 
  ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
 
- num_src1_rows = 0;
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
- if (row_id_i != row_id) {
- continue;
- }
-
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
- CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
- nb1, cudaMemcpyDeviceToDevice, stream));
- num_src1_rows++;
+ {
+ dim3 block_dims(std::min((unsigned int)ne0, 768u));
+ dim3 grid_dims(num_src1_rows);
+ k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+ dst_original, dst_contiguous.get(),
+ dev_row_mapping.get(),
+ ne0,
+ nb1, nb2);
+ CUDA_CHECK(cudaGetLastError());
  }
  }
  }
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
  const int min_batch_size = 32;
 
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+ return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+ (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 
  GGML_UNUSED(backend);
  }