llama_cpp 0.14.5 → 0.14.7

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
- data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
+ metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
+ data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
  SHA512:
- metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
- data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
+ metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
+ data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
+ ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+ - Bump llama.cpp from b2698 to b2740.
+ - Add `keep_split` accessor to `ModelQuantizeParams`.
+ - Add `pooling_type` method to `Context`.
+ - Add `token_is_eog?` method to `Model`.
+
+ Implementation binding for llama_sample_token_with_rng has been skipped.
+
+ ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
+
+ - Bump llama.cpp from b2658 to b2698.
+
  ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13

  - Bump llama.cpp from b2608 to b2658.
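
For context, a minimal Ruby sketch of the additions listed in the 0.14.7 entry above. Only the accessors shown in this diff are exercised; the `model` and `context` objects referenced in the comments are assumed to be constructed elsewhere from a local GGUF file, and the return values shown are illustrative, not guaranteed:

```ruby
require 'llama_cpp'

# New in 0.14.7: keep split GGUF shards when quantizing.
params = LLaMACpp::ModelQuantizeParams.new
params.keep_split = true
params.keep_split # => true

# Assuming `model` is a LLaMACpp::Model and `context` a LLaMACpp::Context
# built elsewhere:
# model.token_is_eog?(model.token_eos) # => true for end-of-generation tokens
# context.pooling_type                 # => Integer (a llama_pooling_type value)
```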
@@ -1321,6 +1321,8 @@ public:
  rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
  }

  private:
@@ -1405,6 +1407,18 @@ private:
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
  return ptr->params.pure ? Qtrue : Qfalse;
  }
+
+ // keep_split
+ static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ ptr->params.keep_split = RTEST(keep_split) ? true : false;
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -1487,6 +1501,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
  rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
  rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+ rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
  }

  private:
@@ -1634,10 +1649,10 @@ private:
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -1789,6 +1804,16 @@ private:
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_token_eot(ptr->model));
  }
+
+ static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+ if (!RB_INTEGER_TYPE_P(token_)) {
+ rb_raise(rb_eArgError, "token must be an integer");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2102,6 +2127,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
  rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
  rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+ rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
  }

  private:
@@ -3225,6 +3251,15 @@ private:

  return Qnil;
  }
+
+ static VALUE _llama_context_pooling_type(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+ }
  };

  const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.5'
+ VERSION = '0.14.7'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2658'
+ LLAMA_CPP_VERSION = 'b2740'
  end
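
A quick way to confirm which gem release and bundled llama.cpp revision are loaded at runtime (a minimal sketch; the printed values assume 0.14.7 is installed):

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.14.7"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2740"
```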
data/sig/llama_cpp.rbs CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
  def token_middle: () -> Integer
  def token_suffix: () -> Integer
  def token_eot: () -> Integer
+ def token_is_eog?: (Integer) -> bool
  end

  class Timings
@@ -260,6 +261,7 @@ module LLaMACpp
  def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
  def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+ def pooling_type: () -> Integer
  end

  class ContextParams
@@ -328,6 +330,8 @@ module LLaMACpp
  def only_copy=: (bool) -> bool
  def pure: () -> bool
  def pure=: (bool) -> bool
+ def keep_split: () -> bool
+ def keep_split=: (bool) -> bool
  end

  class Params = ContextParams
@@ -386,6 +386,11 @@ ifdef LLAMA_OPENBLAS
  MK_LDFLAGS += $(shell pkg-config --libs openblas)
  endif # LLAMA_OPENBLAS

+ ifndef LLAMA_NO_LLAMAFILE
+ MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+ OBJS += sgemm.o
+ endif
+
  ifdef LLAMA_BLIS
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
  MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -482,11 +487,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
  $(NVCC_COMPILE)
-
  endif # LLAMA_CUDA

  ifdef LLAMA_CLBLAST
-
  MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
  MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
  MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -605,6 +608,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
  $(CC) $(CFLAGS) -c $< -o $@
  endif # LLAMA_MPI

+ ifndef LLAMA_NO_LLAMAFILE
+ sgemm.o: sgemm.cpp sgemm.h ggml.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+ endif
+
  GF_CC := $(CC)
  include scripts/get-flags.mk

@@ -689,8 +697,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
- COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+ COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

  common.o: common/common.cpp $(COMMON_H_DEPS)
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -724,7 +732,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o

  #
@@ -761,7 +769,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
+ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -793,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

+ # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+ examples/server/%.hpp: examples/server/public/% Makefile
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+ echo "unsigned char $${NAME}[] = {" && \
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+ echo "};" && \
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+ ) > $@
+
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -371,16 +371,16 @@ struct ggml_gallocr {
  };

  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
- ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
  GGML_ASSERT(galloc != NULL);

- galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+ galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
  GGML_ASSERT(galloc->bufts != NULL);

- galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
  GGML_ASSERT(galloc->buffers != NULL);

- galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+ galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
  GGML_ASSERT(galloc->buf_tallocs != NULL);

  for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  free(galloc->hash_set.keys);
  free(galloc->hash_values);
  galloc->hash_set.size = hash_size;
- galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
- galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+ galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+ galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
  GGML_ASSERT(galloc->hash_set.keys != NULL);
  GGML_ASSERT(galloc->hash_values != NULL);
  } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  // set the node_allocs from the hash table
  if (galloc->n_nodes < graph->n_nodes) {
  free(galloc->node_allocs);
- galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+ galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
  GGML_ASSERT(galloc->node_allocs != NULL);
  }
  galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }
  if (galloc->n_leafs < graph->n_leafs) {
  free(galloc->leaf_allocs);
- galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+ galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
  GGML_ASSERT(galloc->leaf_allocs != NULL);
  }
  galloc->n_leafs = graph->n_leafs;
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_CPY:
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
+ return
+ op->type != GGML_TYPE_IQ2_XXS &&
+ op->type != GGML_TYPE_IQ2_XS &&
+ op->type != GGML_TYPE_IQ1_S &&
+ op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
  case GGML_OP_MUL_MAT:
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
  default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
  GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

- struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+ struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));

  // initialize hash table
  sched->hash_set = ggml_hash_set_new(graph_size);
- sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
- sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+ sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+ sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));

  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));

  sched->n_backends = n_backends;

  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

  const int initial_splits_capacity = 16;
- sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+ sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
  sched->splits_capacity = initial_splits_capacity;

  for (int b = 0; b < n_backends; b++) {
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
  struct ggml_hash_set hash_set = {
  /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+ /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
  };
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
- bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+ struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+ bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

  struct ggml_init_params params = {
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -1231,7 +1231,7 @@ static void ggml_cuda_op_mul_mat_cublas(

  if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
- ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
+ ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
  if (src0->type != GGML_TYPE_F16) {
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
  GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
  }
  const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();

- ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
+ ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
  if (src1->type != GGML_TYPE_F16) {
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
  GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
  to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
  }
  const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
- ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
+ ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);

  const half alpha_f16 = 1.0f;
  const half beta_f16 = 0.0f;
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
  } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
  // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
- } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+ } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
  } else if (use_dequantize_mul_mat_vec) {
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
  }
  }

+ struct mmid_row_mapping {
+ int32_t i1;
+ int32_t i2;
+ };
+
+ static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
+ int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
+ const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+ int64_t ne11, int64_t ne10,
+ size_t nb11, size_t nb12) {
+ int32_t iid1 = blockIdx.x;
+ int32_t id = blockIdx.y;
+
+ const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+ if (row_id_i != i02) {
+ return;
+ }
+
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = iid1;
+
+ __shared__ int src1_row;
+ if (threadIdx.x == 0) {
+ src1_row = atomicAdd(cur_src1_row, 1);
+ row_mapping[src1_row] = {id, iid1};
+ }
+ __syncthreads();
+
+ const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+ float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+ for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
+ src1_row_contiguous[i] = src1_row_original[i];
+ }
+ }
+
+ static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
+ const mmid_row_mapping * __restrict__ row_mapping,
+ int64_t ne0,
+ size_t nb1, size_t nb2) {
+ int32_t i = blockIdx.x;
+
+ const int32_t i1 = row_mapping[i].i1;
+ const int32_t i2 = row_mapping[i].i2;
+
+ const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+ float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+ for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
+ dst_row_original[j] = dst_row_contiguous[j];
+ }
+ }
+
  static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  const ggml_tensor * src0 = dst->src[0];
  const ggml_tensor * src1 = dst->src[1];
  const ggml_tensor * ids = dst->src[2];

+ GGML_TENSOR_BINARY_OP_LOCALS
+
  GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");

  cudaStream_t stream = ctx.stream();

- const size_t nb11 = src1->nb[1];
- const size_t nb1 = dst->nb[1];
-
- const int32_t id = ((int32_t *) dst->op_params)[0];
- const int32_t n_as = src0->ne[2];
+ const int64_t n_as = ne02;
+ const int64_t n_ids = ids->ne[0];

  std::vector<char> ids_host(ggml_nbytes(ids));
  const char * ids_dev = (const char *) ids->data;
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

  ggml_tensor src0_row = *src0;
  ggml_tensor src1_row = *src1;
- ggml_tensor dst_row = *dst;
+ ggml_tensor dst_row = *dst;

  char * src0_original = (char *) src0->data;
  char * src1_original = (char *) src1->data;
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

  src0_row.ne[2] = 1;
  src0_row.ne[3] = 1;
- src0_row.nb[3] = src0->nb[2];
+ src0_row.nb[3] = nb02;

- if (src1->ne[1] == 1) {
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+ src1_row.ne[1] = 1;
+ src1_row.ne[2] = 1;
+ src1_row.ne[3] = 1;
+ src1_row.nb[2] = nb11;
+ src1_row.nb[3] = nb11;

- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ dst_row.ne[1] = 1;
+ dst_row.ne[2] = 1;
+ dst_row.ne[3] = 1;
+ dst_row.nb[2] = nb1;
+ dst_row.nb[3] = nb1;

- src0_row.data = src0_original + row_id*src0->nb[2];
- src1_row.data = src1_original + i01*src1->nb[1];
- dst_row.data = dst_original + i01*dst->nb[1];
+ if (ne12 == 1) {
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

- ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = iid1;
+
+ const int64_t i1 = id;
+ const int64_t i2 = i12;
+
+ src0_row.data = src0_original + i02*nb02;
+ src1_row.data = src1_original + i11*nb11 + i12*nb12;
+ dst_row.data = dst_original + i1*nb1 + i2*nb2;
+
+ ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+ }
  }
  } else {
  ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
  src1_row.data = src1_contiguous.get();
  dst_row.data = dst_contiguous.get();

- for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
  int64_t num_src1_rows = 0;
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

- if (row_id_i != row_id) {
- continue;
- }
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);

- CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
- nb11, cudaMemcpyDeviceToDevice, stream));
- num_src1_rows++;
+ if (row_id_i != i02) {
+ continue;
+ }
+
+ num_src1_rows++;
+ }
  }

  if (num_src1_rows == 0) {
  continue;
  }

- src0_row.data = src0_original + row_id*src0->nb[2];
+ ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+ ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+ CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));

- src1_row.ne[1] = num_src1_rows;
- dst_row.ne[1] = num_src1_rows;
+ {
+ dim3 block_dims(std::min((unsigned int)ne10, 768u));
+ dim3 grid_dims(ids->ne[1], n_ids);
+ k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+ src1_original, src1_contiguous.get(),
+ dev_cur_src1_row.get(), dev_row_mapping.get(),
+ ids_dev, i02, ids->nb[1], ids->nb[0],
+ ne11, ne10,
+ nb11, nb12);
+ CUDA_CHECK(cudaGetLastError());
+ }
+
+ src0_row.data = src0_original + i02*nb02;

+ GGML_ASSERT(nb11 == sizeof(float)*ne10);
+ GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+ src1_row.ne[1] = num_src1_rows;
  src1_row.nb[1] = nb11;
  src1_row.nb[2] = num_src1_rows*nb11;
  src1_row.nb[3] = num_src1_rows*nb11;

+ dst_row.ne[1] = num_src1_rows;
  dst_row.nb[1] = nb1;
  dst_row.nb[2] = num_src1_rows*nb1;
  dst_row.nb[3] = num_src1_rows*nb1;

  ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);

- num_src1_rows = 0;
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
- if (row_id_i != row_id) {
- continue;
- }
-
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
- CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
- nb1, cudaMemcpyDeviceToDevice, stream));
- num_src1_rows++;
+ {
+ dim3 block_dims(std::min((unsigned int)ne0, 768u));
+ dim3 grid_dims(num_src1_rows);
+ k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+ dst_original, dst_contiguous.get(),
+ dev_row_mapping.get(),
+ ne0,
+ nb1, nb2);
+ CUDA_CHECK(cudaGetLastError());
  }
  }
  }
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
  const int min_batch_size = 32;

- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+ return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+ (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);

  GGML_UNUSED(backend);
  }