llama_cpp 0.14.4 → 0.14.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
4
- data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
3
+ metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
4
+ data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
5
5
  SHA512:
6
- metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
7
- data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
6
+ metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
7
+ data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
data/CHANGELOG.md CHANGED
@@ -1,10 +1,22 @@
1
+ ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
2
+
3
+ - Bump llama.cpp from b2658 to b2698.
4
+
5
+ ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
6
+
7
+ - Bump llama.cpp from b2608 to b2658.
8
+ - Add magic number constants.
9
+ - Add `token_cls` and `token_sep` methods to `Model`.
10
+
11
+ Implementation of bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file has been skipped.
12
+
1
13
  ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
2
14
 
3
15
  - Bump llama.cpp from b2496 to b2573.
4
16
  - Add file type constants.
5
17
  - Bump llama.cpp from b2573 to b2608.
6
18
 
7
- Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
19
+ Implementation of bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 has been skipped.
8
20
 
9
21
  ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
10
22
 
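A minimal Ruby sketch of the `token_cls` and `token_sep` accessors added in 0.14.5. The model path is a placeholder, and the constructor keywords (`model_path:`, `params:`) are assumptions based on the gem's README rather than part of this diff:

```ruby
require 'llama_cpp'

# Assumed constructor keywords; adjust to your installed version if they differ.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

# New in 0.14.5: CLS and SEP special-token ids, alongside the existing token_bos / token_eos.
puts model.token_cls # => Integer token id
puts model.token_sep # => Integer token id
```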
data/examples/chat.rb CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
127
127
  end
128
128
 
129
129
  if input_echo
130
- output = []
131
- embd.each { |token| output << context.model.token_to_piece(token) }
130
+ output = embd.map { |token| context.model.token_to_piece(token) }
132
131
  output_str = output.join
133
132
  output_str.chomp!(antiprompt) if first_input
134
133
  print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
136
135
 
137
136
  if embd_input.size <= n_consumed
138
137
  if antiprompt.size.positive?
139
- last_output = []
140
- last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
138
+ last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
141
139
  last_output_str = last_output.join
142
140
 
143
141
  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
37
37
  abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
38
38
  end
39
39
  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
40
+ FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
40
41
  end
41
42
 
42
43
  abort('libstdc++ is not found.') unless have_library('stdc++')
@@ -1478,6 +1478,8 @@ public:
1478
1478
  rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
1479
1479
  rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
1480
1480
  rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
1481
+ rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
1482
+ rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
1481
1483
  rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
1482
1484
  rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
1483
1485
  rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
1743
1745
  return INT2NUM(llama_token_eos(ptr->model));
1744
1746
  }
1745
1747
 
1748
+ static VALUE _llama_model_token_cls(VALUE self) {
1749
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1750
+ return INT2NUM(llama_token_cls(ptr->model));
1751
+ }
1752
+
1753
+ static VALUE _llama_model_token_sep(VALUE self) {
1754
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1755
+ return INT2NUM(llama_token_sep(ptr->model));
1756
+ }
1757
+
1746
1758
  static VALUE _llama_model_token_nl(VALUE self) {
1747
1759
  LLaMAModelWrapper* ptr = get_llama_model(self);
1748
1760
  return INT2NUM(llama_token_nl(ptr->model));
@@ -3414,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
3414
3426
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
3415
3427
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
3416
3428
 
3429
+ ss_magic.str("");
3430
+ ss_magic.clear(std::stringstream::goodbit);
3431
+ ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
3432
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
3433
+
3417
3434
  ss_magic.str("");
3418
3435
  ss_magic.clear(std::stringstream::goodbit);
3419
3436
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
3420
3437
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
3421
3438
 
3439
+ ss_magic.str("");
3440
+ ss_magic.clear(std::stringstream::goodbit);
3441
+ ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
3442
+ rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
3443
+
3422
3444
  ss_magic.str("");
3423
3445
  ss_magic.clear(std::stringstream::goodbit);
3424
3446
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
3425
3447
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
3426
3448
 
3427
3449
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
3450
+ rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
3428
3451
  }
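A small sketch of reading the newly registered constants from Ruby. As the hunk above shows, the magic numbers are exposed as hex strings (via `std::showbase`/`std::hex`) and the version via `std::to_string`; the example value shown in the comment is an assumption:

```ruby
require 'llama_cpp'

puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ   # hex string, e.g. "0x67677371"
puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC   # hex string
puts LLaMACpp::LLAMA_STATE_SEQ_VERSION # String, converted with std::to_string
```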
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.14.4'
6
+ VERSION = '0.14.6'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b2608'
9
+ LLAMA_CPP_VERSION = 'b2698'
10
10
  end
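For reference, a quick check of the bumped version constants after updating the gem:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.14.6"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2698"
```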
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
3
3
  LLAMA_CPP_VERSION: String
4
4
  LLAMA_DEFALUT_SEED: String
5
5
 
6
+ LLAMA_FILE_MAGIC_GGLA: String
7
+ LLAMA_FILE_MAGIC_GGSN: String
8
+ LLAMA_FILE_MAGIC_GGSQ: String
9
+ LLAMA_SESSION_MAGIC: String
10
+ LLAMA_SESSION_VERSION: String
11
+ LLAMA_STATE_SEQ_MAGIC: String
12
+ LLAMA_STATE_SEQ_VERSION: String
13
+
6
14
  LLAMA_VOCAB_TYPE_NONE: Integer
7
15
  LLAMA_VOCAB_TYPE_SPM: Integer
8
16
  LLAMA_VOCAB_TYPE_BPE: Integer
@@ -124,6 +132,8 @@ module LLaMACpp
124
132
  def type: (Integer) -> Integer
125
133
  def token_bos: () -> Integer
126
134
  def token_eos: () -> Integer
135
+ def token_cls: () -> Integer
136
+ def token_sep: () -> Integer
127
137
  def token_nl: () -> Integer
128
138
  def add_bos_token?: () -> bool
129
139
  def add_eos_token?: () -> bool
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2023 Georgi Gerganov
3
+ Copyright (c) 2023-2024 The ggml authors
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,7 +1,7 @@
1
1
  # Define the default target now so that it is always the first target
2
2
  BUILD_TARGETS = \
3
3
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
4
- simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
4
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
5
5
  retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
6
6
 
7
7
  # Binaries only useful for tests
@@ -10,7 +10,7 @@ TEST_TARGETS = \
10
10
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
11
11
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
12
12
  tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
13
- tests/test-json-schema-to-grammar
13
+ tests/test-json-schema-to-grammar tests/test-grammar-integration
14
14
 
15
15
  # Code coverage output files
16
16
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -386,6 +386,15 @@ ifdef LLAMA_OPENBLAS
386
386
  MK_LDFLAGS += $(shell pkg-config --libs openblas)
387
387
  endif # LLAMA_OPENBLAS
388
388
 
389
+ # TODO: temporary disable until MoE is fixed
390
+ # https://github.com/ggerganov/llama.cpp/pull/6716
391
+ LLAMA_NO_LLAMAFILE := 1
392
+
393
+ ifndef LLAMA_NO_LLAMAFILE
394
+ MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
395
+ OBJS += sgemm.o
396
+ endif
397
+
389
398
  ifdef LLAMA_BLIS
390
399
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
391
400
  MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -482,11 +491,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
482
491
 
483
492
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
484
493
  $(NVCC_COMPILE)
485
-
486
494
  endif # LLAMA_CUDA
487
495
 
488
496
  ifdef LLAMA_CLBLAST
489
-
490
497
  MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
491
498
  MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
492
499
  MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -605,6 +612,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
605
612
  $(CC) $(CFLAGS) -c $< -o $@
606
613
  endif # LLAMA_MPI
607
614
 
615
+ ifndef LLAMA_NO_LLAMAFILE
616
+ sgemm.o: sgemm.cpp sgemm.h ggml.h
617
+ $(CXX) $(CXXFLAGS) -c $< -o $@
618
+ endif
619
+
608
620
  GF_CC := $(CC)
609
621
  include scripts/get-flags.mk
610
622
 
@@ -648,7 +660,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
648
660
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
649
661
  ifndef CUDA_DOCKER_ARCH
650
662
  ifndef CUDA_POWER_ARCH
651
- $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
663
+ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
652
664
  endif # CUDA_POWER_ARCH
653
665
  endif # CUDA_DOCKER_ARCH
654
666
  endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -690,7 +702,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml
690
702
  $(CXX) $(CXXFLAGS) -c $< -o $@
691
703
 
692
704
  COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
693
- COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
705
+ COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
694
706
 
695
707
  common.o: common/common.cpp $(COMMON_H_DEPS)
696
708
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -724,7 +736,7 @@ lib: llama.o ggml.o $(OBJS)
724
736
  ar rcs libllama.a $^
725
737
 
726
738
  clean:
727
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
739
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
728
740
  rm -vrf ggml-cuda/*.o
729
741
 
730
742
  #
@@ -761,7 +773,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
761
773
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
762
774
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
763
775
 
764
- batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
776
+ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
765
777
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
766
778
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
767
779
 
@@ -793,7 +805,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
793
805
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
794
806
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
795
807
 
796
- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
808
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
797
809
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
798
810
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
799
811
 
@@ -805,6 +817,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
805
817
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
806
818
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
807
819
 
820
+ eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
821
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
822
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
823
+
808
824
  train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
809
825
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
810
826
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -923,6 +939,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
923
939
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
924
940
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
925
941
 
942
+ tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
943
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
944
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
945
+
926
946
  tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
927
947
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
928
948
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -137,7 +137,7 @@ extern "C" {
137
137
  /*
138
138
  Example usage:
139
139
 
140
- // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
140
+ // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
141
141
  // preferrably to run on the same backend as the buffer
142
142
  ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
143
143
 
@@ -1225,13 +1225,13 @@ static void ggml_cuda_op_mul_mat_cublas(
1225
1225
 
1226
1226
  // the main device has a larger memory buffer to hold the results from all GPUs
1227
1227
  // ldc == nrows of the matrix that cuBLAS writes into
1228
- int ldc = id == ctx.device ? ne0 : row_diff;
1228
+ int64_t ldc = id == ctx.device ? ne0 : row_diff;
1229
1229
 
1230
1230
  const int compute_capability = ggml_cuda_info().devices[id].cc;
1231
1231
 
1232
1232
  if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
1233
1233
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1234
- ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
1234
+ ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1235
1235
  if (src0->type != GGML_TYPE_F16) {
1236
1236
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
1237
1237
  GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
1241
1241
  }
1242
1242
  const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
1243
1243
 
1244
- ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
1244
+ ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
1245
1245
  if (src1->type != GGML_TYPE_F16) {
1246
1246
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1247
1247
  GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
1250
1250
  to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
1251
1251
  }
1252
1252
  const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
1253
- ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
1253
+ ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
1254
1254
 
1255
1255
  const half alpha_f16 = 1.0f;
1256
1256
  const half beta_f16 = 0.0f;
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
1377
1377
  const int64_t ne0 = dst->ne[0];
1378
1378
  const int64_t ne1 = dst->ne[1];
1379
1379
 
1380
- const int nb2 = dst->nb[2];
1381
- const int nb3 = dst->nb[3];
1380
+ const int64_t nb2 = dst->nb[2];
1381
+ const int64_t nb3 = dst->nb[3];
1382
1382
 
1383
1383
  GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1384
1384
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1946
1946
  } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
1947
1947
  // KQV single-batch
1948
1948
  ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
1949
- } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
1949
+ } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
1950
1950
  // KQ + KQV multi-batch
1951
1951
  ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
1952
1952
  } else if (use_dequantize_mul_mat_vec) {
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1960
1960
  }
1961
1961
  }
1962
1962
 
1963
+ struct mmid_row_mapping {
1964
+ int32_t i1;
1965
+ int32_t i2;
1966
+ };
1967
+
1968
+ static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
1969
+ int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
1970
+ const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
1971
+ int64_t ne11, int64_t ne10,
1972
+ size_t nb11, size_t nb12) {
1973
+ int32_t iid1 = blockIdx.x;
1974
+ int32_t id = blockIdx.y;
1975
+
1976
+ const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
1977
+
1978
+ if (row_id_i != i02) {
1979
+ return;
1980
+ }
1981
+
1982
+ const int64_t i11 = id % ne11;
1983
+ const int64_t i12 = iid1;
1984
+
1985
+ __shared__ int src1_row;
1986
+ if (threadIdx.x == 0) {
1987
+ src1_row = atomicAdd(cur_src1_row, 1);
1988
+ row_mapping[src1_row] = {id, iid1};
1989
+ }
1990
+ __syncthreads();
1991
+
1992
+ const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
1993
+ float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
1994
+
1995
+ for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
1996
+ src1_row_contiguous[i] = src1_row_original[i];
1997
+ }
1998
+ }
1999
+
2000
+ static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
2001
+ const mmid_row_mapping * __restrict__ row_mapping,
2002
+ int64_t ne0,
2003
+ size_t nb1, size_t nb2) {
2004
+ int32_t i = blockIdx.x;
2005
+
2006
+ const int32_t i1 = row_mapping[i].i1;
2007
+ const int32_t i2 = row_mapping[i].i2;
2008
+
2009
+ const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
2010
+ float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
2011
+
2012
+ for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
2013
+ dst_row_original[j] = dst_row_contiguous[j];
2014
+ }
2015
+ }
2016
+
1963
2017
  static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
1964
2018
  const ggml_tensor * src0 = dst->src[0];
1965
2019
  const ggml_tensor * src1 = dst->src[1];
1966
2020
  const ggml_tensor * ids = dst->src[2];
1967
2021
 
2022
+ GGML_TENSOR_BINARY_OP_LOCALS
2023
+
1968
2024
  GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
1969
2025
 
1970
2026
  cudaStream_t stream = ctx.stream();
1971
2027
 
1972
- const size_t nb11 = src1->nb[1];
1973
- const size_t nb1 = dst->nb[1];
1974
-
1975
- const int32_t id = ((int32_t *) dst->op_params)[0];
1976
- const int32_t n_as = src0->ne[2];
2028
+ const int64_t n_as = ne02;
2029
+ const int64_t n_ids = ids->ne[0];
1977
2030
 
1978
2031
  std::vector<char> ids_host(ggml_nbytes(ids));
1979
2032
  const char * ids_dev = (const char *) ids->data;
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
1982
2035
 
1983
2036
  ggml_tensor src0_row = *src0;
1984
2037
  ggml_tensor src1_row = *src1;
1985
- ggml_tensor dst_row = *dst;
2038
+ ggml_tensor dst_row = *dst;
1986
2039
 
1987
2040
  char * src0_original = (char *) src0->data;
1988
2041
  char * src1_original = (char *) src1->data;
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
1990
2043
 
1991
2044
  src0_row.ne[2] = 1;
1992
2045
  src0_row.ne[3] = 1;
1993
- src0_row.nb[3] = src0->nb[2];
2046
+ src0_row.nb[3] = nb02;
1994
2047
 
1995
- if (src1->ne[1] == 1) {
1996
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
1997
- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
2048
+ src1_row.ne[1] = 1;
2049
+ src1_row.ne[2] = 1;
2050
+ src1_row.ne[3] = 1;
2051
+ src1_row.nb[2] = nb11;
2052
+ src1_row.nb[3] = nb11;
1998
2053
 
1999
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
2054
+ dst_row.ne[1] = 1;
2055
+ dst_row.ne[2] = 1;
2056
+ dst_row.ne[3] = 1;
2057
+ dst_row.nb[2] = nb1;
2058
+ dst_row.nb[3] = nb1;
2000
2059
 
2001
- src0_row.data = src0_original + row_id*src0->nb[2];
2002
- src1_row.data = src1_original + i01*src1->nb[1];
2003
- dst_row.data = dst_original + i01*dst->nb[1];
2060
+ if (ne12 == 1) {
2061
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2062
+ for (int64_t id = 0; id < n_ids; id++) {
2063
+ const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2004
2064
 
2005
- ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
2065
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
2066
+
2067
+ const int64_t i11 = id % ne11;
2068
+ const int64_t i12 = iid1;
2069
+
2070
+ const int64_t i1 = id;
2071
+ const int64_t i2 = i12;
2072
+
2073
+ src0_row.data = src0_original + i02*nb02;
2074
+ src1_row.data = src1_original + i11*nb11 + i12*nb12;
2075
+ dst_row.data = dst_original + i1*nb1 + i2*nb2;
2076
+
2077
+ ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
2078
+ }
2006
2079
  }
2007
2080
  } else {
2008
2081
  ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
2011
2084
  src1_row.data = src1_contiguous.get();
2012
2085
  dst_row.data = dst_contiguous.get();
2013
2086
 
2014
- for (int32_t row_id = 0; row_id < n_as; ++row_id) {
2087
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
2015
2088
  int64_t num_src1_rows = 0;
2016
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
2017
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
2018
2089
 
2019
- if (row_id_i != row_id) {
2020
- continue;
2021
- }
2090
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2091
+ for (int64_t id = 0; id < n_ids; id++) {
2092
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2093
+
2094
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
2022
2095
 
2023
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
2096
+ if (row_id_i != i02) {
2097
+ continue;
2098
+ }
2024
2099
 
2025
- CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
2026
- nb11, cudaMemcpyDeviceToDevice, stream));
2027
- num_src1_rows++;
2100
+ num_src1_rows++;
2101
+ }
2028
2102
  }
2029
2103
 
2030
2104
  if (num_src1_rows == 0) {
2031
2105
  continue;
2032
2106
  }
2033
2107
 
2034
- src0_row.data = src0_original + row_id*src0->nb[2];
2108
+ ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
2109
+ ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
2110
+ CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
2035
2111
 
2036
- src1_row.ne[1] = num_src1_rows;
2037
- dst_row.ne[1] = num_src1_rows;
2112
+ {
2113
+ dim3 block_dims(std::min((unsigned int)ne10, 768u));
2114
+ dim3 grid_dims(ids->ne[1], n_ids);
2115
+ k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
2116
+ src1_original, src1_contiguous.get(),
2117
+ dev_cur_src1_row.get(), dev_row_mapping.get(),
2118
+ ids_dev, i02, ids->nb[1], ids->nb[0],
2119
+ ne11, ne10,
2120
+ nb11, nb12);
2121
+ CUDA_CHECK(cudaGetLastError());
2122
+ }
2038
2123
 
2124
+ src0_row.data = src0_original + i02*nb02;
2125
+
2126
+ GGML_ASSERT(nb11 == sizeof(float)*ne10);
2127
+ GGML_ASSERT(nb1 == sizeof(float)*ne0);
2128
+
2129
+ src1_row.ne[1] = num_src1_rows;
2039
2130
  src1_row.nb[1] = nb11;
2040
2131
  src1_row.nb[2] = num_src1_rows*nb11;
2041
2132
  src1_row.nb[3] = num_src1_rows*nb11;
2042
2133
 
2134
+ dst_row.ne[1] = num_src1_rows;
2043
2135
  dst_row.nb[1] = nb1;
2044
2136
  dst_row.nb[2] = num_src1_rows*nb1;
2045
2137
  dst_row.nb[3] = num_src1_rows*nb1;
2046
2138
 
2047
2139
  ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
2048
2140
 
2049
- num_src1_rows = 0;
2050
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
2051
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
2052
-
2053
- if (row_id_i != row_id) {
2054
- continue;
2055
- }
2056
-
2057
- GGML_ASSERT(row_id >= 0 && row_id < n_as);
2058
-
2059
- CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
2060
- nb1, cudaMemcpyDeviceToDevice, stream));
2061
- num_src1_rows++;
2141
+ {
2142
+ dim3 block_dims(std::min((unsigned int)ne0, 768u));
2143
+ dim3 grid_dims(num_src1_rows);
2144
+ k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
2145
+ dst_original, dst_contiguous.get(),
2146
+ dev_row_mapping.get(),
2147
+ ne0,
2148
+ nb1, nb2);
2149
+ CUDA_CHECK(cudaGetLastError());
2062
2150
  }
2063
2151
  }
2064
2152
  }
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
2487
2575
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
2488
2576
  const int min_batch_size = 32;
2489
2577
 
2490
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
2578
+ return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
2579
+ (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
2491
2580
 
2492
2581
  GGML_UNUSED(backend);
2493
2582
  }
@@ -2617,6 +2706,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
2617
2706
  return false;
2618
2707
  }
2619
2708
 
2709
+ #if CUDART_VERSION >= 11100
2620
2710
  cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
2621
2711
  if (err != cudaSuccess) {
2622
2712
  // clear the error
@@ -2627,6 +2717,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
2627
2717
  return false;
2628
2718
  }
2629
2719
  return true;
2720
+ #else
2721
+ return false;
2722
+ #endif
2630
2723
  }
2631
2724
 
2632
2725
  GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
@@ -88,7 +88,7 @@ typedef uint16_t ggml_fp16_internal_t;
88
88
  #if defined(_MSC_VER) || defined(__MINGW32__)
89
89
  #include <intrin.h>
90
90
  #else
91
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
91
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
92
92
  #if !defined(__riscv)
93
93
  #include <immintrin.h>
94
94
  #endif