llama_cpp 0.14.4 → 0.14.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
-   data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
+   metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
+   data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
  SHA512:
-   metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
-   data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
+   metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
+   data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
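The digests above cover the two members of the packaged gem. A minimal verification sketch, assuming the archive has already been fetched and unpacked (e.g. `gem fetch llama_cpp -v 0.14.6` followed by `tar -xf llama_cpp-0.14.6.gem` — a .gem file is a plain tar archive):

```ruby
require 'digest'

# checksums.yaml records digests of the metadata.gz and data.tar.gz
# members of the unpacked .gem archive; compare against the values above.
%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name} SHA256: #{Digest::SHA256.file(name).hexdigest}"
  puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
end
```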
data/CHANGELOG.md CHANGED
@@ -1,10 +1,22 @@
+ ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
+
+ - Bump llama.cpp from b2658 to b2698.
+
+ ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
+
+ - Bump llama.cpp from b2608 to b2658.
+ - Add magic number constants.
+ - Add `token_cls` and `token_sep` methods to `Model`.
+
+ Bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file have been skipped.
+
  ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
 
  - Bump llama.cpp from b2496 to b2573.
  - Add file type constants.
  - Bump llama.cpp from b2573 to b2608.
 
- Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
+ Bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 have been skipped.
 
  ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
 
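The 0.14.5 entry adds `token_cls` and `token_sep` to `Model`; as the binding hunks further down show, they wrap llama.cpp's `llama_token_cls`/`llama_token_sep`, which return -1 when the vocabulary defines no such token. A minimal usage sketch (the model path is hypothetical; any GGUF model will do):

```ruby
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: 'models/model.gguf', params: model_params)

puts model.token_cls # CLS token id, or -1 if the vocabulary has none
puts model.token_sep # SEP token id, or -1 if the vocabulary has none

# The new magic-number constants are exposed as hex strings:
puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ # e.g. "0x67677371"
puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC
```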
data/examples/chat.rb CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
        end
 
        if input_echo
-         output = []
-         embd.each { |token| output << context.model.token_to_piece(token) }
+         output = embd.map { |token| context.model.token_to_piece(token) }
          output_str = output.join
          output_str.chomp!(antiprompt) if first_input
          print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
        if embd_input.size <= n_consumed
          if antiprompt.size.positive?
-           last_output = []
-           last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
+           last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
            last_output_str = last_output.join
 
            search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
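Both hunks above apply the same refactor: an `each` loop pushing into a pre-built array becomes a single `map`. The equivalence in isolation, with placeholder token ids:

```ruby
tokens = [15_043, 29_892]

# Before: build the array imperatively with a mutable accumulator.
pieces = []
tokens.each { |t| pieces << t.to_s }

# After: map expresses the same transformation in one step.
pieces = tokens.map(&:to_s)
```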
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
      abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
    end
    FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
+   FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
  end
 
  abort('libstdc++ is not found.') unless have_library('stdc++')
@@ -1478,6 +1478,8 @@ public:
      rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
      rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
      rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+     rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
+     rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
      rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
      rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
      rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
      return INT2NUM(llama_token_eos(ptr->model));
  }
 
+ static VALUE _llama_model_token_cls(VALUE self) {
+     LLaMAModelWrapper* ptr = get_llama_model(self);
+     return INT2NUM(llama_token_cls(ptr->model));
+ }
+
+ static VALUE _llama_model_token_sep(VALUE self) {
+     LLaMAModelWrapper* ptr = get_llama_model(self);
+     return INT2NUM(llama_token_sep(ptr->model));
+ }
+
  static VALUE _llama_model_token_nl(VALUE self) {
      LLaMAModelWrapper* ptr = get_llama_model(self);
      return INT2NUM(llama_token_nl(ptr->model));
@@ -3414,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
      ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
      rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
+     ss_magic.str("");
+     ss_magic.clear(std::stringstream::goodbit);
+     ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
+     rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
+
      ss_magic.str("");
      ss_magic.clear(std::stringstream::goodbit);
      ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
      rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+     ss_magic.str("");
+     ss_magic.clear(std::stringstream::goodbit);
+     ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
+     rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
+
      ss_magic.str("");
      ss_magic.clear(std::stringstream::goodbit);
      ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
      rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
      rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
+     rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
  }
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.14.4'
+   VERSION = '0.14.6'
 
    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b2608'
+   LLAMA_CPP_VERSION = 'b2698'
  end
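With the bump applied, both constants are plain strings, so an upgrade can be sanity-checked at runtime:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.14.6"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2698"
```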
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
    LLAMA_CPP_VERSION: String
    LLAMA_DEFALUT_SEED: String
 
+   LLAMA_FILE_MAGIC_GGLA: String
+   LLAMA_FILE_MAGIC_GGSN: String
+   LLAMA_FILE_MAGIC_GGSQ: String
+   LLAMA_SESSION_MAGIC: String
+   LLAMA_SESSION_VERSION: String
+   LLAMA_STATE_SEQ_MAGIC: String
+   LLAMA_STATE_SEQ_VERSION: String
+
    LLAMA_VOCAB_TYPE_NONE: Integer
    LLAMA_VOCAB_TYPE_SPM: Integer
    LLAMA_VOCAB_TYPE_BPE: Integer
@@ -124,6 +132,8 @@ module LLaMACpp
    def type: (Integer) -> Integer
    def token_bos: () -> Integer
    def token_eos: () -> Integer
+   def token_cls: () -> Integer
+   def token_sep: () -> Integer
    def token_nl: () -> Integer
    def add_bos_token?: () -> bool
    def add_eos_token?: () -> bool
@@ -1,6 +1,6 @@
  MIT License
 
- Copyright (c) 2023 Georgi Gerganov
+ Copyright (c) 2023-2024 The ggml authors
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -1,7 +1,7 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
      main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-     simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+     simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
      retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
  # Binaries only useful for tests
@@ -10,7 +10,7 @@ TEST_TARGETS = \
      tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
      tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
      tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-     tests/test-json-schema-to-grammar
+     tests/test-json-schema-to-grammar tests/test-grammar-integration
 
  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -386,6 +386,15 @@ ifdef LLAMA_OPENBLAS
      MK_LDFLAGS += $(shell pkg-config --libs openblas)
  endif # LLAMA_OPENBLAS
 
+ # TODO: temporary disable until MoE is fixed
+ # https://github.com/ggerganov/llama.cpp/pull/6716
+ LLAMA_NO_LLAMAFILE := 1
+
+ ifndef LLAMA_NO_LLAMAFILE
+     MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+     OBJS += sgemm.o
+ endif
+
  ifdef LLAMA_BLIS
      MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
      MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -482,11 +491,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
      $(NVCC_COMPILE)
-
  endif # LLAMA_CUDA
 
  ifdef LLAMA_CLBLAST
-
      MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
      MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
      MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -605,6 +612,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
      $(CC) $(CFLAGS) -c $< -o $@
  endif # LLAMA_MPI
 
+ ifndef LLAMA_NO_LLAMAFILE
+ sgemm.o: sgemm.cpp sgemm.h ggml.h
+     $(CXX) $(CXXFLAGS) -c $< -o $@
+ endif
+
  GF_CC := $(CC)
  include scripts/get-flags.mk
 
@@ -648,7 +660,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH
- $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
  endif # CUDA_POWER_ARCH
  endif # CUDA_DOCKER_ARCH
  endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -690,7 +702,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml
      $(CXX) $(CXXFLAGS) -c $< -o $@
 
  COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
- COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
+ COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
  common.o: common/common.cpp $(COMMON_H_DEPS)
      $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -724,7 +736,7 @@ lib: llama.o ggml.o $(OBJS)
      ar rcs libllama.a $^
 
  clean:
-     rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+     rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
      rm -vrf ggml-cuda/*.o
 
  #
@@ -761,7 +773,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
- batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
+ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -793,7 +805,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
@@ -805,6 +817,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+ eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -923,6 +939,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+ tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
      $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
      $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -137,7 +137,7 @@ extern "C" {
  /*
    Example usage:
 
-   // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+   // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
    // preferrably to run on the same backend as the buffer
    ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
@@ -1225,13 +1225,13 @@ static void ggml_cuda_op_mul_mat_cublas(
 
      // the main device has a larger memory buffer to hold the results from all GPUs
      // ldc == nrows of the matrix that cuBLAS writes into
-     int ldc = id == ctx.device ? ne0 : row_diff;
+     int64_t ldc = id == ctx.device ? ne0 : row_diff;
 
      const int compute_capability = ggml_cuda_info().devices[id].cc;
 
      if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
          // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
+         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
          if (src0->type != GGML_TYPE_F16) {
              const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
              GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
          }
          const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
 
-         ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
+         ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
          if (src1->type != GGML_TYPE_F16) {
              const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
              GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
              to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
          }
          const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
-         ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
+         ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
 
          const half alpha_f16 = 1.0f;
          const half beta_f16 = 0.0f;
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
      const int64_t ne0 = dst->ne[0];
      const int64_t ne1 = dst->ne[1];
 
-     const int nb2 = dst->nb[2];
-     const int nb3 = dst->nb[3];
+     const int64_t nb2 = dst->nb[2];
+     const int64_t nb3 = dst->nb[3];
 
      GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
      GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
      } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
          // KQV single-batch
          ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
-     } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
          // KQ + KQV multi-batch
          ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
      } else if (use_dequantize_mul_mat_vec) {
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
      }
  }
 
+ struct mmid_row_mapping {
+     int32_t i1;
+     int32_t i2;
+ };
+
+ static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
+         int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
+         const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+         int64_t ne11, int64_t ne10,
+         size_t nb11, size_t nb12) {
+     int32_t iid1 = blockIdx.x;
+     int32_t id = blockIdx.y;
+
+     const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+     if (row_id_i != i02) {
+         return;
+     }
+
+     const int64_t i11 = id % ne11;
+     const int64_t i12 = iid1;
+
+     __shared__ int src1_row;
+     if (threadIdx.x == 0) {
+         src1_row = atomicAdd(cur_src1_row, 1);
+         row_mapping[src1_row] = {id, iid1};
+     }
+     __syncthreads();
+
+     const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+     float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+     for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
+         src1_row_contiguous[i] = src1_row_original[i];
+     }
+ }
+
+ static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
+         const mmid_row_mapping * __restrict__ row_mapping,
+         int64_t ne0,
+         size_t nb1, size_t nb2) {
+     int32_t i = blockIdx.x;
+
+     const int32_t i1 = row_mapping[i].i1;
+     const int32_t i2 = row_mapping[i].i2;
+
+     const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+     float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+     for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
+         dst_row_original[j] = dst_row_contiguous[j];
+     }
+ }
+
  static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];
      const ggml_tensor * ids = dst->src[2];
 
+     GGML_TENSOR_BINARY_OP_LOCALS
+
      GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
 
      cudaStream_t stream = ctx.stream();
 
-     const size_t nb11 = src1->nb[1];
-     const size_t nb1 = dst->nb[1];
-
-     const int32_t id = ((int32_t *) dst->op_params)[0];
-     const int32_t n_as = src0->ne[2];
+     const int64_t n_as = ne02;
+     const int64_t n_ids = ids->ne[0];
 
      std::vector<char> ids_host(ggml_nbytes(ids));
      const char * ids_dev = (const char *) ids->data;
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
      ggml_tensor src0_row = *src0;
      ggml_tensor src1_row = *src1;
-     ggml_tensor dst_row = *dst;
+     ggml_tensor dst_row = *dst;
 
      char * src0_original = (char *) src0->data;
      char * src1_original = (char *) src1->data;
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
      src0_row.ne[2] = 1;
      src0_row.ne[3] = 1;
-     src0_row.nb[3] = src0->nb[2];
+     src0_row.nb[3] = nb02;
 
-     if (src1->ne[1] == 1) {
-         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-             const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+     src1_row.ne[1] = 1;
+     src1_row.ne[2] = 1;
+     src1_row.ne[3] = 1;
+     src1_row.nb[2] = nb11;
+     src1_row.nb[3] = nb11;
 
-             GGML_ASSERT(row_id >= 0 && row_id < n_as);
+     dst_row.ne[1] = 1;
+     dst_row.ne[2] = 1;
+     dst_row.ne[3] = 1;
+     dst_row.nb[2] = nb1;
+     dst_row.nb[3] = nb1;
 
-             src0_row.data = src0_original + row_id*src0->nb[2];
-             src1_row.data = src1_original + i01*src1->nb[1];
-             dst_row.data = dst_original + i01*dst->nb[1];
+     if (ne12 == 1) {
+         for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+             for (int64_t id = 0; id < n_ids; id++) {
+                 const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
-             ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+                 GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                 const int64_t i11 = id % ne11;
+                 const int64_t i12 = iid1;
+
+                 const int64_t i1 = id;
+                 const int64_t i2 = i12;
+
+                 src0_row.data = src0_original + i02*nb02;
+                 src1_row.data = src1_original + i11*nb11 + i12*nb12;
+                 dst_row.data = dst_original + i1*nb1 + i2*nb2;
+
+                 ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+             }
          }
      } else {
          ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
          src1_row.data = src1_contiguous.get();
          dst_row.data = dst_contiguous.get();
 
-         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+         for (int64_t i02 = 0; i02 < n_as; i02++) {
              int64_t num_src1_rows = 0;
-             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                 const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
-                 if (row_id_i != row_id) {
-                     continue;
-                 }
+             for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                 for (int64_t id = 0; id < n_ids; id++) {
+                     const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                     GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
 
-                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
+                     if (row_id_i != i02) {
+                         continue;
+                     }
 
-                 CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
-                     nb11, cudaMemcpyDeviceToDevice, stream));
-                 num_src1_rows++;
+                     num_src1_rows++;
+                 }
              }
 
              if (num_src1_rows == 0) {
                  continue;
              }
 
-             src0_row.data = src0_original + row_id*src0->nb[2];
+             ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+             ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+             CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
 
-             src1_row.ne[1] = num_src1_rows;
-             dst_row.ne[1] = num_src1_rows;
+             {
+                 dim3 block_dims(std::min((unsigned int)ne10, 768u));
+                 dim3 grid_dims(ids->ne[1], n_ids);
+                 k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                         src1_original, src1_contiguous.get(),
+                         dev_cur_src1_row.get(), dev_row_mapping.get(),
+                         ids_dev, i02, ids->nb[1], ids->nb[0],
+                         ne11, ne10,
+                         nb11, nb12);
+                 CUDA_CHECK(cudaGetLastError());
+             }
 
+             src0_row.data = src0_original + i02*nb02;
+
+             GGML_ASSERT(nb11 == sizeof(float)*ne10);
+             GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+             src1_row.ne[1] = num_src1_rows;
              src1_row.nb[1] = nb11;
              src1_row.nb[2] = num_src1_rows*nb11;
              src1_row.nb[3] = num_src1_rows*nb11;
 
+             dst_row.ne[1] = num_src1_rows;
              dst_row.nb[1] = nb1;
              dst_row.nb[2] = num_src1_rows*nb1;
              dst_row.nb[3] = num_src1_rows*nb1;
 
              ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
 
-             num_src1_rows = 0;
-             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                 const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                 if (row_id_i != row_id) {
-                     continue;
-                 }
-
-                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                 CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
-                     nb1, cudaMemcpyDeviceToDevice, stream));
-                 num_src1_rows++;
+             {
+                 dim3 block_dims(std::min((unsigned int)ne0, 768u));
+                 dim3 grid_dims(num_src1_rows);
+                 k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                         dst_original, dst_contiguous.get(),
+                         dev_row_mapping.get(),
+                         ne0,
+                         nb1, nb2);
+                 CUDA_CHECK(cudaGetLastError());
+             }
          }
      }
  }
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
      const int min_batch_size = 32;
 
-     return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+     return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+            (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 
      GGML_UNUSED(backend);
  }
@@ -2617,6 +2706,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
          return false;
      }
 
+ #if CUDART_VERSION >= 11100
      cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
      if (err != cudaSuccess) {
          // clear the error
@@ -2627,6 +2717,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
          return false;
      }
      return true;
+ #else
+     return false;
+ #endif
  }
 
  GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
@@ -88,7 +88,7 @@ typedef uint16_t ggml_fp16_internal_t;
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
  #if !defined(__riscv)
  #include <immintrin.h>
  #endif