llama_cpp 0.14.7 → 0.15.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
- data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
+ metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
+ data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
  SHA512:
- metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
- data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
+ metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
+ data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
+ ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
+
+ - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
+ - Bump llama.cpp from b2740 to b2781.
+ - Change `LLAMA_SESSION_VERSION` value from 5 to 6.
+ - Add constants for pre-tokenization types.
+ - Add `flash_attn` accessor to `ContextParams`.
+ - Add `check_tensors` accessor to `ModelParams`.
+ - Add `LLAMA_KV_OVERRIDE_TYPE_STR` constant.
+
+ **Breaking Change**
+ - Change method names in `ModelKVOverride`.
+
  ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27

  - Bump llama.cpp from b2698 to b2740.
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
  # use OpenBLAS
  $ gem install llama_cpp -- --with-openblas

- # use cuBLAS
- $ gem install llama_cpp -- --with-cublas
+ # use CUDA
+ $ gem install llama_cpp -- --with-cuda
  ```

  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by the with_config method.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
  make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
  make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
  make_envs << ' LLAMA_BLIS=1' if with_config('blis')
- make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
+ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
+ make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
  make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_MPI=1' if with_config('mpi')
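For context, everything after `--` in `gem install llama_cpp -- --with-cuda` is forwarded to extconf.rb, where mkmf turns `--with-cuda` into a truthy `with_config('cuda')`. A minimal standalone sketch of that mapping (not part of the gem; `demo` is a placeholder extension name):

```ruby
# demo_extconf.rb — run as `ruby demo_extconf.rb --with-cuda` to see the flag mapping.
require 'mkmf'

make_envs = +''
make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # deprecated spelling, still accepted
make_envs << ' LLAMA_CUDA=1'   if with_config('cuda')   # new in 0.15.0

puts "make will be invoked with:#{make_envs}"
create_makefile('demo') # a real extconf.rb generates the extension Makefile here
```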
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -708,9 +708,10 @@ public:
  rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
  rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
  rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
- rb_define_method(rb_cLLaMAModelKVOverride, "int_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_int_value), 0);
- rb_define_method(rb_cLLaMAModelKVOverride, "float_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_float_value), 0);
- rb_define_method(rb_cLLaMAModelKVOverride, "bool_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_bool_value), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
  }

  static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
  return INT2NUM(ptr->tag);
  }

- static VALUE _llama_model_kv_override_get_int_value(VALUE self) {
+ static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
- return INT2NUM(ptr->int_value);
+ return INT2NUM(ptr->val_i64);
  }

- static VALUE _llama_model_kv_override_get_float_value(VALUE self) {
+ static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
- return DBL2NUM(ptr->float_value);
+ return DBL2NUM(ptr->val_f64);
  }

- static VALUE _llama_model_kv_override_get_bool_value(VALUE self) {
+ static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
- return ptr->bool_value ? Qtrue : Qfalse;
+ return ptr->val_bool ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
+ llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+ return rb_utf8_str_new_cstr(ptr->val_str);
  }
  };

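This is the breaking change from the changelog: the readers now mirror llama.cpp's `llama_model_kv_override` field names. A minimal sketch of the renamed methods, assuming `override` is a `LLaMACpp::ModelKVOverride` obtained elsewhere:

```ruby
# Sketch: dispatch on the override's tag with the 0.15.0 method names.
require 'llama_cpp'

def dump_override(override)
  value =
    case override.tag
    when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT   then override.val_i64  # was int_value
    when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_FLOAT then override.val_f64  # was float_value
    when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_BOOL  then override.val_bool # was bool_value
    when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_STR   then override.val_str  # new in 0.15.0
    end
  puts "#{override.key}: #{value}"
end
```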
@@ -800,6 +806,8 @@ public:
  rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
  rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
  rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
+ rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
+ rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
  }

  private:
@@ -892,6 +900,18 @@ private:
  LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
  return ptr->params.use_mlock ? Qtrue : Qfalse;
  }
+
+ // check_tensors
+ static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+ ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
+ return ptr->params.check_tensors ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_params_get_check_tensors(VALUE self) {
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+ return ptr->params.check_tensors ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
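A minimal usage sketch for the new flag (the model path is a placeholder, and the `Model` constructor keywords follow the gem's README):

```ruby
# Sketch: validate tensor data while loading a model.
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model_params.check_tensors = true # new in 0.15.0
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)
```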
@@ -984,6 +1004,8 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+ rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
+ rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
  }

  private:
@@ -1262,6 +1284,18 @@ private:
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return ptr->params.offload_kqv ? Qtrue : Qfalse;
  }
+
+ // flash_attn
+ static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
+ return ptr->params.flash_attn ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_context_params_get_flash_attn(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return ptr->params.flash_attn ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
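A matching sketch for the context-level flag, assuming `model` was loaded as in the previous example:

```ruby
# Sketch: request flash attention kernels for a context.
context_params = LLaMACpp::ContextParams.new
context_params.flash_attn = true # new in 0.15.0
context = LLaMACpp::Context.new(model: model, params: context_params)
```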
@@ -3386,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));

+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
@@ -3428,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));

  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.7'
+ VERSION = '0.15.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2740'
+ LLAMA_CPP_VERSION = 'b2781'
  end
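After upgrading, both version constants can be checked from IRB:

```ruby
# Quick sanity check after installing 0.15.0.
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.15.0"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2781"
```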
data/sig/llama_cpp.rbs CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
  LLAMA_VOCAB_TYPE_BPE: Integer
  LLAMA_VOCAB_TYPE_WPM: Integer

+ LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
+ LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
+ LLAMA_VOCAB_PRE_TYPE_MPT: Integer
+ LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
+ LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -48,6 +57,7 @@ module LLaMACpp
  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
  LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
+ LLAMA_KV_OVERRIDE_TYPE_STR: Integer

  LLAMA_GRETYPE_END: Integer
  LLAMA_GRETYPE_ALT: Integer
@@ -163,9 +173,10 @@ module LLaMACpp

  def key: () -> String
  def tag: () -> Integer
- def int_value: () -> Integer
- def float_value: () -> Float
- def bool_value: () -> bool
+ def val_i64: () -> Integer
+ def val_f64: () -> Float
+ def val_bool: () -> bool
+ def val_str: () -> String
  end

  class ModelParams
@@ -184,6 +195,8 @@ module LLaMACpp
  def use_mmap=: (bool) -> bool
  def use_mlock: () -> bool
  def use_mlock=: (bool) -> bool
+ def check_tensors: () -> bool
+ def check_tensors=: (bool) -> bool
  end

  class Batch
@@ -311,6 +324,8 @@ module LLaMACpp
  def embeddings=: (bool) -> bool
  def offload_kqv: () -> bool
  def offload_kqv=: (bool) -> bool
+ def flash_attn: () -> bool
+ def flash_attn=: (bool) -> bool
  end

  class ModelQuantizeParams
@@ -6,11 +6,23 @@ BUILD_TARGETS = \

  # Binaries only useful for tests
  TEST_TARGETS = \
- tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
- tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
- tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
- tests/test-json-schema-to-grammar tests/test-grammar-integration
+ tests/test-autorelease \
+ tests/test-backend-ops \
+ tests/test-double-float \
+ tests/test-grad0 \
+ tests/test-grammar-integration \
+ tests/test-grammar-parser \
+ tests/test-json-schema-to-grammar \
+ tests/test-llama-grammar \
+ tests/test-model-load-cancel \
+ tests/test-opt \
+ tests/test-quantize-fns \
+ tests/test-quantize-perf \
+ tests/test-rope \
+ tests/test-sampling \
+ tests/test-tokenizer-0 \
+ tests/test-tokenizer-1-bpe \
+ tests/test-tokenizer-1-spm

  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
  UNAME_M := $(shell uname -m)
  endif

+ # In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+ # of non-gcc compilers don't have to provide g++ alias or wrapper.
+ DEFCC := cc
+ DEFCXX := c++
+ ifeq ($(origin CC),default)
+ CC := $(DEFCC)
+ endif
+ ifeq ($(origin CXX),default)
+ CXX := $(DEFCXX)
+ endif
+
  # Mac OS + Arm can report x86_64
  # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
  ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
  test: $(TEST_TARGETS)
  @failures=0; \
  for test_target in $(TEST_TARGETS); do \
- if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
- ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
- elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
- elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
  continue; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
  continue; \
@@ -773,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+ quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -976,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
- tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+ tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -988,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+ tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {

  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
  // reset state for the next run
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+ if (!sched->is_reset) {
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

- sched->is_reset = true;
+ sched->is_reset = true;
+ }
  sched->is_alloc = false;
  }

@@ -14,6 +14,7 @@
  #include "ggml-cuda/cpy.cuh"
  #include "ggml-cuda/diagmask.cuh"
  #include "ggml-cuda/dmmv.cuh"
+ #include "ggml-cuda/fattn.cuh"
  #include "ggml-cuda/getrows.cuh"
  #include "ggml-cuda/im2col.cuh"
  #include "ggml-cuda/mmq.cuh"
@@ -140,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  info.devices[id].smpb = prop.sharedMemPerBlock;
+ info.devices[id].nsm = prop.multiProcessorCount;
  }

  for (int id = 0; id < info.device_count; ++id) {
@@ -2290,6 +2292,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  case GGML_OP_ARGSORT:
  ggml_cuda_op_argsort(ctx, dst);
  break;
+ case GGML_OP_FLASH_ATTN_EXT:
+ ggml_cuda_flash_attn_ext(ctx, dst);
+ break;
  default:
  return false;
  }
@@ -2564,6 +2569,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  case GGML_OP_ARANGE:
  case GGML_OP_TIMESTEP_EMBEDDING:
  case GGML_OP_LEAKY_RELU:
+ case GGML_OP_FLASH_ATTN_EXT:
  return true;
  default:
  return false;
@@ -313,7 +313,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

  #endif // defined(__ARM_NEON)

- #if defined(__ARM_NEON) && !defined(__MSC_VER)
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)

  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
@@ -1427,6 +1427,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  for (int i = node_start; i < node_end; ++i) {
  struct ggml_tensor * src0 = gf->nodes[i]->src[0];
  struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+ struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
  struct ggml_tensor * dst = gf->nodes[i];
  GGML_ASSERT(dst->data != nullptr);

@@ -1559,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  {
  float scale;
  memcpy(&scale, dst->op_params, sizeof(float));
+
+ #pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
+ GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
+ GGML_ASSERT(src2 == nullptr);
+
  ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
  } break;
  case GGML_OP_DIAG_MASK_INF: