llama_cpp 0.14.7 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
4
- data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
3
+ metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
4
+ data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
5
5
  SHA512:
6
- metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
7
- data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
6
+ metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
7
+ data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
1
+ ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
2
+
3
+ - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
4
+ - Bump llama.cpp from b2740 to b2781.
5
+ - Change `LLAMA_SESSION_VERSION` value from 5 to 6.
6
+ - Add constants for pre-tokenization types.
7
+ - Add `flash_attn` accessor to `ContextParams`.
8
+ - Add `check_tensors` accessor to `ModelParams`.
9
+ - Add LLAMA_KV_OVERRIDE_TYPE_STR constant.
10
+
11
+ **Breaking Change**
12
+ - Change method names in `ModelKVOverride`.
13
+
1
14
  ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
2
15
 
3
16
  - Bump llama.cpp from b2698 to b2740.
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
28
28
  # use OpenBLAS
29
29
  $ gem install llama_cpp -- --with-openblas
30
30
 
31
- # use cuBLAS
32
- $ gem install llama_cpp -- --with-cublas
31
+ # use CUDA
32
+ $ gem install llama_cpp -- --with-cuda
33
33
  ```
34
34
 
35
35
  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
15
15
  make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
16
16
  make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
17
17
  make_envs << ' LLAMA_BLIS=1' if with_config('blis')
18
- make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
18
+ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
19
+ make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
19
20
  make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
20
21
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
21
22
  make_envs << ' LLAMA_MPI=1' if with_config('mpi')
@@ -708,9 +708,10 @@ public:
708
708
  rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
709
709
  rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
710
710
  rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
711
- rb_define_method(rb_cLLaMAModelKVOverride, "int_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_int_value), 0);
712
- rb_define_method(rb_cLLaMAModelKVOverride, "float_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_float_value), 0);
713
- rb_define_method(rb_cLLaMAModelKVOverride, "bool_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_bool_value), 0);
711
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
712
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
713
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
714
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
714
715
  }
715
716
 
716
717
  static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
726
727
  return INT2NUM(ptr->tag);
727
728
  }
728
729
 
729
- static VALUE _llama_model_kv_override_get_int_value(VALUE self) {
730
+ static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
730
731
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
731
- return INT2NUM(ptr->int_value);
732
+ return INT2NUM(ptr->val_i64);
732
733
  }
733
734
 
734
- static VALUE _llama_model_kv_override_get_float_value(VALUE self) {
735
+ static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
735
736
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
736
- return DBL2NUM(ptr->float_value);
737
+ return DBL2NUM(ptr->val_f64);
737
738
  }
738
739
 
739
- static VALUE _llama_model_kv_override_get_bool_value(VALUE self) {
740
+ static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
740
741
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
741
- return ptr->bool_value ? Qtrue : Qfalse;
742
+ return ptr->val_bool ? Qtrue : Qfalse;
743
+ }
744
+
745
+ static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
746
+ llama_model_kv_override* ptr = get_llama_model_kv_override(self);
747
+ return rb_utf8_str_new_cstr(ptr->val_str);
742
748
  }
743
749
  };
744
750
 
@@ -800,6 +806,8 @@ public:
800
806
  rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
801
807
  rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
802
808
  rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
809
+ rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
810
+ rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
803
811
  }
804
812
 
805
813
  private:
@@ -892,6 +900,18 @@ private:
892
900
  LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
893
901
  return ptr->params.use_mlock ? Qtrue : Qfalse;
894
902
  }
903
+
904
+ // check_tensors
905
+ static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
906
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
907
+ ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
908
+ return ptr->params.check_tensors ? Qtrue : Qfalse;
909
+ }
910
+
911
+ static VALUE _llama_model_params_get_check_tensors(VALUE self) {
912
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
913
+ return ptr->params.check_tensors ? Qtrue : Qfalse;
914
+ }
895
915
  };
896
916
 
897
917
  const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
@@ -984,6 +1004,8 @@ public:
984
1004
  rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
985
1005
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
986
1006
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
1007
+ rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
1008
+ rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
987
1009
  }
988
1010
 
989
1011
  private:
@@ -1262,6 +1284,18 @@ private:
1262
1284
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
1263
1285
  return ptr->params.offload_kqv ? Qtrue : Qfalse;
1264
1286
  }
1287
+
1288
+ // flash_attn
1289
+ static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
1290
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
1291
+ ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
1292
+ return ptr->params.flash_attn ? Qtrue : Qfalse;
1293
+ }
1294
+
1295
+ static VALUE _llama_context_params_get_flash_attn(VALUE self) {
1296
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
1297
+ return ptr->params.flash_attn ? Qtrue : Qfalse;
1298
+ }
1265
1299
  };
1266
1300
 
1267
1301
  const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -3386,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
3386
3420
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
3387
3421
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
3388
3422
 
3423
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
3424
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
3425
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
3426
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
3427
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
3428
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
3429
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
3430
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
3431
+
3389
3432
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
3390
3433
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
3391
3434
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
@@ -3428,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
3428
3471
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
3429
3472
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
3430
3473
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
3474
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
3431
3475
 
3432
3476
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
3433
3477
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.14.7'
6
+ VERSION = '0.15.0'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b2740'
9
+ LLAMA_CPP_VERSION = 'b2781'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
16
16
  LLAMA_VOCAB_TYPE_BPE: Integer
17
17
  LLAMA_VOCAB_TYPE_WPM: Integer
18
18
 
19
+ LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
20
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
21
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
22
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
23
+ LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
24
+ LLAMA_VOCAB_PRE_TYPE_MPT: Integer
25
+ LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
26
+ LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
27
+
19
28
  LLAMA_FTYPE_ALL_F32: Integer
20
29
  LLAMA_FTYPE_MOSTLY_F16: Integer
21
30
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -48,6 +57,7 @@ module LLaMACpp
48
57
  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
49
58
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
50
59
  LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
60
+ LLAMA_KV_OVERRIDE_TYPE_STR: Integer
51
61
 
52
62
  LLAMA_GRETYPE_END: Integer
53
63
  LLAMA_GRETYPE_ALT: Integer
@@ -163,9 +173,10 @@ module LLaMACpp
163
173
 
164
174
  def key: () -> String
165
175
  def tag: () -> Integer
166
- def int_value: () -> Integer
167
- def float_value: () -> Float
168
- def bool_value: () -> bool
176
+ def val_i64: () -> Integer
177
+ def val_f64: () -> Float
178
+ def val_bool: () -> bool
179
+ def val_str: () -> String
169
180
  end
170
181
 
171
182
  class ModelParams
@@ -184,6 +195,8 @@ module LLaMACpp
184
195
  def use_mmap=: (bool) -> bool
185
196
  def use_mlock: () -> bool
186
197
  def use_mlock=: (bool) -> bool
198
+ def check_tensors: () -> bool
199
+ def check_tensors=: (bool) -> bool
187
200
  end
188
201
 
189
202
  class Batch
@@ -311,6 +324,8 @@ module LLaMACpp
311
324
  def embeddings=: (bool) -> bool
312
325
  def offload_kqv: () -> bool
313
326
  def offload_kqv=: (bool) -> bool
327
+ def flash_attn: () -> bool
328
+ def flash_attn=: (bool) -> bool
314
329
  end
315
330
 
316
331
  class ModelQuantizeParams
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
6
6
 
7
7
  # Binaries only useful for tests
8
8
  TEST_TARGETS = \
9
- tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
10
- tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
11
- tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
12
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
13
- tests/test-json-schema-to-grammar tests/test-grammar-integration
9
+ tests/test-autorelease \
10
+ tests/test-backend-ops \
11
+ tests/test-double-float \
12
+ tests/test-grad0 \
13
+ tests/test-grammar-integration \
14
+ tests/test-grammar-parser \
15
+ tests/test-json-schema-to-grammar \
16
+ tests/test-llama-grammar \
17
+ tests/test-model-load-cancel \
18
+ tests/test-opt \
19
+ tests/test-quantize-fns \
20
+ tests/test-quantize-perf \
21
+ tests/test-rope \
22
+ tests/test-sampling \
23
+ tests/test-tokenizer-0 \
24
+ tests/test-tokenizer-1-bpe \
25
+ tests/test-tokenizer-1-spm
14
26
 
15
27
  # Code coverage output files
16
28
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
27
39
  UNAME_M := $(shell uname -m)
28
40
  endif
29
41
 
42
+ # In GNU make default CXX is g++ instead of c++. Let's fix that so that users
43
+ # of non-gcc compilers don't have to provide g++ alias or wrapper.
44
+ DEFCC := cc
45
+ DEFCXX := c++
46
+ ifeq ($(origin CC),default)
47
+ CC := $(DEFCC)
48
+ endif
49
+ ifeq ($(origin CXX),default)
50
+ CXX := $(DEFCXX)
51
+ endif
52
+
30
53
  # Mac OS + Arm can report x86_64
31
54
  # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
32
55
  ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
49
72
  test: $(TEST_TARGETS)
50
73
  @failures=0; \
51
74
  for test_target in $(TEST_TARGETS); do \
52
- if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
53
- ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
54
- elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
75
+ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
76
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
77
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
78
+ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
55
79
  ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
56
- elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
80
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
81
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
82
+ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
83
+ ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
84
+ ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
85
+ elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
57
86
  continue; \
58
87
  elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
59
88
  continue; \
@@ -773,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
773
802
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
774
803
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
775
804
 
776
- quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
805
+ quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
777
806
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
778
807
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
779
808
 
@@ -976,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
976
1005
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
977
1006
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
978
1007
 
979
- tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
980
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
981
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
982
-
983
- tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
1008
+ tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
984
1009
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
985
1010
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
986
1011
 
@@ -988,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
988
1013
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
989
1014
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
990
1015
 
991
- tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
1016
+ tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
992
1017
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
993
1018
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
994
1019
 
@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1784
1784
 
1785
1785
  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1786
1786
  // reset state for the next run
1787
- size_t hash_size = sched->hash_set.size;
1788
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1789
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1790
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1787
+ if (!sched->is_reset) {
1788
+ size_t hash_size = sched->hash_set.size;
1789
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1790
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1791
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1791
1792
 
1792
- sched->is_reset = true;
1793
+ sched->is_reset = true;
1794
+ }
1793
1795
  sched->is_alloc = false;
1794
1796
  }
1795
1797
 
@@ -14,6 +14,7 @@
14
14
  #include "ggml-cuda/cpy.cuh"
15
15
  #include "ggml-cuda/diagmask.cuh"
16
16
  #include "ggml-cuda/dmmv.cuh"
17
+ #include "ggml-cuda/fattn.cuh"
17
18
  #include "ggml-cuda/getrows.cuh"
18
19
  #include "ggml-cuda/im2col.cuh"
19
20
  #include "ggml-cuda/mmq.cuh"
@@ -140,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
140
141
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
141
142
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
142
143
  info.devices[id].smpb = prop.sharedMemPerBlock;
144
+ info.devices[id].nsm = prop.multiProcessorCount;
143
145
  }
144
146
 
145
147
  for (int id = 0; id < info.device_count; ++id) {
@@ -2290,6 +2292,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2290
2292
  case GGML_OP_ARGSORT:
2291
2293
  ggml_cuda_op_argsort(ctx, dst);
2292
2294
  break;
2295
+ case GGML_OP_FLASH_ATTN_EXT:
2296
+ ggml_cuda_flash_attn_ext(ctx, dst);
2297
+ break;
2293
2298
  default:
2294
2299
  return false;
2295
2300
  }
@@ -2564,6 +2569,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
2564
2569
  case GGML_OP_ARANGE:
2565
2570
  case GGML_OP_TIMESTEP_EMBEDDING:
2566
2571
  case GGML_OP_LEAKY_RELU:
2572
+ case GGML_OP_FLASH_ATTN_EXT:
2567
2573
  return true;
2568
2574
  default:
2569
2575
  return false;
@@ -313,7 +313,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
313
313
 
314
314
  #endif // defined(__ARM_NEON)
315
315
 
316
- #if defined(__ARM_NEON) && !defined(__MSC_VER)
316
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)
317
317
 
318
318
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
319
319
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
@@ -1427,6 +1427,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1427
1427
  for (int i = node_start; i < node_end; ++i) {
1428
1428
  struct ggml_tensor * src0 = gf->nodes[i]->src[0];
1429
1429
  struct ggml_tensor * src1 = gf->nodes[i]->src[1];
1430
+ struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
1430
1431
  struct ggml_tensor * dst = gf->nodes[i];
1431
1432
  GGML_ASSERT(dst->data != nullptr);
1432
1433
 
@@ -1559,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1559
1560
  {
1560
1561
  float scale;
1561
1562
  memcpy(&scale, dst->op_params, sizeof(float));
1563
+
1564
+ #pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
1565
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
1566
+ GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
1567
+ GGML_ASSERT(src2 == nullptr);
1568
+
1562
1569
  ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
1563
1570
  } break;
1564
1571
  case GGML_OP_DIAG_MASK_INF: