llama_cpp 0.14.7 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-metadata.gz:
-data.tar.gz:
+metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
+data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
 SHA512:
-metadata.gz:
-data.tar.gz:
+metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
+data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
+## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
+
+- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
+- Bump llama.cpp from b2740 to b2781.
+- Change `LLAMA_SESSION_VERSION` value from 5 to 6.
+- Add constants for pre-tokenization types.
+- Add `flash_attn` accessor to `ContextParams`.
+- Add `check_tensors` accessor to `ModelParams`.
+- Add `LLAMA_KV_OVERRIDE_TYPE_STR` constant.
+
+**Breaking Change**
+
+- Change method names in `ModelKVOverride`.
+
 ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
 
 - Bump llama.cpp from b2698 to b2740.
data/README.md
CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
 make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
 make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
-make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
+make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
+make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -708,9 +708,10 @@ public:
 rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
 rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
 rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
-rb_define_method(rb_cLLaMAModelKVOverride, "
-rb_define_method(rb_cLLaMAModelKVOverride, "
-rb_define_method(rb_cLLaMAModelKVOverride, "
+rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
+rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
+rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
+rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
 }
 
 static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
 return INT2NUM(ptr->tag);
 }
 
-static VALUE
+static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
 llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-return INT2NUM(ptr->
+return INT2NUM(ptr->val_i64);
 }
 
-static VALUE
+static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
 llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-return DBL2NUM(ptr->
+return DBL2NUM(ptr->val_f64);
 }
 
-static VALUE
+static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
 llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-return ptr->
+return ptr->val_bool ? Qtrue : Qfalse;
+}
+
+static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
+llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+return rb_utf8_str_new_cstr(ptr->val_str);
 }
 };
 
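The renamed accessors follow the union field names in llama.cpp's `llama_model_kv_override` struct (`val_i64`, `val_f64`, `val_bool`, `val_str`). A minimal sketch of how the new names might be used, dispatching on `tag` with the `LLAMA_KV_OVERRIDE_TYPE_*` constants defined later in this diff; the `kv_override_value` helper and the provenance of `override` are illustrative assumptions, not code from this release:

```ruby
require 'llama_cpp'

# Hypothetical helper: map a ModelKVOverride to a plain Ruby value using
# the renamed val_* accessors. The override object is assumed to come from
# model params elsewhere in the API.
def kv_override_value(override)
  case override.tag
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT   then override.val_i64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_FLOAT then override.val_f64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_BOOL  then override.val_bool
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_STR   then override.val_str
  end
end
```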
@@ -800,6 +806,8 @@ public:
 rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
 rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
 rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
+rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
+rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
 }
 
 private:
@@ -892,6 +900,18 @@ private:
 LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
 return ptr->params.use_mlock ? Qtrue : Qfalse;
 }
+
+// check_tensors
+static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
+LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
+return ptr->params.check_tensors ? Qtrue : Qfalse;
+}
+
+static VALUE _llama_model_params_get_check_tensors(VALUE self) {
+LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+return ptr->params.check_tensors ? Qtrue : Qfalse;
+}
 };
 
 const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
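On the Ruby side, `check_tensors` surfaces as a plain boolean accessor pair on `ModelParams`, asking llama.cpp to validate tensor data while loading. A minimal sketch, assuming the `Model` constructor keywords (`model_path:`, `params:`) from the gem's README and a placeholder model path:

```ruby
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model_params.check_tensors = true # validate tensor data on load (defaults to false upstream)
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)
```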
@@ -984,6 +1004,8 @@ public:
 rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
 rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
 rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
+rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
 }
 
 private:
@@ -1262,6 +1284,18 @@ private:
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
 return ptr->params.offload_kqv ? Qtrue : Qfalse;
 }
+
+// flash_attn
+static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
+LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
+return ptr->params.flash_attn ? Qtrue : Qfalse;
+}
+
+static VALUE _llama_context_params_get_flash_attn(VALUE self) {
+LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+return ptr->params.flash_attn ? Qtrue : Qfalse;
+}
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
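`flash_attn` is likewise a boolean on `ContextParams`; whether flash attention actually kicks in depends on the backend support llama.cpp was compiled with. A minimal sketch, assuming a `model` loaded as above and the `Context` constructor keywords (`model:`, `params:`) from the gem's README:

```ruby
require 'llama_cpp'

context_params = LLaMACpp::ContextParams.new
context_params.flash_attn = true # request flash attention kernels
context = LLaMACpp::Context.new(model: model, params: context_params)
```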
@@ -3386,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
 rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
 
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
+rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+
 rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
 rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
 rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
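The new pre-tokenization type constants are exposed as plain Integers on the `LLaMACpp` module, mirroring llama.cpp's `llama_vocab_pre_type` enum. A quick sketch listing them at runtime:

```ruby
require 'llama_cpp'

# Print each LLAMA_VOCAB_PRE_TYPE_* constant and its integer value.
%w[DEFAULT LLAMA3 DEEPSEEK_LLM DEEPSEEK_CODER FALCON MPT STARCODER GPT2].each do |name|
  const = "LLAMA_VOCAB_PRE_TYPE_#{name}"
  puts format('%-36s => %d', const, LLaMACpp.const_get(const))
end
```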
@@ -3428,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
 rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
 rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
+rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
 
 rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
 rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.
+VERSION = '0.15.0'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = '
+LLAMA_CPP_VERSION = 'b2781'
 end
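Both constants are inspectable at runtime, which is handy when reporting issues against a specific bundled llama.cpp build:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.15.0"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2781"
```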
data/sig/llama_cpp.rbs
CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
 LLAMA_VOCAB_TYPE_BPE: Integer
 LLAMA_VOCAB_TYPE_WPM: Integer
 
+LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
+LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
+LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
+LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
+LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
+LLAMA_VOCAB_PRE_TYPE_MPT: Integer
+LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
+LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+
 LLAMA_FTYPE_ALL_F32: Integer
 LLAMA_FTYPE_MOSTLY_F16: Integer
 LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -48,6 +57,7 @@ module LLaMACpp
 LLAMA_KV_OVERRIDE_TYPE_INT: Integer
 LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
 LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
+LLAMA_KV_OVERRIDE_TYPE_STR: Integer
 
 LLAMA_GRETYPE_END: Integer
 LLAMA_GRETYPE_ALT: Integer
@@ -163,9 +173,10 @@ module LLaMACpp
 
 def key: () -> String
 def tag: () -> Integer
-def
-def
-def
+def val_i64: () -> Integer
+def val_f64: () -> Float
+def val_bool: () -> bool
+def val_str: () -> String
 end
 
 class ModelParams
@@ -184,6 +195,8 @@ module LLaMACpp
 def use_mmap=: (bool) -> bool
 def use_mlock: () -> bool
 def use_mlock=: (bool) -> bool
+def check_tensors: () -> bool
+def check_tensors=: (bool) -> bool
 end
 
 class Batch
@@ -311,6 +324,8 @@ module LLaMACpp
 def embeddings=: (bool) -> bool
 def offload_kqv: () -> bool
 def offload_kqv=: (bool) -> bool
+def flash_attn: () -> bool
+def flash_attn=: (bool) -> bool
 end
 
 class ModelQuantizeParams
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-tests/test-
-tests/test-
-tests/test-
-tests/test-
-tests/test-
+tests/test-autorelease \
+tests/test-backend-ops \
+tests/test-double-float \
+tests/test-grad0 \
+tests/test-grammar-integration \
+tests/test-grammar-parser \
+tests/test-json-schema-to-grammar \
+tests/test-llama-grammar \
+tests/test-model-load-cancel \
+tests/test-opt \
+tests/test-quantize-fns \
+tests/test-quantize-perf \
+tests/test-rope \
+tests/test-sampling \
+tests/test-tokenizer-0 \
+tests/test-tokenizer-1-bpe \
+tests/test-tokenizer-1-spm
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 @failures=0; \
 for test_target in $(TEST_TARGETS); do \
-if [ "$$test_target" = "tests/test-tokenizer-0
-./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-
+if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-
+./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 continue; \
 elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 continue; \
@@ -773,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -976,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -988,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 // reset state for the next run
-
-
-
-
+if (!sched->is_reset) {
+size_t hash_size = sched->hash_set.size;
+memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-
+sched->is_reset = true;
+}
 sched->is_alloc = false;
 }
 
data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED
@@ -14,6 +14,7 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/dmmv.cuh"
+#include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmq.cuh"
@@ -140,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 info.devices[id].smpb = prop.sharedMemPerBlock;
+info.devices[id].nsm = prop.multiProcessorCount;
 }
 
 for (int id = 0; id < info.device_count; ++id) {
@@ -2290,6 +2292,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 case GGML_OP_ARGSORT:
 ggml_cuda_op_argsort(ctx, dst);
 break;
+case GGML_OP_FLASH_ATTN_EXT:
+ggml_cuda_flash_attn_ext(ctx, dst);
+break;
 default:
 return false;
 }
@@ -2564,6 +2569,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
 case GGML_OP_ARANGE:
 case GGML_OP_TIMESTEP_EMBEDDING:
 case GGML_OP_LEAKY_RELU:
+case GGML_OP_FLASH_ATTN_EXT:
 return true;
 default:
 return false;
data/vendor/tmp/llama.cpp/ggml-impl.h
CHANGED
@@ -313,7 +313,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 
 #endif // defined(__ARM_NEON)
 
-#if defined(__ARM_NEON) && !defined(
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
 
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
data/vendor/tmp/llama.cpp/ggml-kompute.cpp
CHANGED
@@ -1427,6 +1427,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 for (int i = node_start; i < node_end; ++i) {
 struct ggml_tensor * src0 = gf->nodes[i]->src[0];
 struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
 struct ggml_tensor * dst = gf->nodes[i];
 GGML_ASSERT(dst->data != nullptr);
 
@@ -1559,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 {
 float scale;
 memcpy(&scale, dst->op_params, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
+GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
+GGML_ASSERT(src2 == nullptr);
+
 ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
 } break;
 case GGML_OP_DIAG_MASK_INF: