llama_cpp 0.14.7 → 0.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +59 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -3
- data/vendor/tmp/llama.cpp/Makefile +42 -18
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +78 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +399 -184
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +302 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +28 -16
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +951 -263
- data/vendor/tmp/llama.cpp/ggml.c +1457 -92
- data/vendor/tmp/llama.cpp/ggml.h +37 -7
- data/vendor/tmp/llama.cpp/llama.cpp +671 -403
- data/vendor/tmp/llama.cpp/llama.h +34 -10
- data/vendor/tmp/llama.cpp/sgemm.cpp +134 -103
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1188 -656
- data/vendor/tmp/llama.cpp/unicode-data.h +4 -3
- data/vendor/tmp/llama.cpp/unicode.cpp +590 -49
- data/vendor/tmp/llama.cpp/unicode.h +6 -3
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ce6d72aeb5fb9aff775d44284bf934e164f8470973619507ef6e6eb1ac0bec4d
|
4
|
+
data.tar.gz: 7c1ae823c90f957219b3edbc20f091b65a50caa984c1a6f4d137a46c376b2f0c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d23cb6a63b7734df2547c5e61a699fa206878c747e274e004c829b77335a7cc7434e92168a55d8ab0a617b11eddb5d45d5057a91b92e848735fd9e852b2476cd
|
7
|
+
data.tar.gz: f54b09de3cc60de81be977e9706a9beb3bf28e7740a19a57f6add543fe10cd6dc4101cbbe22dd5b62870c78a1ad4d10f57dd29b7c3e3e12b950e6575cf67b0c7
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,22 @@
|
|
1
|
+
## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2781 to b2839.
|
4
|
+
- Add constants for pre-tokenization types.
|
5
|
+
- Add constant for model file type.
|
6
|
+
|
7
|
+
## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
|
8
|
+
|
9
|
+
- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
|
10
|
+
- Bump llama.cpp from b2740 to b2781.
|
11
|
+
- Change `LLAMA_SESSION_VERSION` value from 5 to 6.
|
12
|
+
- Add constants for pre-tokenization types.
|
13
|
+
- Add `flash_attn` accessor to `ContextParams`.
|
14
|
+
- Add `check_tensors` accessor to `ModelParams`.
|
15
|
+
- Add LLAMA_KV_OVERRIDE_TYPE_STR constant.
|
16
|
+
|
17
|
+
**Breaking Change**
|
18
|
+
- Change method names in `ModelKVOverride`.
|
19
|
+
|
1
20
|
## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
|
2
21
|
|
3
22
|
- Bump llama.cpp from b2698 to b2740.
|
data/README.md
CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
|
|
28
28
|
# use OpenBLAS
|
29
29
|
$ gem install llama_cpp -- --with-openblas
|
30
30
|
|
31
|
-
# use
|
32
|
-
$ gem install llama_cpp -- --with-
|
31
|
+
# use CUDA
|
32
|
+
$ gem install llama_cpp -- --with-cuda
|
33
33
|
```
|
34
34
|
|
35
35
|
Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
|
|
15
15
|
make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
|
16
16
|
make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
|
17
17
|
make_envs << ' LLAMA_BLIS=1' if with_config('blis')
|
18
|
-
make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
|
18
|
+
make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
|
19
|
+
make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
|
19
20
|
make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
|
20
21
|
make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
|
21
22
|
make_envs << ' LLAMA_MPI=1' if with_config('mpi')
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -708,9 +708,10 @@ public:
|
|
708
708
|
rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
|
709
709
|
rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
|
710
710
|
rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
|
711
|
-
rb_define_method(rb_cLLaMAModelKVOverride, "
|
712
|
-
rb_define_method(rb_cLLaMAModelKVOverride, "
|
713
|
-
rb_define_method(rb_cLLaMAModelKVOverride, "
|
711
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
|
712
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
|
713
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
|
714
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
|
714
715
|
}
|
715
716
|
|
716
717
|
static const rb_data_type_t llama_model_kv_override_type;
|
@@ -726,19 +727,24 @@ private:
|
|
726
727
|
return INT2NUM(ptr->tag);
|
727
728
|
}
|
728
729
|
|
729
|
-
static VALUE
|
730
|
+
static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
|
730
731
|
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
731
|
-
return INT2NUM(ptr->
|
732
|
+
return INT2NUM(ptr->val_i64);
|
732
733
|
}
|
733
734
|
|
734
|
-
static VALUE
|
735
|
+
static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
|
735
736
|
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
736
|
-
return DBL2NUM(ptr->
|
737
|
+
return DBL2NUM(ptr->val_f64);
|
737
738
|
}
|
738
739
|
|
739
|
-
static VALUE
|
740
|
+
static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
|
740
741
|
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
741
|
-
return ptr->
|
742
|
+
return ptr->val_bool ? Qtrue : Qfalse;
|
743
|
+
}
|
744
|
+
|
745
|
+
static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
|
746
|
+
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
747
|
+
return rb_utf8_str_new_cstr(ptr->val_str);
|
742
748
|
}
|
743
749
|
};
|
744
750
|
|
@@ -800,6 +806,8 @@ public:
|
|
800
806
|
rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
|
801
807
|
rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
|
802
808
|
rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
|
809
|
+
rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
|
810
|
+
rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
|
803
811
|
}
|
804
812
|
|
805
813
|
private:
|
@@ -892,6 +900,18 @@ private:
|
|
892
900
|
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
893
901
|
return ptr->params.use_mlock ? Qtrue : Qfalse;
|
894
902
|
}
|
903
|
+
|
904
|
+
// check_tensors
|
905
|
+
static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
|
906
|
+
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
907
|
+
ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
|
908
|
+
return ptr->params.check_tensors ? Qtrue : Qfalse;
|
909
|
+
}
|
910
|
+
|
911
|
+
static VALUE _llama_model_params_get_check_tensors(VALUE self) {
|
912
|
+
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
913
|
+
return ptr->params.check_tensors ? Qtrue : Qfalse;
|
914
|
+
}
|
895
915
|
};
|
896
916
|
|
897
917
|
const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
|
@@ -984,6 +1004,8 @@ public:
|
|
984
1004
|
rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
|
985
1005
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
|
986
1006
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
|
1007
|
+
rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
|
1008
|
+
rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
|
987
1009
|
}
|
988
1010
|
|
989
1011
|
private:
|
@@ -1262,6 +1284,18 @@ private:
|
|
1262
1284
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1263
1285
|
return ptr->params.offload_kqv ? Qtrue : Qfalse;
|
1264
1286
|
}
|
1287
|
+
|
1288
|
+
// flash_attn
|
1289
|
+
static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
|
1290
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1291
|
+
ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
|
1292
|
+
return ptr->params.flash_attn ? Qtrue : Qfalse;
|
1293
|
+
}
|
1294
|
+
|
1295
|
+
static VALUE _llama_context_params_get_flash_attn(VALUE self) {
|
1296
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1297
|
+
return ptr->params.flash_attn ? Qtrue : Qfalse;
|
1298
|
+
}
|
1265
1299
|
};
|
1266
1300
|
|
1267
1301
|
const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
|
@@ -3386,6 +3420,20 @@ extern "C" void Init_llama_cpp(void) {
|
|
3386
3420
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
|
3387
3421
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
|
3388
3422
|
|
3423
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
|
3424
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
|
3425
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
|
3426
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
|
3427
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
|
3428
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
|
3429
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
|
3430
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
|
3431
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
|
3432
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
|
3433
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
|
3434
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
|
3435
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
|
3436
|
+
|
3389
3437
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
|
3390
3438
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
|
3391
3439
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
|
@@ -3422,12 +3470,14 @@ extern "C" void Init_llama_cpp(void) {
|
|
3422
3470
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
|
3423
3471
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
|
3424
3472
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
|
3473
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));
|
3425
3474
|
|
3426
3475
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
|
3427
3476
|
|
3428
3477
|
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
|
3429
3478
|
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
|
3430
3479
|
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
|
3480
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
|
3431
3481
|
|
3432
3482
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
|
3433
3483
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.
|
6
|
+
VERSION = '0.15.1'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = '
|
9
|
+
LLAMA_CPP_VERSION = 'b2839'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -16,6 +16,20 @@ module LLaMACpp
|
|
16
16
|
LLAMA_VOCAB_TYPE_BPE: Integer
|
17
17
|
LLAMA_VOCAB_TYPE_WPM: Integer
|
18
18
|
|
19
|
+
LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
|
20
|
+
LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
|
21
|
+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
|
22
|
+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
|
23
|
+
LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
|
24
|
+
LLAMA_VOCAB_PRE_TYPE_MPT: Integer
|
25
|
+
LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
|
26
|
+
LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
|
27
|
+
LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
|
28
|
+
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
|
29
|
+
LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
|
30
|
+
LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
|
31
|
+
LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
|
32
|
+
|
19
33
|
LLAMA_FTYPE_ALL_F32: Integer
|
20
34
|
LLAMA_FTYPE_MOSTLY_F16: Integer
|
21
35
|
LLAMA_FTYPE_MOSTLY_Q4_0: Integer
|
@@ -44,10 +58,12 @@ module LLaMACpp
|
|
44
58
|
LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
|
45
59
|
LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
|
46
60
|
LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
|
61
|
+
LLAMA_FTYPE_MOSTLY_BF16: Integer
|
47
62
|
|
48
63
|
LLAMA_KV_OVERRIDE_TYPE_INT: Integer
|
49
64
|
LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
|
50
65
|
LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
|
66
|
+
LLAMA_KV_OVERRIDE_TYPE_STR: Integer
|
51
67
|
|
52
68
|
LLAMA_GRETYPE_END: Integer
|
53
69
|
LLAMA_GRETYPE_ALT: Integer
|
@@ -163,9 +179,10 @@ module LLaMACpp
|
|
163
179
|
|
164
180
|
def key: () -> String
|
165
181
|
def tag: () -> Integer
|
166
|
-
def
|
167
|
-
def
|
168
|
-
def
|
182
|
+
def val_i64: () -> Integer
|
183
|
+
def val_f64: () -> Float
|
184
|
+
def val_bool: () -> bool
|
185
|
+
def val_str: () -> String
|
169
186
|
end
|
170
187
|
|
171
188
|
class ModelParams
|
@@ -184,6 +201,8 @@ module LLaMACpp
|
|
184
201
|
def use_mmap=: (bool) -> bool
|
185
202
|
def use_mlock: () -> bool
|
186
203
|
def use_mlock=: (bool) -> bool
|
204
|
+
def check_tensors: () -> bool
|
205
|
+
def check_tensors=: (bool) -> bool
|
187
206
|
end
|
188
207
|
|
189
208
|
class Batch
|
@@ -311,6 +330,8 @@ module LLaMACpp
|
|
311
330
|
def embeddings=: (bool) -> bool
|
312
331
|
def offload_kqv: () -> bool
|
313
332
|
def offload_kqv=: (bool) -> bool
|
333
|
+
def flash_attn: () -> bool
|
334
|
+
def flash_attn=: (bool) -> bool
|
314
335
|
end
|
315
336
|
|
316
337
|
class ModelQuantizeParams
|
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
|
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
8
8
|
TEST_TARGETS = \
|
9
|
-
tests/test-
|
10
|
-
tests/test-
|
11
|
-
tests/test-
|
12
|
-
tests/test-
|
13
|
-
tests/test-
|
9
|
+
tests/test-autorelease \
|
10
|
+
tests/test-backend-ops \
|
11
|
+
tests/test-double-float \
|
12
|
+
tests/test-grad0 \
|
13
|
+
tests/test-grammar-integration \
|
14
|
+
tests/test-grammar-parser \
|
15
|
+
tests/test-json-schema-to-grammar \
|
16
|
+
tests/test-llama-grammar \
|
17
|
+
tests/test-model-load-cancel \
|
18
|
+
tests/test-opt \
|
19
|
+
tests/test-quantize-fns \
|
20
|
+
tests/test-quantize-perf \
|
21
|
+
tests/test-rope \
|
22
|
+
tests/test-sampling \
|
23
|
+
tests/test-tokenizer-0 \
|
24
|
+
tests/test-tokenizer-1-bpe \
|
25
|
+
tests/test-tokenizer-1-spm
|
14
26
|
|
15
27
|
# Code coverage output files
|
16
28
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
@@ -27,6 +39,17 @@ ifndef UNAME_M
|
|
27
39
|
UNAME_M := $(shell uname -m)
|
28
40
|
endif
|
29
41
|
|
42
|
+
# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
|
43
|
+
# of non-gcc compilers don't have to provide g++ alias or wrapper.
|
44
|
+
DEFCC := cc
|
45
|
+
DEFCXX := c++
|
46
|
+
ifeq ($(origin CC),default)
|
47
|
+
CC := $(DEFCC)
|
48
|
+
endif
|
49
|
+
ifeq ($(origin CXX),default)
|
50
|
+
CXX := $(DEFCXX)
|
51
|
+
endif
|
52
|
+
|
30
53
|
# Mac OS + Arm can report x86_64
|
31
54
|
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
32
55
|
ifeq ($(UNAME_S),Darwin)
|
@@ -49,11 +72,16 @@ default: $(BUILD_TARGETS)
|
|
49
72
|
test: $(TEST_TARGETS)
|
50
73
|
@failures=0; \
|
51
74
|
for test_target in $(TEST_TARGETS); do \
|
52
|
-
if [ "$$test_target" = "tests/test-tokenizer-0
|
53
|
-
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
54
|
-
|
75
|
+
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
|
76
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
|
77
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
|
78
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
|
55
79
|
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
56
|
-
|
80
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
|
81
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
|
82
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
|
83
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
|
84
|
+
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
|
57
85
|
continue; \
|
58
86
|
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
|
59
87
|
continue; \
|
@@ -407,7 +435,7 @@ ifdef LLAMA_CUDA
|
|
407
435
|
else
|
408
436
|
CUDA_PATH ?= /usr/local/cuda
|
409
437
|
endif
|
410
|
-
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
438
|
+
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
|
411
439
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
412
440
|
OBJS += ggml-cuda.o
|
413
441
|
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
@@ -732,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
|
|
732
760
|
ar rcs libllama.a $^
|
733
761
|
|
734
762
|
clean:
|
735
|
-
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
763
|
+
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
736
764
|
rm -vrf ggml-cuda/*.o
|
737
765
|
|
738
766
|
#
|
@@ -773,7 +801,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
|
|
773
801
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
774
802
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
775
803
|
|
776
|
-
quantize: examples/quantize/quantize.cpp
|
804
|
+
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
777
805
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
778
806
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
779
807
|
|
@@ -976,11 +1004,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
|
|
976
1004
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
977
1005
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
978
1006
|
|
979
|
-
tests/test-tokenizer-0
|
980
|
-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
981
|
-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
982
|
-
|
983
|
-
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
1007
|
+
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
984
1008
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
985
1009
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
986
1010
|
|
@@ -988,7 +1012,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
|
|
988
1012
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
989
1013
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
990
1014
|
|
991
|
-
tests/test-tokenizer-1-
|
1015
|
+
tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
992
1016
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
993
1017
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
994
1018
|
|
@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
1784
1784
|
|
1785
1785
|
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
1786
1786
|
// reset state for the next run
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1790
|
-
|
1787
|
+
if (!sched->is_reset) {
|
1788
|
+
size_t hash_size = sched->hash_set.size;
|
1789
|
+
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
1790
|
+
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
1791
|
+
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
1791
1792
|
|
1792
|
-
|
1793
|
+
sched->is_reset = true;
|
1794
|
+
}
|
1793
1795
|
sched->is_alloc = false;
|
1794
1796
|
}
|
1795
1797
|
|