llama_cpp 0.14.7 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +59 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -3
- data/vendor/tmp/llama.cpp/Makefile +42 -18
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +78 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +399 -184
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +302 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +28 -16
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +951 -263
- data/vendor/tmp/llama.cpp/ggml.c +1457 -92
- data/vendor/tmp/llama.cpp/ggml.h +37 -7
- data/vendor/tmp/llama.cpp/llama.cpp +671 -403
- data/vendor/tmp/llama.cpp/llama.h +34 -10
- data/vendor/tmp/llama.cpp/sgemm.cpp +134 -103
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1188 -656
- data/vendor/tmp/llama.cpp/unicode-data.h +4 -3
- data/vendor/tmp/llama.cpp/unicode.cpp +590 -49
- data/vendor/tmp/llama.cpp/unicode.h +6 -3
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ce6d72aeb5fb9aff775d44284bf934e164f8470973619507ef6e6eb1ac0bec4d
+  data.tar.gz: 7c1ae823c90f957219b3edbc20f091b65a50caa984c1a6f4d137a46c376b2f0c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d23cb6a63b7734df2547c5e61a699fa206878c747e274e004c829b77335a7cc7434e92168a55d8ab0a617b11eddb5d45d5057a91b92e848735fd9e852b2476cd
+  data.tar.gz: f54b09de3cc60de81be977e9706a9beb3bf28e7740a19a57f6add543fe10cd6dc4101cbbe22dd5b62870c78a1ad4d10f57dd29b7c3e3e12b950e6575cf67b0c7
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,22 @@
+## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
+
+- Bump llama.cpp from b2781 to b2839.
+- Add constants for pre-tokenization types.
+- Add constant for model file type.
+
+## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
+
+- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
+- Bump llama.cpp from b2740 to b2781.
+- Change `LLAMA_SESSION_VERSION` value from 5 to 6.
+- Add constants for pre-tokenization types.
+- Add `flash_attn` accessor to `ContextParams`.
+- Add `check_tensors` accessor to `ModelParams`.
+- Add `LLAMA_KV_OVERRIDE_TYPE_STR` constant.
+
+**Breaking Change**
+
+- Change method names in `ModelKVOverride`.
+
 ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
 
 - Bump llama.cpp from b2698 to b2740.
data/README.md
CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use cuBLAS
-$ gem install llama_cpp -- --with-cublas
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
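The `--with-cuda` option replaces the deprecated `--with-cublas` and builds the bundled llama.cpp with `LLAMA_CUDA=1` (see the extconf.rb diff below). A hedged sketch of what a CUDA build enables — the model path and layer count are illustrative, not taken from this diff:

```ruby
# After `gem install llama_cpp -- --with-cuda`, GPU offload works through the
# regular Ruby API; n_gpu_layers controls how many layers go to the GPU.
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model_params.n_gpu_layers = 32 # illustrative value

model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)
```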
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
 make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
 make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
-make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
+make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
+make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
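Each `--with-<name>` option given after `gem install llama_cpp --` is read through mkmf's `with_config` and appended to the environment passed to llama.cpp's `make`. A condensed, self-contained sketch of that pattern (the hash literal is illustrative, not the gem's full option table):

```ruby
# Hedged sketch of the extconf.rb pattern above: translate gem install
# options into llama.cpp make variables.
require 'mkmf'

make_envs = +''
{ 'cuda' => 'LLAMA_CUDA=1', 'cublas' => 'LLAMA_CUBLAS=1' }.each do |opt, env_var|
  make_envs << " #{env_var}" if with_config(opt)
end
# extconf.rb later interpolates make_envs into the make command for the vendored llama.cpp.
```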
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -708,9 +708,10 @@ public:
     rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
     rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
     rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "
-    rb_define_method(rb_cLLaMAModelKVOverride, "
-    rb_define_method(rb_cLLaMAModelKVOverride, "
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
   }
 
   static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
     return INT2NUM(ptr->tag);
   }
 
-  static VALUE
+  static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return INT2NUM(ptr->
+    return INT2NUM(ptr->val_i64);
   }
 
-  static VALUE
+  static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return DBL2NUM(ptr->
+    return DBL2NUM(ptr->val_f64);
   }
 
-  static VALUE
+  static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return ptr->
+    return ptr->val_bool ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return rb_utf8_str_new_cstr(ptr->val_str);
   }
 };
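On the Ruby side, these renamed readers pair with `tag` to pick the right value slot. A hedged sketch, assuming `override` is any `LLaMACpp::ModelKVOverride` instance:

```ruby
# Dispatch on tag to read a ModelKVOverride with the 0.15.0 method names.
value =
  case override.tag
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT   then override.val_i64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_FLOAT then override.val_f64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_BOOL  then override.val_bool
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_STR   then override.val_str # new string type
  end
```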
@@ -800,6 +806,8 @@ public:
     rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
   }
 
 private:
@@ -892,6 +900,18 @@ private:
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     return ptr->params.use_mlock ? Qtrue : Qfalse;
   }
+
+  // check_tensors
+  static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_params_get_check_tensors(VALUE self) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
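The new accessor forwards to llama.cpp's `check_tensors` model parameter, which validates tensor data while the model loads. A hedged usage sketch:

```ruby
# Opt in to tensor-data validation at load time (trades load speed for safety).
params = LLaMACpp::ModelParams.new
params.check_tensors = true
```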
@@ -984,6 +1004,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
   }
 
 private:
@@ -1262,6 +1284,18 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
+
+  // flash_attn
+  static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_flash_attn(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
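This exposes llama.cpp's `flash_attn` context flag added upstream in this release window. A hedged usage sketch (whether it takes effect depends on the backend the gem was built with):

```ruby
# Enable flash attention for newly created contexts.
ctx_params = LLaMACpp::ContextParams.new
ctx_params.flash_attn = true
```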
@@ -3386,6 +3420,20 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
@@ -3422,12 +3470,14 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.7'
+  VERSION = '0.15.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2740'
+  LLAMA_CPP_VERSION = 'b2839'
 end
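A quick way to confirm the upgrade took effect:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.15.1"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2839"
```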
data/sig/llama_cpp.rbs
CHANGED
@@ -16,6 +16,20 @@ module LLaMACpp
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
 
+  LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
+  LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
+  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
+  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+  LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
+  LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
+  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
+  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -44,10 +58,12 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
+  LLAMA_FTYPE_MOSTLY_BF16: Integer
 
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
   LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
+  LLAMA_KV_OVERRIDE_TYPE_STR: Integer
 
   LLAMA_GRETYPE_END: Integer
   LLAMA_GRETYPE_ALT: Integer
@@ -163,9 +179,10 @@ module LLaMACpp
 
     def key: () -> String
     def tag: () -> Integer
-    def
-    def
-    def
+    def val_i64: () -> Integer
+    def val_f64: () -> Float
+    def val_bool: () -> bool
+    def val_str: () -> String
   end
 
   class ModelParams
@@ -184,6 +201,8 @@ module LLaMACpp
     def use_mmap=: (bool) -> bool
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def check_tensors: () -> bool
+    def check_tensors=: (bool) -> bool
   end
 
   class Batch
@@ -311,6 +330,8 @@ module LLaMACpp
     def embeddings=: (bool) -> bool
     def offload_kqv: () -> bool
    def offload_kqv=: (bool) -> bool
+    def flash_attn: () -> bool
+    def flash_attn=: (bool) -> bool
   end
 
   class ModelQuantizeParams
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-
-	tests/test-
-	tests/test-
-	tests/test-
-	tests/test-
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-double-float \
+	tests/test-grad0 \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-model-load-cancel \
+	tests/test-opt \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,16 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@@ -407,7 +435,7 @@ ifdef LLAMA_CUDA
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -732,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
 
 #
@@ -773,7 +801,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -976,11 +1004,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -988,7 +1012,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }