llama_cpp 0.14.6 → 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +90 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -3
- data/vendor/tmp/llama.cpp/Makefile +52 -22
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +394 -44
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +996 -455
- data/vendor/tmp/llama.cpp/llama.h +46 -15
- data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
|
4
|
+
data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
|
7
|
+
data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
|
2
|
+
|
3
|
+
- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
|
4
|
+
- Bump llama.cpp from b2740 to b2781.
|
5
|
+
- Change `LLAMA_SESSION_VERSION` value from 5 to 6.
|
6
|
+
- Add constants for pre-tokenization types.
|
7
|
+
- Add `flash_attn` accessor to `ContextParams`.
|
8
|
+
- Add `check_tensors` accessor to `ModelParams`.
|
9
|
+
- Add LLAMA_KV_OVERRIDE_TYPE_STR constant.
|
10
|
+
|
11
|
+
**Breaking Change**
|
12
|
+
- Change method names in `ModelKVOverride`.
|
13
|
+
|
14
|
+
## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
|
15
|
+
|
16
|
+
- Bump llama.cpp from b2698 to b2740.
|
17
|
+
- Add `keep_split` accessor to `ModelQuantizeParams`.
|
18
|
+
- Add `pooling_type` method to `Context`.
|
19
|
+
- Add `token_is_eog?` method to `Model`.
|
20
|
+
|
21
|
+
Implementation of the binding for `llama_sample_token_with_rng` has been skipped.
|
22
|
+
|
1
23
|
## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
|
2
24
|
|
3
25
|
- Bump llama.cpp from b2658 to b2698.
|
data/README.md
CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
|
|
28
28
|
# use OpenBLAS
|
29
29
|
$ gem install llama_cpp -- --with-openblas
|
30
30
|
|
31
|
-
# use
|
32
|
-
$ gem install llama_cpp -- --with-
|
31
|
+
# use CUDA
|
32
|
+
$ gem install llama_cpp -- --with-cuda
|
33
33
|
```
|
34
34
|
|
35
35
|
Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
|
|
15
15
|
make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
|
16
16
|
make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
|
17
17
|
make_envs << ' LLAMA_BLIS=1' if with_config('blis')
|
18
|
-
make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
|
18
|
+
make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
|
19
|
+
make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
|
19
20
|
make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
|
20
21
|
make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
|
21
22
|
make_envs << ' LLAMA_MPI=1' if with_config('mpi')
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -708,9 +708,10 @@ public:
|
|
708
708
|
rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
|
709
709
|
rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
|
710
710
|
rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
|
711
|
-
rb_define_method(rb_cLLaMAModelKVOverride, "
|
712
|
-
rb_define_method(rb_cLLaMAModelKVOverride, "
|
713
|
-
rb_define_method(rb_cLLaMAModelKVOverride, "
|
711
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
|
712
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
|
713
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
|
714
|
+
rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
|
714
715
|
}
|
715
716
|
|
716
717
|
static const rb_data_type_t llama_model_kv_override_type;
|
@@ -726,19 +727,24 @@ private:
|
|
726
727
|
return INT2NUM(ptr->tag);
|
727
728
|
}
|
728
729
|
|
729
|
-
static VALUE
|
730
|
+
static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
|
730
731
|
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
731
|
-
return INT2NUM(ptr->
|
732
|
+
return INT2NUM(ptr->val_i64);
|
732
733
|
}
|
733
734
|
|
734
|
-
static VALUE
|
735
|
+
static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
|
735
736
|
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
736
|
-
return DBL2NUM(ptr->
|
737
|
+
return DBL2NUM(ptr->val_f64);
|
737
738
|
}
|
738
739
|
|
739
|
-
static VALUE
|
740
|
+
static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
|
740
741
|
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
741
|
-
return ptr->
|
742
|
+
return ptr->val_bool ? Qtrue : Qfalse;
|
743
|
+
}
|
744
|
+
|
745
|
+
static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
|
746
|
+
llama_model_kv_override* ptr = get_llama_model_kv_override(self);
|
747
|
+
return rb_utf8_str_new_cstr(ptr->val_str);
|
742
748
|
}
|
743
749
|
};
|
744
750
|
|
@@ -800,6 +806,8 @@ public:
|
|
800
806
|
rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
|
801
807
|
rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
|
802
808
|
rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
|
809
|
+
rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
|
810
|
+
rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
|
803
811
|
}
|
804
812
|
|
805
813
|
private:
|
@@ -892,6 +900,18 @@ private:
|
|
892
900
|
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
893
901
|
return ptr->params.use_mlock ? Qtrue : Qfalse;
|
894
902
|
}
|
903
|
+
|
904
|
+
// check_tensors
|
905
|
+
static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
|
906
|
+
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
907
|
+
ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
|
908
|
+
return ptr->params.check_tensors ? Qtrue : Qfalse;
|
909
|
+
}
|
910
|
+
|
911
|
+
static VALUE _llama_model_params_get_check_tensors(VALUE self) {
|
912
|
+
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
913
|
+
return ptr->params.check_tensors ? Qtrue : Qfalse;
|
914
|
+
}
|
895
915
|
};
|
896
916
|
|
897
917
|
const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
|
@@ -984,6 +1004,8 @@ public:
|
|
984
1004
|
rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
|
985
1005
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
|
986
1006
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
|
1007
|
+
rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
|
1008
|
+
rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
|
987
1009
|
}
|
988
1010
|
|
989
1011
|
private:
|
@@ -1262,6 +1284,18 @@ private:
|
|
1262
1284
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1263
1285
|
return ptr->params.offload_kqv ? Qtrue : Qfalse;
|
1264
1286
|
}
|
1287
|
+
|
1288
|
+
// flash_attn
|
1289
|
+
static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
|
1290
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1291
|
+
ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
|
1292
|
+
return ptr->params.flash_attn ? Qtrue : Qfalse;
|
1293
|
+
}
|
1294
|
+
|
1295
|
+
static VALUE _llama_context_params_get_flash_attn(VALUE self) {
|
1296
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1297
|
+
return ptr->params.flash_attn ? Qtrue : Qfalse;
|
1298
|
+
}
|
1265
1299
|
};
|
1266
1300
|
|
1267
1301
|
const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
|
@@ -1321,6 +1355,8 @@ public:
|
|
1321
1355
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
|
1322
1356
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
|
1323
1357
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
|
1358
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
|
1359
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
|
1324
1360
|
}
|
1325
1361
|
|
1326
1362
|
private:
|
@@ -1405,6 +1441,18 @@ private:
|
|
1405
1441
|
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1406
1442
|
return ptr->params.pure ? Qtrue : Qfalse;
|
1407
1443
|
}
|
1444
|
+
|
1445
|
+
// keep_split
|
1446
|
+
static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
|
1447
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1448
|
+
ptr->params.keep_split = RTEST(keep_split) ? true : false;
|
1449
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1450
|
+
}
|
1451
|
+
|
1452
|
+
static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
|
1453
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1454
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1455
|
+
}
|
1408
1456
|
};
|
1409
1457
|
|
1410
1458
|
const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
|
@@ -1487,6 +1535,7 @@ public:
|
|
1487
1535
|
rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
|
1488
1536
|
rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
|
1489
1537
|
rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
|
1538
|
+
rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
|
1490
1539
|
}
|
1491
1540
|
|
1492
1541
|
private:
|
@@ -1634,10 +1683,10 @@ private:
|
|
1634
1683
|
const llama_token token = NUM2INT(token_);
|
1635
1684
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1636
1685
|
std::vector<char> result(8, 0);
|
1637
|
-
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1686
|
+
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1638
1687
|
if (n_tokens < 0) {
|
1639
1688
|
result.resize(-n_tokens);
|
1640
|
-
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1689
|
+
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1641
1690
|
if (check != -n_tokens) {
|
1642
1691
|
rb_raise(rb_eRuntimeError, "failed to convert");
|
1643
1692
|
return Qnil;
|
@@ -1789,6 +1838,16 @@ private:
|
|
1789
1838
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1790
1839
|
return INT2NUM(llama_token_eot(ptr->model));
|
1791
1840
|
}
|
1841
|
+
|
1842
|
+
static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
|
1843
|
+
if (!RB_INTEGER_TYPE_P(token_)) {
|
1844
|
+
rb_raise(rb_eArgError, "token must be an integer");
|
1845
|
+
return Qnil;
|
1846
|
+
}
|
1847
|
+
const llama_token token = NUM2INT(token_);
|
1848
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1849
|
+
return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
|
1850
|
+
}
|
1792
1851
|
};
|
1793
1852
|
|
1794
1853
|
const rb_data_type_t RbLLaMAModel::llama_model_type = {
|
@@ -2102,6 +2161,7 @@ public:
|
|
2102
2161
|
rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
|
2103
2162
|
rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
|
2104
2163
|
rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
|
2164
|
+
rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
|
2105
2165
|
}
|
2106
2166
|
|
2107
2167
|
private:
|
@@ -3225,6 +3285,15 @@ private:
|
|
3225
3285
|
|
3226
3286
|
return Qnil;
|
3227
3287
|
}
|
3288
|
+
|
3289
|
+
static VALUE _llama_context_pooling_type(VALUE self) {
|
3290
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
3291
|
+
if (ptr->ctx == NULL) {
|
3292
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
3293
|
+
return Qnil;
|
3294
|
+
}
|
3295
|
+
return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
|
3296
|
+
}
|
3228
3297
|
};
|
3229
3298
|
|
3230
3299
|
const rb_data_type_t RbLLaMAContext::llama_context_type = {
|
@@ -3351,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
|
|
3351
3420
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
|
3352
3421
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
|
3353
3422
|
|
3423
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
|
3424
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
|
3425
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
|
3426
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
|
3427
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
|
3428
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
|
3429
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
|
3430
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
|
3431
|
+
|
3354
3432
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
|
3355
3433
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
|
3356
3434
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
|
@@ -3393,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
|
|
3393
3471
|
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
|
3394
3472
|
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
|
3395
3473
|
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
|
3474
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
|
3396
3475
|
|
3397
3476
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
|
3398
3477
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.
|
6
|
+
VERSION = '0.15.0'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = '
|
9
|
+
LLAMA_CPP_VERSION = 'b2781'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
|
|
16
16
|
LLAMA_VOCAB_TYPE_BPE: Integer
|
17
17
|
LLAMA_VOCAB_TYPE_WPM: Integer
|
18
18
|
|
19
|
+
LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
|
20
|
+
LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
|
21
|
+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
|
22
|
+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
|
23
|
+
LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
|
24
|
+
LLAMA_VOCAB_PRE_TYPE_MPT: Integer
|
25
|
+
LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
|
26
|
+
LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
|
27
|
+
|
19
28
|
LLAMA_FTYPE_ALL_F32: Integer
|
20
29
|
LLAMA_FTYPE_MOSTLY_F16: Integer
|
21
30
|
LLAMA_FTYPE_MOSTLY_Q4_0: Integer
|
@@ -48,6 +57,7 @@ module LLaMACpp
|
|
48
57
|
LLAMA_KV_OVERRIDE_TYPE_INT: Integer
|
49
58
|
LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
|
50
59
|
LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
|
60
|
+
LLAMA_KV_OVERRIDE_TYPE_STR: Integer
|
51
61
|
|
52
62
|
LLAMA_GRETYPE_END: Integer
|
53
63
|
LLAMA_GRETYPE_ALT: Integer
|
@@ -141,6 +151,7 @@ module LLaMACpp
|
|
141
151
|
def token_middle: () -> Integer
|
142
152
|
def token_suffix: () -> Integer
|
143
153
|
def token_eot: () -> Integer
|
154
|
+
def token_is_eog?: (Integer) -> bool
|
144
155
|
end
|
145
156
|
|
146
157
|
class Timings
|
@@ -162,9 +173,10 @@ module LLaMACpp
|
|
162
173
|
|
163
174
|
def key: () -> String
|
164
175
|
def tag: () -> Integer
|
165
|
-
def
|
166
|
-
def
|
167
|
-
def
|
176
|
+
def val_i64: () -> Integer
|
177
|
+
def val_f64: () -> Float
|
178
|
+
def val_bool: () -> bool
|
179
|
+
def val_str: () -> String
|
168
180
|
end
|
169
181
|
|
170
182
|
class ModelParams
|
@@ -183,6 +195,8 @@ module LLaMACpp
|
|
183
195
|
def use_mmap=: (bool) -> bool
|
184
196
|
def use_mlock: () -> bool
|
185
197
|
def use_mlock=: (bool) -> bool
|
198
|
+
def check_tensors: () -> bool
|
199
|
+
def check_tensors=: (bool) -> bool
|
186
200
|
end
|
187
201
|
|
188
202
|
class Batch
|
@@ -260,6 +274,7 @@ module LLaMACpp
|
|
260
274
|
def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
|
261
275
|
def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
|
262
276
|
def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
|
277
|
+
def pooling_type: () -> Integer
|
263
278
|
end
|
264
279
|
|
265
280
|
class ContextParams
|
@@ -309,6 +324,8 @@ module LLaMACpp
|
|
309
324
|
def embeddings=: (bool) -> bool
|
310
325
|
def offload_kqv: () -> bool
|
311
326
|
def offload_kqv=: (bool) -> bool
|
327
|
+
def flash_attn: () -> bool
|
328
|
+
def flash_attn=: (bool) -> bool
|
312
329
|
end
|
313
330
|
|
314
331
|
class ModelQuantizeParams
|
@@ -328,6 +345,8 @@ module LLaMACpp
|
|
328
345
|
def only_copy=: (bool) -> bool
|
329
346
|
def pure: () -> bool
|
330
347
|
def pure=: (bool) -> bool
|
348
|
+
def keep_split: () -> bool
|
349
|
+
def keep_split=: (bool) -> bool
|
331
350
|
end
|
332
351
|
|
333
352
|
class Params = ContextParams
|
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
|
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
8
8
|
TEST_TARGETS = \
|
9
|
-
tests/test-
|
10
|
-
tests/test-
|
11
|
-
tests/test-
|
12
|
-
tests/test-
|
13
|
-
tests/test-
|
9
|
+
tests/test-autorelease \
|
10
|
+
tests/test-backend-ops \
|
11
|
+
tests/test-double-float \
|
12
|
+
tests/test-grad0 \
|
13
|
+
tests/test-grammar-integration \
|
14
|
+
tests/test-grammar-parser \
|
15
|
+
tests/test-json-schema-to-grammar \
|
16
|
+
tests/test-llama-grammar \
|
17
|
+
tests/test-model-load-cancel \
|
18
|
+
tests/test-opt \
|
19
|
+
tests/test-quantize-fns \
|
20
|
+
tests/test-quantize-perf \
|
21
|
+
tests/test-rope \
|
22
|
+
tests/test-sampling \
|
23
|
+
tests/test-tokenizer-0 \
|
24
|
+
tests/test-tokenizer-1-bpe \
|
25
|
+
tests/test-tokenizer-1-spm
|
14
26
|
|
15
27
|
# Code coverage output files
|
16
28
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
@@ -27,6 +39,17 @@ ifndef UNAME_M
|
|
27
39
|
UNAME_M := $(shell uname -m)
|
28
40
|
endif
|
29
41
|
|
42
|
+
# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
|
43
|
+
# of non-gcc compilers don't have to provide g++ alias or wrapper.
|
44
|
+
DEFCC := cc
|
45
|
+
DEFCXX := c++
|
46
|
+
ifeq ($(origin CC),default)
|
47
|
+
CC := $(DEFCC)
|
48
|
+
endif
|
49
|
+
ifeq ($(origin CXX),default)
|
50
|
+
CXX := $(DEFCXX)
|
51
|
+
endif
|
52
|
+
|
30
53
|
# Mac OS + Arm can report x86_64
|
31
54
|
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
32
55
|
ifeq ($(UNAME_S),Darwin)
|
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
|
|
49
72
|
test: $(TEST_TARGETS)
|
50
73
|
@failures=0; \
|
51
74
|
for test_target in $(TEST_TARGETS); do \
|
52
|
-
if [ "$$test_target" = "tests/test-tokenizer-0
|
53
|
-
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
54
|
-
|
75
|
+
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
|
76
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
|
77
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
|
78
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
|
55
79
|
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
56
|
-
|
80
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
|
81
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
|
82
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
|
83
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
|
84
|
+
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
|
85
|
+
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
|
57
86
|
continue; \
|
58
87
|
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
|
59
88
|
continue; \
|
@@ -386,10 +415,6 @@ ifdef LLAMA_OPENBLAS
|
|
386
415
|
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
387
416
|
endif # LLAMA_OPENBLAS
|
388
417
|
|
389
|
-
# TODO: temporary disable until MoE is fixed
|
390
|
-
# https://github.com/ggerganov/llama.cpp/pull/6716
|
391
|
-
LLAMA_NO_LLAMAFILE := 1
|
392
|
-
|
393
418
|
ifndef LLAMA_NO_LLAMAFILE
|
394
419
|
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
395
420
|
OBJS += sgemm.o
|
@@ -701,7 +726,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
|
701
726
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
702
727
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
703
728
|
|
704
|
-
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
729
|
+
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
705
730
|
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
706
731
|
|
707
732
|
common.o: common/common.cpp $(COMMON_H_DEPS)
|
@@ -777,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
|
|
777
802
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
778
803
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
779
804
|
|
780
|
-
quantize: examples/quantize/quantize.cpp
|
805
|
+
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
781
806
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
782
807
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
783
808
|
|
@@ -805,10 +830,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
805
830
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
806
831
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
807
832
|
|
808
|
-
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
833
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
809
834
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
810
835
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
811
836
|
|
837
|
+
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
|
838
|
+
examples/server/%.hpp: examples/server/public/% Makefile
|
839
|
+
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
|
840
|
+
echo "unsigned char $${NAME}[] = {" && \
|
841
|
+
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
|
842
|
+
echo "};" && \
|
843
|
+
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
844
|
+
) > $@
|
845
|
+
|
812
846
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
813
847
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
814
848
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -971,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
|
|
971
1005
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
972
1006
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
973
1007
|
|
974
|
-
tests/test-tokenizer-0
|
975
|
-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
976
|
-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
977
|
-
|
978
|
-
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
1008
|
+
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
979
1009
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
980
1010
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
981
1011
|
|
@@ -983,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
|
|
983
1013
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
984
1014
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
985
1015
|
|
986
|
-
tests/test-tokenizer-1-
|
1016
|
+
tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
|
987
1017
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
988
1018
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
989
1019
|
|
@@ -371,16 +371,16 @@ struct ggml_gallocr {
|
|
371
371
|
};
|
372
372
|
|
373
373
|
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
374
|
-
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr)
|
374
|
+
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
|
375
375
|
GGML_ASSERT(galloc != NULL);
|
376
376
|
|
377
|
-
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t)
|
377
|
+
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
378
378
|
GGML_ASSERT(galloc->bufts != NULL);
|
379
379
|
|
380
|
-
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs
|
380
|
+
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
|
381
381
|
GGML_ASSERT(galloc->buffers != NULL);
|
382
382
|
|
383
|
-
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *)
|
383
|
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
384
384
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
385
385
|
|
386
386
|
for (int i = 0; i < n_bufs; i++) {
|
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
646
646
|
free(galloc->hash_set.keys);
|
647
647
|
free(galloc->hash_values);
|
648
648
|
galloc->hash_set.size = hash_size;
|
649
|
-
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *)
|
650
|
-
galloc->hash_values = calloc(sizeof(struct hash_node)
|
649
|
+
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
|
650
|
+
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
|
651
651
|
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
652
652
|
GGML_ASSERT(galloc->hash_values != NULL);
|
653
653
|
} else {
|
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
667
667
|
// set the node_allocs from the hash table
|
668
668
|
if (galloc->n_nodes < graph->n_nodes) {
|
669
669
|
free(galloc->node_allocs);
|
670
|
-
galloc->node_allocs = calloc(sizeof(struct node_alloc)
|
670
|
+
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
|
671
671
|
GGML_ASSERT(galloc->node_allocs != NULL);
|
672
672
|
}
|
673
673
|
galloc->n_nodes = graph->n_nodes;
|
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
697
697
|
}
|
698
698
|
if (galloc->n_leafs < graph->n_leafs) {
|
699
699
|
free(galloc->leaf_allocs);
|
700
|
-
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0])
|
700
|
+
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
|
701
701
|
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
702
702
|
}
|
703
703
|
galloc->n_leafs = graph->n_leafs;
|
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
|
|
822
822
|
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
823
823
|
switch (op->op) {
|
824
824
|
case GGML_OP_CPY:
|
825
|
-
return
|
825
|
+
return
|
826
|
+
op->type != GGML_TYPE_IQ2_XXS &&
|
827
|
+
op->type != GGML_TYPE_IQ2_XS &&
|
828
|
+
op->type != GGML_TYPE_IQ1_S &&
|
829
|
+
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
826
830
|
case GGML_OP_MUL_MAT:
|
827
831
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
828
832
|
default:
|
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
1721
1725
|
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
1722
1726
|
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
1723
1727
|
|
1724
|
-
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched)
|
1728
|
+
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
1725
1729
|
|
1726
1730
|
// initialize hash table
|
1727
1731
|
sched->hash_set = ggml_hash_set_new(graph_size);
|
1728
|
-
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0])
|
1729
|
-
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0])
|
1732
|
+
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
1733
|
+
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
|
1730
1734
|
|
1731
1735
|
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
1732
|
-
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0])
|
1733
|
-
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0])
|
1736
|
+
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
1737
|
+
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
1734
1738
|
|
1735
1739
|
sched->n_backends = n_backends;
|
1736
1740
|
|
1737
1741
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
1738
1742
|
|
1739
1743
|
const int initial_splits_capacity = 16;
|
1740
|
-
sched->splits = calloc(sizeof(sched->splits[0])
|
1744
|
+
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
1741
1745
|
sched->splits_capacity = initial_splits_capacity;
|
1742
1746
|
|
1743
1747
|
for (int b = 0; b < n_backends; b++) {
|
@@ -1780,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
1780
1784
|
|
1781
1785
|
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
1782
1786
|
// reset state for the next run
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1787
|
+
if (!sched->is_reset) {
|
1788
|
+
size_t hash_size = sched->hash_set.size;
|
1789
|
+
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
1790
|
+
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
1791
|
+
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
1787
1792
|
|
1788
|
-
|
1793
|
+
sched->is_reset = true;
|
1794
|
+
}
|
1789
1795
|
sched->is_alloc = false;
|
1790
1796
|
}
|
1791
1797
|
|
@@ -1968,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|
1968
1974
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
1969
1975
|
struct ggml_hash_set hash_set = {
|
1970
1976
|
/* .size = */ graph->visited_hash_table.size,
|
1971
|
-
/* .keys = */ calloc(sizeof(hash_set.keys[0])
|
1977
|
+
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
|
1972
1978
|
};
|
1973
|
-
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0])
|
1974
|
-
bool * node_init = calloc(sizeof(node_init[0])
|
1979
|
+
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
1980
|
+
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
|
1975
1981
|
|
1976
1982
|
struct ggml_init_params params = {
|
1977
1983
|
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|