llama_cpp 0.14.6 → 0.15.0
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +90 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -3
- data/vendor/tmp/llama.cpp/Makefile +52 -22
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +394 -44
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +996 -455
- data/vendor/tmp/llama.cpp/llama.h +46 -15
- data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
+  data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
+  data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,25 @@
+## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
+
+- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
+- Bump llama.cpp from b2740 to b2781.
+- Change `LLAMA_SESSION_VERSION` value from 5 to 6.
+- Add constants for pre-tokenization types.
+- Add `flash_attn` accessor to `ContextParams`.
+- Add `check_tensors` accessor to `ModelParams`.
+- Add `LLAMA_KV_OVERRIDE_TYPE_STR` constant.
+
+**Breaking Change**
+
+- Change method names in `ModelKVOverride`.
+
+## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+- Bump llama.cpp from b2698 to b2740.
+- Add `keep_split` accessor to `ModelQuantizeParams`.
+- Add `pooling_type` method to `Context`.
+- Add `token_is_eog?` method to `Model`.
+
+Implementation binding for `llama_sample_token_with_rng` has been skipped.
+
 ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
 
 - Bump llama.cpp from b2658 to b2698.
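The accessors listed in the 0.14.7 and 0.15.0 entries above map onto the Ruby API as in the following sketch. It is an illustration rather than part of the package, and it assumes the params classes are still constructed without arguments, as in earlier releases.

```ruby
require 'llama_cpp'

# Illustrative only: exercises the accessors added in 0.14.7 / 0.15.0.
model_params = LLaMACpp::ModelParams.new
model_params.check_tensors = true     # validate tensor data while loading the model (0.15.0)

context_params = LLaMACpp::ContextParams.new
context_params.flash_attn = true      # enable flash attention (0.15.0)

quantize_params = LLaMACpp::ModelQuantizeParams.new
quantize_params.keep_split = true     # keep the split file layout when quantizing (0.14.7)
```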
data/README.md
CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
 make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
 make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
-make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
+make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
+make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -708,9 +708,10 @@ public:
     rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
     rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
     rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "
-    rb_define_method(rb_cLLaMAModelKVOverride, "
-    rb_define_method(rb_cLLaMAModelKVOverride, "
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
   }
 
   static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
     return INT2NUM(ptr->tag);
   }
 
-  static VALUE
+  static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return INT2NUM(ptr->
+    return INT2NUM(ptr->val_i64);
   }
 
-  static VALUE
+  static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return DBL2NUM(ptr->
+    return DBL2NUM(ptr->val_f64);
   }
 
-  static VALUE
+  static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return ptr->
+    return ptr->val_bool ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return rb_utf8_str_new_cstr(ptr->val_str);
   }
 };
 
@@ -800,6 +806,8 @@ public:
     rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
   }
 
 private:
@@ -892,6 +900,18 @@ private:
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     return ptr->params.use_mlock ? Qtrue : Qfalse;
   }
+
+  // check_tensors
+  static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_params_get_check_tensors(VALUE self) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
@@ -984,6 +1004,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
   }
 
 private:
@@ -1262,6 +1284,18 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
+
+  // flash_attn
+  static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_flash_attn(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -1321,6 +1355,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
   }
 
 private:
@@ -1405,6 +1441,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.pure ? Qtrue : Qfalse;
   }
+
+  // keep_split
+  static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.keep_split = RTEST(keep_split) ? true : false;
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -1487,6 +1535,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
     rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
     rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+    rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
   }
 
 private:
@@ -1634,10 +1683,10 @@ private:
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1789,6 +1838,16 @@ private:
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_eot(ptr->model));
   }
+
+  static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2102,6 +2161,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
     rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+    rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
   }
 
 private:
@@ -3225,6 +3285,15 @@ private:
 
     return Qnil;
   }
+
+  static VALUE _llama_context_pooling_type(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+  }
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -3351,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
@@ -3393,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
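From Ruby, the bindings added above surface as the calls in this sketch. It is illustrative only; `model`, `context`, and the `ModelKVOverride` instance `override` are assumed to exist already and are not constructed here.

```ruby
# Illustrative only: `model`, `context`, and `override` are assumed to exist.
eot = model.token_eot
model.token_is_eog?(eot)   # => true for end-of-generation tokens (0.14.7)

context.pooling_type       # => Integer pooling type used by the context (0.14.7)

# ModelKVOverride readers were renamed in 0.15.0 (breaking change):
value =
  case override.tag
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT   then override.val_i64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_FLOAT then override.val_f64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_BOOL  then override.val_bool
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_STR   then override.val_str
  end
```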
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.
+  VERSION = '0.15.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b2781'
 end
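After updating, the bundled versions can be confirmed from the two constants above:

```ruby
require 'llama_cpp'

LLaMACpp::VERSION           # => "0.15.0"
LLaMACpp::LLAMA_CPP_VERSION # => "b2781"
```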
data/sig/llama_cpp.rbs
CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
 
+  LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
+  LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
+  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
+  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -48,6 +57,7 @@ module LLaMACpp
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
   LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
+  LLAMA_KV_OVERRIDE_TYPE_STR: Integer
 
   LLAMA_GRETYPE_END: Integer
   LLAMA_GRETYPE_ALT: Integer
@@ -141,6 +151,7 @@ module LLaMACpp
     def token_middle: () -> Integer
     def token_suffix: () -> Integer
     def token_eot: () -> Integer
+    def token_is_eog?: (Integer) -> bool
   end
 
   class Timings
@@ -162,9 +173,10 @@ module LLaMACpp
 
     def key: () -> String
     def tag: () -> Integer
-    def
-    def
-    def
+    def val_i64: () -> Integer
+    def val_f64: () -> Float
+    def val_bool: () -> bool
+    def val_str: () -> String
   end
 
   class ModelParams
@@ -183,6 +195,8 @@ module LLaMACpp
     def use_mmap=: (bool) -> bool
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def check_tensors: () -> bool
+    def check_tensors=: (bool) -> bool
   end
 
   class Batch
@@ -260,6 +274,7 @@ module LLaMACpp
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
    def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+    def pooling_type: () -> Integer
   end
 
   class ContextParams
@@ -309,6 +324,8 @@ module LLaMACpp
     def embeddings=: (bool) -> bool
     def offload_kqv: () -> bool
     def offload_kqv=: (bool) -> bool
+    def flash_attn: () -> bool
+    def flash_attn=: (bool) -> bool
   end
 
   class ModelQuantizeParams
@@ -328,6 +345,8 @@ module LLaMACpp
     def only_copy=: (bool) -> bool
     def pure: () -> bool
     def pure=: (bool) -> bool
+    def keep_split: () -> bool
+    def keep_split=: (bool) -> bool
   end
 
   class Params = ContextParams
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-
-	tests/test-
-	tests/test-
-	tests/test-
-	tests/test-
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-double-float \
+	tests/test-grad0 \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-model-load-cancel \
+	tests/test-opt \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@@ -386,10 +415,6 @@ ifdef LLAMA_OPENBLAS
 MK_LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
-# TODO: temporary disable until MoE is fixed
-# https://github.com/ggerganov/llama.cpp/pull/6716
-LLAMA_NO_LLAMAFILE := 1
-
 ifndef LLAMA_NO_LLAMAFILE
 MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
 OBJS += sgemm.o
@@ -701,7 +726,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -777,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -805,10 +830,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -971,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -983,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -371,16 +371,16 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr)
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
-    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t)
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
     GGML_ASSERT(galloc->buffers != NULL);
 
-    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *)
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
     GGML_ASSERT(galloc->buf_tallocs != NULL);
 
     for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         free(galloc->hash_set.keys);
         free(galloc->hash_values);
         galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *)
-        galloc->hash_values = calloc(sizeof(struct hash_node)
+        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+        galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
         GGML_ASSERT(galloc->hash_set.keys != NULL);
         GGML_ASSERT(galloc->hash_values != NULL);
     } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
         free(galloc->node_allocs);
-        galloc->node_allocs = calloc(sizeof(struct node_alloc)
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
         GGML_ASSERT(galloc->node_allocs != NULL);
     }
     galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0])
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS &&
+                op->type != GGML_TYPE_IQ1_S &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched)
+    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0])
-    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0])
+    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+    sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
 
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0])
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0])
+    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(sizeof(sched->splits[0])
+    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
@@ -1780,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-
-
-
-
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }
 
@@ -1968,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = {
         /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0])
+        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
     };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0])
-    bool * node_init = calloc(sizeof(node_init[0])
+    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),