llama_cpp 0.12.7 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
- metadata.gz:
- data.tar.gz:
+ metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
+ data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
 SHA512:
- metadata.gz:
- data.tar.gz:
+ metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
+ data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
+ ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
+
+ - Bump bundled llama.cpp from b2143 to b2303.
+ - Remove deprecated methods:
+   - `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
+ - Rename some constants.
+ - Rename `kv_cache_seq_shift` method to `kv_cache_seq_add`.
+ - Add `defrag_thold` accessor to `ContextParams`.
+ - Add `vocab_type` and `rope_type` methods to `Model`.
+ - Add `kv_cache_seq_pos_max`, `kv_cache_defrag`, and `kv_cache_update` methods to `Context`.
+
 ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24

 - Bump bundled llama.cpp from b2106 to b2143.
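Taken together, the 0.13.0 entries above reshape the Ruby-facing API. A minimal usage sketch of the new surface follows; it is illustrative only (the model path, the argument values, and the Model.new keyword arguments are assumptions, not taken from this diff):

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.defrag_thold = 0.1  # new ContextParams accessor in 0.13.0

# Model.new keyword arguments are assumed here; see the gem README for the exact call.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
model.vocab_type  # new Model method in 0.13.0
model.rope_type   # new Model method in 0.13.0

context = LLaMACpp::Context.new(model: model, params: params)
context.kv_cache_seq_add(0, 0, 32, 4)  # renamed from kv_cache_seq_shift
context.kv_cache_seq_pos_max(0)        # new, returns an Integer position
context.kv_cache_defrag                # new (as named in the CHANGELOG and RBS signatures)
context.kv_cache_update                # new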
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -966,12 +966,12 @@ public:
 rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
 rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
 rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold=", RUBY_METHOD_FUNC(_llama_context_params_set_defrag_thold), 1);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold", RUBY_METHOD_FUNC(_llama_context_params_get_defrag_thold), 0);
 rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
 rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
 rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
 rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
 rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
 rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
 rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
@@ -1146,6 +1146,18 @@ private:
 return UINT2NUM(ptr->params.yarn_orig_ctx);
 }

+ // defrag_thold
+ static VALUE _llama_context_params_set_defrag_thold(VALUE self, VALUE defrag_thold) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.defrag_thold = NUM2DBL(defrag_thold);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
+ static VALUE _llama_context_params_get_defrag_thold(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
 static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
 return UINT2NUM(ptr->params.yarn_orig_ctx);
@@ -1175,18 +1187,6 @@ private:
 return INT2NUM(ptr->params.type_v);
 }

- // mul_mat_q
- static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
 // logits_all
 static VALUE _llama_context_params_set_logits_all(VALUE self, VALUE logits_all) {
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1433,7 +1433,8 @@ public:
 rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
 rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
 rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
- rb_define_method(rb_cLLaMAModel, "
+ rb_define_method(rb_cLLaMAModel, "vocab_type", RUBY_METHOD_FUNC(_llama_model_get_model_vocab_type), 0);
+ rb_define_method(rb_cLLaMAModel, "rope_type", RUBY_METHOD_FUNC(_llama_model_get_model_rope_type), 0);
 rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
 rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
 rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
@@ -1559,41 +1560,14 @@ private:
 return Qnil;
 }

- static VALUE
-
-
-
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
-
- if (!RB_TYPE_P(kw_values[0], T_STRING)) {
- rb_raise(rb_eArgError, "lora_path must be a string");
- return Qnil;
- }
- if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
- rb_raise(rb_eArgError, "base_model_path must be a string");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_threads must be an integer");
- return Qnil;
- }
- if (kw_values[3] != Qundef && !RB_FLOAT_TYPE_P(kw_values[3])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- const char* lora_path = StringValueCStr(kw_values[0]);
- const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
- const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
- const float scale = kw_values[3] == Qundef ? 1.0 : NUM2DBL(kw_values[3]);
+ static VALUE _llama_model_get_model_vocab_type(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_vocab_type(ptr->model));
+ }

+ static VALUE _llama_model_get_model_rope_type(VALUE self) {
 LLaMAModelWrapper* ptr = get_llama_model(self);
-
- rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
- return Qnil;
- }
- return Qnil;
+ return INT2NUM(llama_rope_type(ptr->model));
 }

 static VALUE _llama_model_get_model_n_vocab(VALUE self) {
@@ -2038,8 +2012,6 @@ public:
 rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
 rb_define_attr(rb_cLLaMAContext, "model", 1, 0);
 rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
- rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
- rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
 rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
 rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
 rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -2054,14 +2026,16 @@ public:
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
- rb_define_method(rb_cLLaMAContext, "
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_add", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_add), 4);
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_pos_max", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_pos_max), 1);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
 rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
 rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
 rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
 rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
 rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
- rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
 rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
 rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
 rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
@@ -2070,7 +2044,6 @@ public:
 rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
 rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
 rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
- rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
 rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
 rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
 rb_define_method(rb_cLLaMAContext, "sample_token_greedy", RUBY_METHOD_FUNC(_llama_context_sample_token_greedy), 1);
@@ -2122,110 +2095,6 @@ private:
 return Qnil;
 }

- static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("tokens"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<llama_token> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE token = rb_ary_entry(kw_values[0], i);
- if (!RB_INTEGER_TYPE_P(token)) {
- rb_raise(rb_eArgError, "tokens must be an array of integers");
- return Qnil;
- }
- embd[i] = NUM2INT(token);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
- static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval_embd is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<float> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE el = rb_ary_entry(kw_values[0], i);
- if (!RB_FLOAT_TYPE_P(el)) {
- rb_raise(rb_eArgError, "embd must be an array of floats");
- return Qnil;
- }
- embd[i] = NUM2DBL(el);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
 static VALUE _llama_context_decode(VALUE self, VALUE batch) {
 LLaMAContextWrapper* ptr = get_llama_context(self);
 if (ptr->ctx == NULL) {
@@ -2430,13 +2299,13 @@ private:
 return Qnil;
 }

- static VALUE
+ static VALUE _llama_context_kv_cache_seq_add(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
 LLaMAContextWrapper* ptr = get_llama_context(self);
 if (ptr->ctx == NULL) {
 rb_raise(rb_eArgError, "LLaMA context is not initialized");
 return Qnil;
 }
-
+ llama_kv_cache_seq_add(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
 return Qnil;
 }

@@ -2450,6 +2319,35 @@ private:
 return Qnil;
 }

+ static VALUE _llama_context_kv_cache_seq_pos_max(VALUE self, VALUE seq_id) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_kv_cache_seq_pos_max(ptr->ctx, NUM2INT(seq_id)));
+ }
+
+ static VALUE _llama_context_kv_cache_defrag(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_defrag(ptr->ctx);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_kv_cache_update(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_update(ptr->ctx);
+ return Qnil;
+ }
+
 static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
 LLaMAContextWrapper* ptr = get_llama_context(self);
 if (ptr->ctx == NULL) {
@@ -2659,46 +2557,6 @@ private:
 return Qnil;
 }

- static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
- VALUE kw_values[2] = { Qundef, Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
- if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
- rb_raise(rb_eArgError, "guidance must be a Context");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
-
- LLaMAContextWrapper* guidance_ptr = get_llama_context(kw_values[0]);
- if (guidance_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "guidance context is not initialized");
- return Qnil;
- }
- const float scale = NUM2DBL(kw_values[1]);
-
- llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
-
- return Qnil;
- }
-
 static VALUE _llama_context_sample_softmax(VALUE self, VALUE candidates) {
 if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
 rb_raise(rb_eArgError, "argument must be a TokenDataArray");
@@ -2994,42 +2852,6 @@ private:
 return Qnil;
 }

- static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("temperature") };
- VALUE kw_values[1] = { Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
-
- if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
- rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "temperature must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
- const float temperature = NUM2DBL(kw_values[0]);
-
- llama_sample_temperature(ctx_ptr->ctx, &(cnd_ptr->array), temperature);
-
- return Qnil;
- }
-
 static VALUE _llama_context_sample_token_mirostat(int argc, VALUE* argv, VALUE self) {
 VALUE kw_args = Qnil;
 ID kw_table[4] = { rb_intern("tau"), rb_intern("eta"), rb_intern("m"), rb_intern("mu") };
@@ -3307,16 +3129,6 @@ static VALUE rb_llama_time_us(VALUE self) {
 return LONG2NUM(llama_time_us());
 }

- static VALUE rb_llama_mmap_supported(VALUE self) {
- rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
- return llama_mmap_supported() ? Qtrue : Qfalse;
- }
-
- static VALUE rb_llama_mlock_supported(VALUE self) {
- rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
- return llama_mlock_supported() ? Qtrue : Qfalse;
- }
-
 static VALUE rb_llama_max_devices(VALUE self) {
 return SIZET2NUM(llama_max_devices());
 }
@@ -3355,8 +3167,6 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
 rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
 rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
- rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
- rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
 rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
 rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
 rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
@@ -3394,16 +3204,16 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
- rb_define_const(rb_mLLaMACpp, "
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));

 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));

 rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
 rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
@@ -3413,19 +3223,19 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
 rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));

- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_NONE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));

- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));

- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
- rb_define_const(rb_mLLaMACpp, "
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_ROW", INT2NUM(LLAMA_SPLIT_MODE_ROW));

 std::stringstream ss_magic;
 ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
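For code migrating off the entry points removed above, the remaining registrations point at the replacements. A hypothetical before/after sketch in Ruby (the `context`, `candidates`, and `batch` objects are assumed to already exist; building a LLaMACpp::Batch is not shown):

# before (0.12.x, removed in 0.13.0)
# context.eval(tokens: tokens, n_past: 0)
# context.sample_temperature(candidates, temperature: 0.8)
# LLaMACpp.mmap_supported?

# after (0.13.0)
context.decode(batch)                        # decode(::LLaMACpp::Batch) was already present in 0.12.x
context.sample_temp(candidates, temp: 0.8)   # replaces sample_temperature
LLaMACpp.supports_mmap?                      # replaces mmap_supported?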
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
- VERSION = '0.12.7'
+ VERSION = '0.13.0'

 # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2143'
+ LLAMA_CPP_VERSION = 'b2303'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
 LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
 LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
 LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
-
+ LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
 LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
 LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

-
-
-
+ LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer

 LLAMA_GRETYPE_END: Integer
 LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,19 @@ module LLaMACpp
 LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
 LLAMA_GRETYPE_CHAR_ALT: Integer

-
-
-
-
-
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+ LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+ LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+ LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

-
-
-
+ LLAMA_POOLING_TYPE_NONE: Integer
+ LLAMA_POOLING_TYPE_MEAN: Integer
+ LLAMA_POOLING_TYPE_CLS: Integer

-
-
-
+ LLAMA_SPLIT_MODE_NONE: Integer
+ LLAMA_SPLIT_MODE_LAYER: Integer
+ LLAMA_SPLIT_MODE_ROW: Integer

 def self?.backend_init: () -> void
 def self?.backend_free: () -> void
@@ -68,8 +68,6 @@ module LLaMACpp
 ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
 def self?.print_system_info: () -> void
 def self?.time_us: () -> Integer
- def self?.mmap_supported?: () -> bool
- def self?.mlock_supported?: () -> bool
 def self?.max_devices: () -> Integer
 def self?.supports_mmap?: () -> bool
 def self?.supports_mlock?: () -> bool
@@ -103,7 +101,8 @@ module LLaMACpp
 def empty?: () -> bool
 def free: () -> void
 def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
- def
+ def vocab_type: () -> Integer
+ def rope_type: () -> Integer
 def n_vocab: () -> Integer
 def n_ctx_train: () -> Integer
 def n_embd: () -> Integer
@@ -202,8 +201,6 @@ module LLaMACpp
 def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
 def embeddings: () -> Array[Float]
 def embeddings_ith: (Integer) -> Array[Float]
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
- def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
 def decode: (::LLaMACpp::Batch) -> void
 def logits: () -> Array[Float]
 def n_ctx: () -> Integer
@@ -216,14 +213,16 @@ module LLaMACpp
 def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
 def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
 def kv_cache_seq_keep: (Integer) -> void
- def
+ def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
 def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_pos_max: (Integer) -> Integer
+ def kv_cache_defrag: () -> void
+ def kv_cache_update: () -> void
 def set_rng_seed: (Integer) -> void
 def load_session_file: (session_path: String) -> void
 def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
 def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
 def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
 def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
 def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
 def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -232,7 +231,6 @@ module LLaMACpp
 def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
 def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
 def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
 def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
 def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
 def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -270,12 +268,12 @@ module LLaMACpp
 def yarn_beta_slow: () -> Float
 def yarn_orig_ctx=: (Integer) -> Integer
 def yarn_orig_ctx: () -> Integer
+ def defrag_thold=: (Float) -> Float
+ def defrag_thold: () -> Float
 def type_k=: (Integer) -> Integer
 def type_k: () -> Integer
 def type_v=: (Integer) -> Integer
 def type_v: () -> Integer
- def mul_mat_q: () -> bool
- def mul_mat_q=: (bool) -> bool
 def logits_all: () -> bool
 def logits_all=: (bool) -> bool
 def embedding: () -> bool
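The constant renames mentioned in the CHANGELOG appear above as the new LLAMA_ROPE_SCALING_TYPE_*, LLAMA_POOLING_TYPE_*, LLAMA_SPLIT_MODE_*, and LLAMA_KV_OVERRIDE_TYPE_* signatures (the pre-rename spellings are elided in this diff). They remain plain Integer constants on the LLaMACpp module, for example:

# a few of the renamed constants, all Integer-valued per the signatures above
LLaMACpp::LLAMA_ROPE_SCALING_TYPE_LINEAR
LLaMACpp::LLAMA_POOLING_TYPE_MEAN
LLaMACpp::LLAMA_SPLIT_MODE_LAYER
LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT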
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-
-
+ ifneq ('', '$(wildcard /opt/cuda)')
+ CUDA_PATH ?= /opt/cuda
+ else
+ CUDA_PATH ?= /usr/local/cuda
+ endif
+ MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 OBJS += ggml-cuda.o
 MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
- CUDA_VERSION := $(shell
+ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH