llama_cpp 0.12.7 → 0.13.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
+  data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
+  data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
+## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
+
+- Bump bundled llama.cpp from b2143 to b2303.
+- Remove deprecated methods:
+  - `mmap_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
+- Rename some constants.
+- Rename `kv_cache_seq_shift` method to `kv_cache_seq_add`.
+- Add `defrag_thold` accessor to `ContextParams`.
+- Add `vocab_type` and `rope_type` methods to `Model`.
+- Add `kv_cache_seq_pos_max`, `kv_cache_defrag`, and `kv_cache_update` methods to `Context`.
+
 ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
 
 - Bump bundled llama.cpp from b2106 to b2143.
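For readers upgrading from 0.12.x, here is a comment-only Ruby mapping of the removed methods to their replacements, assembled from the deprecation warnings and registrations removed in the diffs below (`apply_lora_from_file` and the `mul_mat_q` accessors go away without a direct one-to-one replacement):

    # Removed in 0.13.0                          Replacement
    # -----------------------------------------  --------------------------------------
    # LLaMACpp.mmap_supported?                   LLaMACpp.supports_mmap?
    # LLaMACpp.mlock_supported?                  LLaMACpp.supports_mlock?
    # Context#eval / Context#eval_embd           Context#decode
    # Context#sample_temperature                 Context#sample_temp
    # Context#sample_classifier_free_guidance    Context#sample_apply_guidance
    # Context#kv_cache_seq_shift                 Context#kv_cache_seq_add (rename only)
    # Model#apply_lora_from_file                 removed
    # ContextParams#mul_mat_q / #mul_mat_q=      removed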
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -966,12 +966,12 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
     rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
     rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+    rb_define_method(rb_cLLaMAContextParams, "defrag_thold=", RUBY_METHOD_FUNC(_llama_context_params_set_defrag_thold), 1);
+    rb_define_method(rb_cLLaMAContextParams, "defrag_thold", RUBY_METHOD_FUNC(_llama_context_params_get_defrag_thold), 0);
     rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
     rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
     rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
     rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
-    rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
-    rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
     rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
     rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
     rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
@@ -1146,6 +1146,18 @@ private:
     return UINT2NUM(ptr->params.yarn_orig_ctx);
   }
 
+  // defrag_thold
+  static VALUE _llama_context_params_set_defrag_thold(VALUE self, VALUE defrag_thold) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.defrag_thold = NUM2DBL(defrag_thold);
+    return DBL2NUM(ptr->params.defrag_thold);
+  }
+
+  static VALUE _llama_context_params_get_defrag_thold(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return DBL2NUM(ptr->params.defrag_thold);
+  }
+
   static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return UINT2NUM(ptr->params.yarn_orig_ctx);
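The new accessor maps straight onto the `defrag_thold` field of `llama_context_params`. A minimal usage sketch (the 0.1 threshold is illustrative; in upstream llama.cpp, `defrag_thold` is the fragmentation level that triggers KV-cache defragmentation, and a value below 0 disables it):

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.defrag_thold = 0.1  # defragment once roughly 10% of the KV cache is holes
    puts params.defrag_thold   # the field is a C float, so expect float32 rounding of 0.1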
@@ -1175,18 +1187,6 @@ private:
     return INT2NUM(ptr->params.type_v);
   }
 
-  // mul_mat_q
-  static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
-    return ptr->params.mul_mat_q ? Qtrue : Qfalse;
-  }
-
-  static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return ptr->params.mul_mat_q ? Qtrue : Qfalse;
-  }
-
   // logits_all
   static VALUE _llama_context_params_set_logits_all(VALUE self, VALUE logits_all) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1433,7 +1433,8 @@ public:
     rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
     rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
     rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
-    rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+    rb_define_method(rb_cLLaMAModel, "vocab_type", RUBY_METHOD_FUNC(_llama_model_get_model_vocab_type), 0);
+    rb_define_method(rb_cLLaMAModel, "rope_type", RUBY_METHOD_FUNC(_llama_model_get_model_rope_type), 0);
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
@@ -1559,41 +1560,14 @@ private:
     return Qnil;
   }
 
-  static VALUE _llama_model_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
-    VALUE kw_args = Qnil;
-    ID kw_table[4] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads"), rb_intern("scale") };
-    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
-    rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
-
-    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
-      rb_raise(rb_eArgError, "lora_path must be a string");
-      return Qnil;
-    }
-    if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
-      rb_raise(rb_eArgError, "base_model_path must be a string");
-      return Qnil;
-    }
-    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
-      rb_raise(rb_eArgError, "n_threads must be an integer");
-      return Qnil;
-    }
-    if (kw_values[3] != Qundef && !RB_FLOAT_TYPE_P(kw_values[3])) {
-      rb_raise(rb_eArgError, "scale must be a float");
-      return Qnil;
-    }
-
-    const char* lora_path = StringValueCStr(kw_values[0]);
-    const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
-    const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
-    const float scale = kw_values[3] == Qundef ? 1.0 : NUM2DBL(kw_values[3]);
+  static VALUE _llama_model_get_model_vocab_type(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_vocab_type(ptr->model));
+  }
 
+  static VALUE _llama_model_get_model_rope_type(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    if (llama_model_apply_lora_from_file(ptr->model, lora_path, scale, base_model_path, n_threads) != 0) {
-      rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
-      return Qnil;
-    }
-    return Qnil;
+    return INT2NUM(llama_rope_type(ptr->model));
   }
 
   static VALUE _llama_model_get_model_n_vocab(VALUE self) {
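Both new `Model` methods simply forward to `llama_vocab_type` and `llama_rope_type` and return the raw enum value as an Integer. A short sketch, assuming the usual `Model.new(model_path:, params:)` construction and a placeholder GGUF path:

    require 'llama_cpp'

    model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
    puts model.vocab_type  # Integer enum value from llama_vocab_type (SPM, BPE, ...)
    puts model.rope_type   # Integer enum value from llama_rope_type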
@@ -2038,8 +2012,6 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_attr(rb_cLLaMAContext, "model", 1, 0);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
-    rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
-    rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
     rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -2054,14 +2026,16 @@ public:
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
-    rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+    rb_define_method(rb_cLLaMAContext, "kv_cache_seq_add", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_add), 4);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
+    rb_define_method(rb_cLLaMAContext, "kv_cache_seq_pos_max", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_pos_max), 1);
+    rb_define_method(rb_cLLaMAContext, "kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
+    rb_define_method(rb_cLLaMAContext, "kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
     rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
-    rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
     rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
     rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
     rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
@@ -2070,7 +2044,6 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
     rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
     rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
-    rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
     rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
     rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
     rb_define_method(rb_cLLaMAContext, "sample_token_greedy", RUBY_METHOD_FUNC(_llama_context_sample_token_greedy), 1);
@@ -2122,110 +2095,6 @@ private:
     return Qnil;
   }
 
-  static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
-    VALUE kw_args = Qnil;
-    ID kw_table[3] = { rb_intern("tokens"), rb_intern("n_past"), rb_intern("n_tokens") };
-    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
-    rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
-    rb_warn("eval is deprecated. Use decode instead.");
-
-    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
-      rb_raise(rb_eArgError, "tokens must be an Array");
-      return Qnil;
-    }
-    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
-      rb_raise(rb_eArgError, "n_past must be an integer");
-      return Qnil;
-    }
-    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
-      rb_raise(rb_eArgError, "n_tokens must be an integer");
-      return Qnil;
-    }
-
-    const size_t tokens_len = RARRAY_LEN(kw_values[0]);
-    std::vector<llama_token> embd(tokens_len);
-    for (size_t i = 0; i < tokens_len; i++) {
-      VALUE token = rb_ary_entry(kw_values[0], i);
-      if (!RB_INTEGER_TYPE_P(token)) {
-        rb_raise(rb_eArgError, "tokens must be an array of integers");
-        return Qnil;
-      }
-      embd[i] = NUM2INT(token);
-    }
-
-    const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
-    const int n_past = NUM2INT(kw_values[1]);
-
-    LLaMAContextWrapper* ptr = get_llama_context(self);
-    if (ptr->ctx == NULL) {
-      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-      return Qnil;
-    }
-    if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
-      rb_raise(rb_eRuntimeError, "Failed to evaluate");
-      return Qnil;
-    }
-
-    rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
-    rb_iv_set(self, "@has_evaluated", Qtrue);
-
-    return Qnil;
-  }
-
-  static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
-    VALUE kw_args = Qnil;
-    ID kw_table[3] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens") };
-    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
-    rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
-    rb_warn("eval_embd is deprecated. Use decode instead.");
-
-    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
-      rb_raise(rb_eArgError, "tokens must be an Array");
-      return Qnil;
-    }
-    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
-      rb_raise(rb_eArgError, "n_past must be an integer");
-      return Qnil;
-    }
-    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
-      rb_raise(rb_eArgError, "n_tokens must be an integer");
-      return Qnil;
-    }
-
-    const size_t tokens_len = RARRAY_LEN(kw_values[0]);
-    std::vector<float> embd(tokens_len);
-    for (size_t i = 0; i < tokens_len; i++) {
-      VALUE el = rb_ary_entry(kw_values[0], i);
-      if (!RB_FLOAT_TYPE_P(el)) {
-        rb_raise(rb_eArgError, "embd must be an array of floats");
-        return Qnil;
-      }
-      embd[i] = NUM2DBL(el);
-    }
-
-    const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
-    const int n_past = NUM2INT(kw_values[1]);
-
-    LLaMAContextWrapper* ptr = get_llama_context(self);
-    if (ptr->ctx == NULL) {
-      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-      return Qnil;
-    }
-    if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
-      rb_raise(rb_eRuntimeError, "Failed to evaluate");
-      return Qnil;
-    }
-
-    rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
-    rb_iv_set(self, "@has_evaluated", Qtrue);
-
-    return Qnil;
-  }
-
   static VALUE _llama_context_decode(VALUE self, VALUE batch) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2430,13 +2299,13 @@ private:
     return Qnil;
   }
 
-  static VALUE _llama_context_kv_cache_seq_shift(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
+  static VALUE _llama_context_kv_cache_seq_add(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
       rb_raise(rb_eArgError, "LLaMA context is not initialized");
       return Qnil;
     }
-    llama_kv_cache_seq_shift(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
+    llama_kv_cache_seq_add(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
     return Qnil;
   }
 
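Mirroring `llama_kv_cache_seq_add`, the renamed method adds `delta` to the cached positions of sequence `seq_id` in the range `[p0, p1)`. A one-line sketch, assuming an initialized `context`:

    # Shift positions 0...32 of sequence 0 forward by 4 cells;
    # the four Integer arguments are the same ones kv_cache_seq_shift took.
    context.kv_cache_seq_add(0, 0, 32, 4)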
@@ -2450,6 +2319,35 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_kv_cache_seq_pos_max(VALUE self, VALUE seq_id) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_kv_cache_seq_pos_max(ptr->ctx, NUM2INT(seq_id)));
+  }
+
+  static VALUE _llama_context_kv_cache_defrag(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_kv_cache_defrag(ptr->ctx);
+    return Qnil;
+  }
+
+  static VALUE _llama_context_kv_cache_update(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_kv_cache_update(ptr->ctx);
+    return Qnil;
+  }
+
   static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
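A sketch of the three new KV-cache helpers, again assuming an initialized `context`; each is a thin wrapper over the llama.cpp function named in the comment:

    puts context.kv_cache_seq_pos_max(0)  # largest stored position for sequence 0 (llama_kv_cache_seq_pos_max)
    context.kv_cache_defrag               # request KV-cache defragmentation (llama_kv_cache_defrag)
    context.kv_cache_update               # apply pending updates such as defragmentation (llama_kv_cache_update)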
@@ -2659,46 +2557,6 @@ private:
     return Qnil;
   }
 
-  static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
-    VALUE kw_args = Qnil;
-    ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
-    VALUE kw_values[2] = { Qundef, Qundef };
-    VALUE candidates = Qnil;
-    rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
-    if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
-      rb_raise(rb_eArgError, "guidance must be a Context");
-      return Qnil;
-    }
-    if (!RB_FLOAT_TYPE_P(kw_values[1])) {
-      rb_raise(rb_eArgError, "scale must be a float");
-      return Qnil;
-    }
-
-    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
-    if (ctx_ptr->ctx == NULL) {
-      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-      return Qnil;
-    }
-    LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
-    if (cnd_ptr->array.data == nullptr) {
-      rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
-      return Qnil;
-    }
-
-    LLaMAContextWrapper* guidance_ptr = get_llama_context(kw_values[0]);
-    if (guidance_ptr->ctx == NULL) {
-      rb_raise(rb_eRuntimeError, "guidance context is not initialized");
-      return Qnil;
-    }
-    const float scale = NUM2DBL(kw_values[1]);
-
-    llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
-
-    return Qnil;
-  }
-
   static VALUE _llama_context_sample_softmax(VALUE self, VALUE candidates) {
     if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
       rb_raise(rb_eArgError, "argument must be a TokenDataArray");
@@ -2994,42 +2852,6 @@ private:
     return Qnil;
   }
 
-  static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
-    VALUE kw_args = Qnil;
-    ID kw_table[1] = { rb_intern("temperature") };
-    VALUE kw_values[1] = { Qundef };
-    VALUE candidates = Qnil;
-    rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
-    rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
-
-    if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
-      rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
-      return Qnil;
-    }
-    if (!RB_FLOAT_TYPE_P(kw_values[0])) {
-      rb_raise(rb_eArgError, "temperature must be a float");
-      return Qnil;
-    }
-
-    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
-    if (ctx_ptr->ctx == NULL) {
-      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-      return Qnil;
-    }
-    LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
-    if (cnd_ptr->array.data == nullptr) {
-      rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
-      return Qnil;
-    }
-    const float temperature = NUM2DBL(kw_values[0]);
-
-    llama_sample_temperature(ctx_ptr->ctx, &(cnd_ptr->array), temperature);
-
-    return Qnil;
-  }
-
   static VALUE _llama_context_sample_token_mirostat(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[4] = { rb_intern("tau"), rb_intern("eta"), rb_intern("m"), rb_intern("mu") };
@@ -3307,16 +3129,6 @@ static VALUE rb_llama_time_us(VALUE self) {
   return LONG2NUM(llama_time_us());
 }
 
-static VALUE rb_llama_mmap_supported(VALUE self) {
-  rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
-  return llama_mmap_supported() ? Qtrue : Qfalse;
-}
-
-static VALUE rb_llama_mlock_supported(VALUE self) {
-  rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
-  return llama_mlock_supported() ? Qtrue : Qfalse;
-}
-
 static VALUE rb_llama_max_devices(VALUE self) {
   return SIZET2NUM(llama_max_devices());
 }
@@ -3355,8 +3167,6 @@ extern "C" void Init_llama_cpp(void) {
|
|
3355
3167
|
rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
|
3356
3168
|
rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
|
3357
3169
|
rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
|
3358
|
-
rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
|
3359
|
-
rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
|
3360
3170
|
rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
|
3361
3171
|
rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
|
3362
3172
|
rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
|
@@ -3394,16 +3204,16 @@ extern "C" void Init_llama_cpp(void) {
|
|
3394
3204
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
|
3395
3205
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
|
3396
3206
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
|
3397
|
-
rb_define_const(rb_mLLaMACpp, "
|
3207
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS));
|
3398
3208
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
|
3399
3209
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
|
3400
3210
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
|
3401
3211
|
|
3402
3212
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
|
3403
3213
|
|
3404
|
-
rb_define_const(rb_mLLaMACpp, "
|
3405
|
-
rb_define_const(rb_mLLaMACpp, "
|
3406
|
-
rb_define_const(rb_mLLaMACpp, "
|
3214
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
|
3215
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
|
3216
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
|
3407
3217
|
|
3408
3218
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
|
3409
3219
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
|
@@ -3413,19 +3223,19 @@ extern "C" void Init_llama_cpp(void) {
|
|
3413
3223
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
|
3414
3224
|
rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));
|
3415
3225
|
|
3416
|
-
rb_define_const(rb_mLLaMACpp, "
|
3417
|
-
rb_define_const(rb_mLLaMACpp, "
|
3418
|
-
rb_define_const(rb_mLLaMACpp, "
|
3419
|
-
rb_define_const(rb_mLLaMACpp, "
|
3420
|
-
rb_define_const(rb_mLLaMACpp, "
|
3226
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED));
|
3227
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_NONE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE));
|
3228
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR));
|
3229
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
|
3230
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));
|
3421
3231
|
|
3422
|
-
rb_define_const(rb_mLLaMACpp, "
|
3423
|
-
rb_define_const(rb_mLLaMACpp, "
|
3424
|
-
rb_define_const(rb_mLLaMACpp, "
|
3232
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
|
3233
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
|
3234
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
|
3425
3235
|
|
3426
|
-
rb_define_const(rb_mLLaMACpp, "
|
3427
|
-
rb_define_const(rb_mLLaMACpp, "
|
3428
|
-
rb_define_const(rb_mLLaMACpp, "
|
3236
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
|
3237
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
|
3238
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_ROW", INT2NUM(LLAMA_SPLIT_MODE_ROW));
|
3429
3239
|
|
3430
3240
|
std::stringstream ss_magic;
|
3431
3241
|
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
|
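The renames track upstream llama.cpp's enum cleanup. A comment-only Ruby sketch of the pattern (the old names are reconstructed from the pre-rename llama.h, so treat them as assumptions):

    # 0.12.x name                  0.13.0 name
    # LLAMA_KV_OVERRIDE_INT        LLAMA_KV_OVERRIDE_TYPE_INT
    # LLAMA_ROPE_SCALING_LINEAR    LLAMA_ROPE_SCALING_TYPE_LINEAR
    # LLAMA_POOLING_MEAN           LLAMA_POOLING_TYPE_MEAN
    # LLAMA_SPLIT_LAYER            LLAMA_SPLIT_MODE_LAYER
    puts LLaMACpp::LLAMA_SPLIT_MODE_ROW  # constants are plain Integers taken from llama.h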
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.7'
+  VERSION = '0.13.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2143'
+  LLAMA_CPP_VERSION = 'b2303'
 end
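The two constants make the upgrade easy to verify at runtime:

    require 'llama_cpp'

    puts LLaMACpp::VERSION            # => "0.13.0"
    puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2303"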
data/sig/llama_cpp.rbs
CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
-  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
 
-  LLAMA_KV_OVERRIDE_INT: Integer
-  LLAMA_KV_OVERRIDE_FLOAT: Integer
-  LLAMA_KV_OVERRIDE_BOOL: Integer
+  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+  LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
 
   LLAMA_GRETYPE_END: Integer
   LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,19 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
-  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
-  LLAMA_ROPE_SCALING_NONE: Integer
-  LLAMA_ROPE_SCALING_LINEAR: Integer
-  LLAMA_ROPE_SCALING_YARN: Integer
-  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+  LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+  LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+  LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+  LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
 
-  LLAMA_POOLING_NONE: Integer
-  LLAMA_POOLING_MEAN: Integer
-  LLAMA_POOLING_CLS: Integer
+  LLAMA_POOLING_TYPE_NONE: Integer
+  LLAMA_POOLING_TYPE_MEAN: Integer
+  LLAMA_POOLING_TYPE_CLS: Integer
 
-  LLAMA_SPLIT_NONE: Integer
-  LLAMA_SPLIT_LAYER: Integer
-  LLAMA_SPLIT_ROW: Integer
+  LLAMA_SPLIT_MODE_NONE: Integer
+  LLAMA_SPLIT_MODE_LAYER: Integer
+  LLAMA_SPLIT_MODE_ROW: Integer
 
   def self?.backend_init: () -> void
   def self?.backend_free: () -> void
@@ -68,8 +68,6 @@ module LLaMACpp
     ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.time_us: () -> Integer
-  def self?.mmap_supported?: () -> bool
-  def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
   def self?.supports_mmap?: () -> bool
   def self?.supports_mlock?: () -> bool
@@ -103,7 +101,8 @@ module LLaMACpp
     def empty?: () -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
-    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer, ?scale: Float) -> void
+    def vocab_type: () -> Integer
+    def rope_type: () -> Integer
     def n_vocab: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
@@ -202,8 +201,6 @@ module LLaMACpp
    def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
    def embeddings: () -> Array[Float]
    def embeddings_ith: (Integer) -> Array[Float]
-   def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
-   def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
    def decode: (::LLaMACpp::Batch) -> void
    def logits: () -> Array[Float]
    def n_ctx: () -> Integer
@@ -216,14 +213,16 @@ module LLaMACpp
    def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
    def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
    def kv_cache_seq_keep: (Integer) -> void
-   def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+   def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
    def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+   def kv_cache_seq_pos_max: (Integer) -> Integer
+   def kv_cache_defrag: () -> void
+   def kv_cache_update: () -> void
    def set_rng_seed: (Integer) -> void
    def load_session_file: (session_path: String) -> void
    def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
    def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
-   def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
    def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
    def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
    def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -232,7 +231,6 @@ module LLaMACpp
    def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
    def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
    def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
-   def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
    def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
    def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
    def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -270,12 +268,12 @@ module LLaMACpp
    def yarn_beta_slow: () -> Float
    def yarn_orig_ctx=: (Integer) -> Integer
    def yarn_orig_ctx: () -> Integer
+   def defrag_thold=: (Float) -> Float
+   def defrag_thold: () -> Float
    def type_k=: (Integer) -> Integer
    def type_k: () -> Integer
    def type_v=: (Integer) -> Integer
    def type_v: () -> Integer
-   def mul_mat_q: () -> bool
-   def mul_mat_q=: (bool) -> bool
    def logits_all: () -> bool
    def logits_all=: (bool) -> bool
    def embedding: () -> bool
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
-	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+	ifneq ('', '$(wildcard /opt/cuda)')
+		CUDA_PATH ?= /opt/cuda
+	else
+		CUDA_PATH ?= /usr/local/cuda
+	endif
+	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 	$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
-	CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+	CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH