llama_cpp 0.12.6 → 0.13.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 296b29b7d20c7bfd66f69749ccd41e63d6998589af0d3514db8f6c08011d545f
-   data.tar.gz: 48f8787a63759a95049bbc515f4b35c74d07b356f1635d751d8d9d852e386c5a
+   metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
+   data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
  SHA512:
-   metadata.gz: 5cd4c284a31fcdd36565b481c2456545eaf3fe19fda3778121f26f529ca01d18a894ba73739d966dc29f5aa239f8784ed56801bac5db3d21ae13e5b5aa2b4012
-   data.tar.gz: 7d03f1d081d097913fe3489a0432a5869a13e0a0371458c6c4d6cdea7296422a5af51c13ae05ea0d752e068865cc99e52ee0c4f3d67de892003c76e9126d5940
+   metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
+   data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,24 @@
+ ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
+
+ - Bump bundled llama.cpp from b2143 to b2303.
+ - Remove deprecated methods:
+   - `mmap_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
+ - Rename some constants.
+ - Rename `kv_cache_seq_shift` method to `kv_cache_seq_add`.
+ - Add `defrag_thold` accessor to `ContextParams`.
+ - Add `vocab_type` and `rope_type` methods to `Model`.
+ - Add `kv_cache_seq_pos_max`, `kv_cache_defrag`, and `kv_cache_update` methods to `Context`.
+
+ ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
+
+ - Bump bundled llama.cpp from b2106 to b2143.
+ - Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
+ - Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
+ - Add `numa_init` module function to `LLaMACpp`.
+ - Remove unnecessary argument from `backend_init`.
+
+ Implementation of llama_chat_apply_template binding has been postponed for the time being.
+
  ## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17

  - Bump bundled llama.cpp from b2106 to b2143.
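Taken together, the 0.12.7 and 0.13.0 entries above change how the module is initialized: `backend_init` loses its `numa:` keyword and NUMA setup moves to the new `numa_init` module function. A minimal sketch of the new start-up sequence (hedged: the NUMA strategy is passed as a plain integer here, since no named strategy constants appear in this diff):

```ruby
require 'llama_cpp'

# 0.12.7+: backend_init takes no arguments; NUMA configuration is a separate call.
LLaMACpp.backend_init
LLaMACpp.numa_init(1) # integer forwarded to ggml_numa_strategy by the binding

# ... load models and run inference here ...

LLaMACpp.backend_free
```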
@@ -966,12 +966,12 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold=", RUBY_METHOD_FUNC(_llama_context_params_set_defrag_thold), 1);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold", RUBY_METHOD_FUNC(_llama_context_params_get_defrag_thold), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
  rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
  rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
  rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
@@ -1146,6 +1146,18 @@ private:
    return UINT2NUM(ptr->params.yarn_orig_ctx);
  }

+ // defrag_thold
+ static VALUE _llama_context_params_set_defrag_thold(VALUE self, VALUE defrag_thold) {
+   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+   ptr->params.defrag_thold = NUM2DBL(defrag_thold);
+   return DBL2NUM(ptr->params.defrag_thold);
+ }
+
+ static VALUE _llama_context_params_get_defrag_thold(VALUE self) {
+   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+   return DBL2NUM(ptr->params.defrag_thold);
+ }
+
  static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
    return UINT2NUM(ptr->params.yarn_orig_ctx);
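A short sketch of the `defrag_thold` accessor defined above (assumption: as in upstream llama.cpp, a value greater than zero enables KV-cache defragmentation once the fraction of wasted cells exceeds the threshold):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.defrag_thold = 0.1 # stored as a C float via NUM2DBL
puts params.defrag_thold  # read back through DBL2NUM, => ~0.1
```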
@@ -1175,18 +1187,6 @@ private:
    return INT2NUM(ptr->params.type_v);
  }

- // mul_mat_q
- static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
-   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-   ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
-   return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
-   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-   return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
  // logits_all
  static VALUE _llama_context_params_set_logits_all(VALUE self, VALUE logits_all) {
    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1433,7 +1433,8 @@ public:
  rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
- rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+ rb_define_method(rb_cLLaMAModel, "vocab_type", RUBY_METHOD_FUNC(_llama_model_get_model_vocab_type), 0);
+ rb_define_method(rb_cLLaMAModel, "rope_type", RUBY_METHOD_FUNC(_llama_model_get_model_rope_type), 0);
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
@@ -1559,41 +1560,14 @@ private:
    return Qnil;
  }

- static VALUE _llama_model_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
-   VALUE kw_args = Qnil;
-   ID kw_table[4] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads"), rb_intern("scale") };
-   VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
-   rb_scan_args(argc, argv, ":", &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
-
-   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
-     rb_raise(rb_eArgError, "lora_path must be a string");
-     return Qnil;
-   }
-   if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
-     rb_raise(rb_eArgError, "base_model_path must be a string");
-     return Qnil;
-   }
-   if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
-     rb_raise(rb_eArgError, "n_threads must be an integer");
-     return Qnil;
-   }
-   if (kw_values[3] != Qundef && !RB_FLOAT_TYPE_P(kw_values[3])) {
-     rb_raise(rb_eArgError, "scale must be a float");
-     return Qnil;
-   }
-
-   const char* lora_path = StringValueCStr(kw_values[0]);
-   const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
-   const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
-   const float scale = kw_values[3] == Qundef ? 1.0 : NUM2DBL(kw_values[3]);
+ static VALUE _llama_model_get_model_vocab_type(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_vocab_type(ptr->model));
+ }

+ static VALUE _llama_model_get_model_rope_type(VALUE self) {
    LLaMAModelWrapper* ptr = get_llama_model(self);
-   if (llama_model_apply_lora_from_file(ptr->model, lora_path, scale, base_model_path, n_threads) != 0) {
-     rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
-     return Qnil;
-   }
-   return Qnil;
+   return INT2NUM(llama_rope_type(ptr->model));
  }

  static VALUE _llama_model_get_model_n_vocab(VALUE self) {
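The hunk above drops `Model#apply_lora_from_file` and adds two read-only queries wrapping `llama_vocab_type` and `llama_rope_type`. A hedged usage sketch; the `ModelParams` class and the `model_path:`/`params:` keywords follow the gem's documented constructor and are not part of this hunk:

```ruby
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

puts model.vocab_type # raw integer enum value from llama_vocab_type
puts model.rope_type  # raw integer enum value from llama_rope_type
```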
@@ -2038,8 +2012,6 @@ public:
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
  rb_define_attr(rb_cLLaMAContext, "model", 1, 0);
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
- rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
- rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -2054,14 +2026,16 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
- rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_add", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_add), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_pos_max", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_pos_max), 1);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
  rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
- rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
  rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
@@ -2070,7 +2044,6 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
  rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
  rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
- rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_greedy", RUBY_METHOD_FUNC(_llama_context_sample_token_greedy), 1);
@@ -2122,110 +2095,6 @@ private:
    return Qnil;
  }

- static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
-   VALUE kw_args = Qnil;
-   ID kw_table[3] = { rb_intern("tokens"), rb_intern("n_past"), rb_intern("n_tokens") };
-   VALUE kw_values[3] = { Qundef, Qundef, Qundef };
-   rb_scan_args(argc, argv, ":", &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
-   rb_warn("eval is deprecated. Use decode instead.");
-
-   if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
-     rb_raise(rb_eArgError, "tokens must be an Array");
-     return Qnil;
-   }
-   if (!RB_INTEGER_TYPE_P(kw_values[1])) {
-     rb_raise(rb_eArgError, "n_past must be an integer");
-     return Qnil;
-   }
-   if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
-     rb_raise(rb_eArgError, "n_tokens must be an integer");
-     return Qnil;
-   }
-
-   const size_t tokens_len = RARRAY_LEN(kw_values[0]);
-   std::vector<llama_token> embd(tokens_len);
-   for (size_t i = 0; i < tokens_len; i++) {
-     VALUE token = rb_ary_entry(kw_values[0], i);
-     if (!RB_INTEGER_TYPE_P(token)) {
-       rb_raise(rb_eArgError, "tokens must be an array of integers");
-       return Qnil;
-     }
-     embd[i] = NUM2INT(token);
-   }
-
-   const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
-   const int n_past = NUM2INT(kw_values[1]);
-
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
-     rb_raise(rb_eRuntimeError, "Failed to evaluate");
-     return Qnil;
-   }
-
-   rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
-   rb_iv_set(self, "@has_evaluated", Qtrue);
-
-   return Qnil;
- }
-
- static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
-   VALUE kw_args = Qnil;
-   ID kw_table[3] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens") };
-   VALUE kw_values[3] = { Qundef, Qundef, Qundef };
-   rb_scan_args(argc, argv, ":", &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
-   rb_warn("eval_embd is deprecated. Use decode instead.");
-
-   if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
-     rb_raise(rb_eArgError, "tokens must be an Array");
-     return Qnil;
-   }
-   if (!RB_INTEGER_TYPE_P(kw_values[1])) {
-     rb_raise(rb_eArgError, "n_past must be an integer");
-     return Qnil;
-   }
-   if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
-     rb_raise(rb_eArgError, "n_tokens must be an integer");
-     return Qnil;
-   }
-
-   const size_t tokens_len = RARRAY_LEN(kw_values[0]);
-   std::vector<float> embd(tokens_len);
-   for (size_t i = 0; i < tokens_len; i++) {
-     VALUE el = rb_ary_entry(kw_values[0], i);
-     if (!RB_FLOAT_TYPE_P(el)) {
-       rb_raise(rb_eArgError, "embd must be an array of floats");
-       return Qnil;
-     }
-     embd[i] = NUM2DBL(el);
-   }
-
-   const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
-   const int n_past = NUM2INT(kw_values[1]);
-
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
-     rb_raise(rb_eRuntimeError, "Failed to evaluate");
-     return Qnil;
-   }
-
-   rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
-   rb_iv_set(self, "@has_evaluated", Qtrue);
-
-   return Qnil;
- }
-
  static VALUE _llama_context_decode(VALUE self, VALUE batch) {
    LLaMAContextWrapper* ptr = get_llama_context(self);
    if (ptr->ctx == NULL) {
@@ -2430,13 +2299,13 @@ private:
    return Qnil;
  }

- static VALUE _llama_context_kv_cache_seq_shift(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
+ static VALUE _llama_context_kv_cache_seq_add(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
    LLaMAContextWrapper* ptr = get_llama_context(self);
    if (ptr->ctx == NULL) {
      rb_raise(rb_eArgError, "LLaMA context is not initialized");
      return Qnil;
    }
-   llama_kv_cache_seq_shift(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
+   llama_kv_cache_seq_add(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
    return Qnil;
  }

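`kv_cache_seq_shift` becomes `kv_cache_seq_add`; the positional arguments (seq_id, p0, p1, delta) are unchanged. A sketch of the usual context-shift recipe built on it (assumption: `context` is an initialized `LLaMACpp::Context`, and the values are illustrative):

```ruby
n_keep    = 4   # tokens at the start of sequence 0 to preserve
n_discard = 16  # tokens to drop to make room

# Remove a window of old cells, then slide the remaining positions back.
context.kv_cache_seq_rm(0, n_keep, n_keep + n_discard)
context.kv_cache_seq_add(0, n_keep + n_discard, -1, -n_discard)
```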
@@ -2450,6 +2319,35 @@ private:
    return Qnil;
  }

+ static VALUE _llama_context_kv_cache_seq_pos_max(VALUE self, VALUE seq_id) {
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eArgError, "LLaMA context is not initialized");
+     return Qnil;
+   }
+   return INT2NUM(llama_kv_cache_seq_pos_max(ptr->ctx, NUM2INT(seq_id)));
+ }
+
+ static VALUE _llama_context_kv_cache_defrag(VALUE self) {
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
+   llama_kv_cache_defrag(ptr->ctx);
+   return Qnil;
+ }
+
+ static VALUE _llama_context_kv_cache_update(VALUE self) {
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
+   llama_kv_cache_update(ptr->ctx);
+   return Qnil;
+ }
+
  static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
    LLaMAContextWrapper* ptr = get_llama_context(self);
    if (ptr->ctx == NULL) {
@@ -2659,46 +2557,6 @@ private:
    return Qnil;
  }

- static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
-   VALUE kw_args = Qnil;
-   ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
-   VALUE kw_values[2] = { Qundef, Qundef };
-   VALUE candidates = Qnil;
-   rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
-   if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
-     rb_raise(rb_eArgError, "guidance must be a Context");
-     return Qnil;
-   }
-   if (!RB_FLOAT_TYPE_P(kw_values[1])) {
-     rb_raise(rb_eArgError, "scale must be a float");
-     return Qnil;
-   }
-
-   LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
-   if (ctx_ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
-   if (cnd_ptr->array.data == nullptr) {
-     rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
-     return Qnil;
-   }
-
-   LLaMAContextWrapper* guidance_ptr = get_llama_context(kw_values[0]);
-   if (guidance_ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "guidance context is not initialized");
-     return Qnil;
-   }
-   const float scale = NUM2DBL(kw_values[1]);
-
-   llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
-
-   return Qnil;
- }
-
  static VALUE _llama_context_sample_softmax(VALUE self, VALUE candidates) {
    if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
      rb_raise(rb_eArgError, "argument must be a TokenDataArray");
@@ -2994,42 +2852,6 @@ private:
    return Qnil;
  }

- static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
-   VALUE kw_args = Qnil;
-   ID kw_table[1] = { rb_intern("temperature") };
-   VALUE kw_values[1] = { Qundef };
-   VALUE candidates = Qnil;
-   rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
-   rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
-
-   if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
-     rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
-     return Qnil;
-   }
-   if (!RB_FLOAT_TYPE_P(kw_values[0])) {
-     rb_raise(rb_eArgError, "temperature must be a float");
-     return Qnil;
-   }
-
-   LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
-   if (ctx_ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
-   if (cnd_ptr->array.data == nullptr) {
-     rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
-     return Qnil;
-   }
-   const float temperature = NUM2DBL(kw_values[0]);
-
-   llama_sample_temperature(ctx_ptr->ctx, &(cnd_ptr->array), temperature);
-
-   return Qnil;
- }
-
  static VALUE _llama_context_sample_token_mirostat(int argc, VALUE* argv, VALUE self) {
    VALUE kw_args = Qnil;
    ID kw_table[4] = { rb_intern("tau"), rb_intern("eta"), rb_intern("m"), rb_intern("mu") };
@@ -3243,15 +3065,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {

  // module functions

- static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
-   VALUE kw_args = Qnil;
-   ID kw_table[1] = { rb_intern("numa") };
-   VALUE kw_values[1] = { Qundef };
-   rb_scan_args(argc, argv, ":", &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
-
-   const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
-   llama_backend_init(numa);
+ static VALUE rb_llama_llama_backend_init(VALUE self) {
+   llama_backend_init();

    return Qnil;
  }
@@ -3262,6 +3077,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
    return Qnil;
  }

+ static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
+   if (!RB_INTEGER_TYPE_P(strategy)) {
+     rb_raise(rb_eArgError, "strategy must be an integer");
+     return Qnil;
+   }
+
+   llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
+
+   return Qnil;
+ }
+
  static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
    VALUE kw_args = Qnil;
    ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
@@ -3303,16 +3129,6 @@ static VALUE rb_llama_time_us(VALUE self) {
    return LONG2NUM(llama_time_us());
  }

- static VALUE rb_llama_mmap_supported(VALUE self) {
-   rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
-   return llama_mmap_supported() ? Qtrue : Qfalse;
- }
-
- static VALUE rb_llama_mlock_supported(VALUE self) {
-   rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
-   return llama_mlock_supported() ? Qtrue : Qfalse;
- }
-
  static VALUE rb_llama_max_devices(VALUE self) {
    return SIZET2NUM(llama_max_devices());
  }
@@ -3345,13 +3161,12 @@ extern "C" void Init_llama_cpp(void) {
  RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
  RbLLaMAGrammar::define_class(rb_mLLaMACpp);

- rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
+ rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
+ rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
- rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
- rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
@@ -3389,14 +3204,16 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
- rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_INT", INT2NUM(LLAMA_KV_OVERRIDE_INT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_FLOAT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_BOOL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
@@ -3406,15 +3223,19 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));

- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_NONE", INT2NUM(LLAMA_ROPE_SCALING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_LINEAR));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_NONE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));

- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_ROW", INT2NUM(LLAMA_SPLIT_MODE_ROW));

  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
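The renames above track upstream llama.cpp, which moved these enums to `*_TYPE_*` and `*_MODE_*` names; the old constant names are no longer defined on the module. A quick way to confirm the new names from Ruby:

```ruby
require 'llama_cpp'

# Each prints the integer enum value exported by the bundled llama.cpp.
puts LLaMACpp::LLAMA_ROPE_SCALING_TYPE_YARN
puts LLaMACpp::LLAMA_POOLING_TYPE_MEAN
puts LLaMACpp::LLAMA_SPLIT_MODE_LAYER
```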
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.12.6'
+   VERSION = '0.13.0'

    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b2143'
+   LLAMA_CPP_VERSION = 'b2303'
  end