llama_cpp 0.12.7 → 0.13.0

This diff shows the changes between publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
- data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
+ metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
+ data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
  SHA512:
- metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
- data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
+ metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
+ data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
+ ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
+
+ - Bump bundled llama.cpp from b2143 to b2303.
+ - Remove deprecated methods:
+   - `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
+ - Rename some constants.
+ - Rename `kv_cache_seq_shift` method to `kv_cache_seq_add`.
+ - Add `defrag_thold` accessor to `ContextParams`.
+ - Add `vocab_type` and `rope_type` methods to `Model`.
+ - Add `kv_cache_seq_pos_max`, `kv_cache_defrag`, and `kv_cache_update` methods to `Context`.
+
  ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24

  - Bump bundled llama.cpp from b2106 to b2143.
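Taken together, the 0.13.0 entries above amount to a small migration for Ruby callers. A minimal, hedged sketch of post-upgrade usage, assuming a placeholder GGUF model path and that `Model.new` and `ModelParams.new` take keyword arguments in the same style as `Model#load` in the RBS signatures below; only the method and constant names come from this diff:

```ruby
require 'llama_cpp'

# The deprecated module-level checks are gone; the supports_* variants remain.
puts LLaMACpp.supports_mmap?
puts LLaMACpp.supports_mlock?

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)

context_params = LLaMACpp::ContextParams.new
context_params.defrag_thold = 0.1 # new accessor: KV cache defragmentation threshold
context = LLaMACpp::Context.new(model: model, params: context_params)

# eval/eval_embd are removed; Context#decode with a Batch is the remaining path.
# kv_cache_seq_shift is renamed to kv_cache_seq_add (seq_id, p0, p1, delta).
context.kv_cache_seq_add(0, 0, -1, 4)
```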
@@ -966,12 +966,12 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold=", RUBY_METHOD_FUNC(_llama_context_params_set_defrag_thold), 1);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold", RUBY_METHOD_FUNC(_llama_context_params_get_defrag_thold), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
  rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
  rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
  rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
@@ -1146,6 +1146,18 @@ private:
  return UINT2NUM(ptr->params.yarn_orig_ctx);
  }

+ // defrag_thold
+ static VALUE _llama_context_params_set_defrag_thold(VALUE self, VALUE defrag_thold) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.defrag_thold = NUM2DBL(defrag_thold);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
+ static VALUE _llama_context_params_get_defrag_thold(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
  static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return UINT2NUM(ptr->params.yarn_orig_ctx);
@@ -1175,18 +1187,6 @@ private:
  return INT2NUM(ptr->params.type_v);
  }

- // mul_mat_q
- static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
  // logits_all
  static VALUE _llama_context_params_set_logits_all(VALUE self, VALUE logits_all) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1433,7 +1433,8 @@ public:
  rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
- rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+ rb_define_method(rb_cLLaMAModel, "vocab_type", RUBY_METHOD_FUNC(_llama_model_get_model_vocab_type), 0);
+ rb_define_method(rb_cLLaMAModel, "rope_type", RUBY_METHOD_FUNC(_llama_model_get_model_rope_type), 0);
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
@@ -1559,41 +1560,14 @@ private:
  return Qnil;
  }

- static VALUE _llama_model_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[4] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads"), rb_intern("scale") };
- VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
-
- if (!RB_TYPE_P(kw_values[0], T_STRING)) {
- rb_raise(rb_eArgError, "lora_path must be a string");
- return Qnil;
- }
- if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
- rb_raise(rb_eArgError, "base_model_path must be a string");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_threads must be an integer");
- return Qnil;
- }
- if (kw_values[3] != Qundef && !RB_FLOAT_TYPE_P(kw_values[3])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- const char* lora_path = StringValueCStr(kw_values[0]);
- const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
- const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
- const float scale = kw_values[3] == Qundef ? 1.0 : NUM2DBL(kw_values[3]);
+ static VALUE _llama_model_get_model_vocab_type(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_vocab_type(ptr->model));
+ }

+ static VALUE _llama_model_get_model_rope_type(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- if (llama_model_apply_lora_from_file(ptr->model, lora_path, scale, base_model_path, n_threads) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
- return Qnil;
- }
- return Qnil;
+ return INT2NUM(llama_rope_type(ptr->model));
  }

  static VALUE _llama_model_get_model_n_vocab(VALUE self) {
@@ -2038,8 +2012,6 @@ public:
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
  rb_define_attr(rb_cLLaMAContext, "model", 1, 0);
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
- rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
- rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -2054,14 +2026,16 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
- rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_add", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_add), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_pos_max", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_pos_max), 1);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
  rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
- rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
  rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
@@ -2070,7 +2044,6 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
  rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
  rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
- rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_greedy", RUBY_METHOD_FUNC(_llama_context_sample_token_greedy), 1);
@@ -2122,110 +2095,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("tokens"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<llama_token> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE token = rb_ary_entry(kw_values[0], i);
- if (!RB_INTEGER_TYPE_P(token)) {
- rb_raise(rb_eArgError, "tokens must be an array of integers");
- return Qnil;
- }
- embd[i] = NUM2INT(token);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
- static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval_embd is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<float> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE el = rb_ary_entry(kw_values[0], i);
- if (!RB_FLOAT_TYPE_P(el)) {
- rb_raise(rb_eArgError, "embd must be an array of floats");
- return Qnil;
- }
- embd[i] = NUM2DBL(el);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
  static VALUE _llama_context_decode(VALUE self, VALUE batch) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2430,13 +2299,13 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_kv_cache_seq_shift(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
+ static VALUE _llama_context_kv_cache_seq_add(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eArgError, "LLaMA context is not initialized");
  return Qnil;
  }
- llama_kv_cache_seq_shift(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
+ llama_kv_cache_seq_add(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
  return Qnil;
  }

@@ -2450,6 +2319,35 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_kv_cache_seq_pos_max(VALUE self, VALUE seq_id) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_kv_cache_seq_pos_max(ptr->ctx, NUM2INT(seq_id)));
+ }
+
+ static VALUE _llama_context_kv_cache_defrag(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_defrag(ptr->ctx);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_kv_cache_update(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_update(ptr->ctx);
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2659,46 +2557,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
- VALUE kw_values[2] = { Qundef, Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
- if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
- rb_raise(rb_eArgError, "guidance must be a Context");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
-
- LLaMAContextWrapper* guidance_ptr = get_llama_context(kw_values[0]);
- if (guidance_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "guidance context is not initialized");
- return Qnil;
- }
- const float scale = NUM2DBL(kw_values[1]);
-
- llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_softmax(VALUE self, VALUE candidates) {
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
  rb_raise(rb_eArgError, "argument must be a TokenDataArray");
@@ -2994,42 +2852,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("temperature") };
- VALUE kw_values[1] = { Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
-
- if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
- rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "temperature must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
- const float temperature = NUM2DBL(kw_values[0]);
-
- llama_sample_temperature(ctx_ptr->ctx, &(cnd_ptr->array), temperature);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_token_mirostat(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("tau"), rb_intern("eta"), rb_intern("m"), rb_intern("mu") };
@@ -3307,16 +3129,6 @@ static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
  }

- static VALUE rb_llama_mmap_supported(VALUE self) {
- rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
- return llama_mmap_supported() ? Qtrue : Qfalse;
- }
-
- static VALUE rb_llama_mlock_supported(VALUE self) {
- rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
- return llama_mlock_supported() ? Qtrue : Qfalse;
- }
-
  static VALUE rb_llama_max_devices(VALUE self) {
  return SIZET2NUM(llama_max_devices());
  }
@@ -3355,8 +3167,6 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
- rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
- rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
@@ -3394,16 +3204,16 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
- rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_INT", INT2NUM(LLAMA_KV_OVERRIDE_INT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_FLOAT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_BOOL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
@@ -3413,19 +3223,19 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));

- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_NONE", INT2NUM(LLAMA_ROPE_SCALING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_LINEAR));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_NONE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));

- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));

- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_ROW", INT2NUM(LLAMA_SPLIT_MODE_ROW));

  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.7'
+ VERSION = '0.13.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2249'
+ LLAMA_CPP_VERSION = 'b2303'
  end
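The two constants above give a quick runtime check of which gem release and bundled llama.cpp build are actually loaded; a trivial sketch:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.13.0"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2303"
```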
data/sig/llama_cpp.rbs CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
- LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

- LLAMA_KV_OVERRIDE_INT: Integer
- LLAMA_KV_OVERRIDE_FLOAT: Integer
- LLAMA_KV_OVERRIDE_BOOL: Integer
+ LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer

  LLAMA_GRETYPE_END: Integer
  LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,19 @@ module LLaMACpp
  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
  LLAMA_GRETYPE_CHAR_ALT: Integer

- LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
- LLAMA_ROPE_SCALING_NONE: Integer
- LLAMA_ROPE_SCALING_LINEAR: Integer
- LLAMA_ROPE_SCALING_YARN: Integer
- LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+ LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+ LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+ LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

- LLAMA_POOLING_NONE: Integer
- LLAMA_POOLING_MEAN: Integer
- LLAMA_POOLING_CLS: Integer
+ LLAMA_POOLING_TYPE_NONE: Integer
+ LLAMA_POOLING_TYPE_MEAN: Integer
+ LLAMA_POOLING_TYPE_CLS: Integer

- LLAMA_SPLIT_NONE: Integer
- LLAMA_SPLIT_LAYER: Integer
- LLAMA_SPLIT_ROW: Integer
+ LLAMA_SPLIT_MODE_NONE: Integer
+ LLAMA_SPLIT_MODE_LAYER: Integer
+ LLAMA_SPLIT_MODE_ROW: Integer

  def self?.backend_init: () -> void
  def self?.backend_free: () -> void
@@ -68,8 +68,6 @@ module LLaMACpp
  ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.time_us: () -> Integer
- def self?.mmap_supported?: () -> bool
- def self?.mlock_supported?: () -> bool
  def self?.max_devices: () -> Integer
  def self?.supports_mmap?: () -> bool
  def self?.supports_mlock?: () -> bool
@@ -103,7 +101,8 @@ module LLaMACpp
  def empty?: () -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
- def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
+ def vocab_type: () -> Integer
+ def rope_type: () -> Integer
  def n_vocab: () -> Integer
  def n_ctx_train: () -> Integer
  def n_embd: () -> Integer
@@ -202,8 +201,6 @@ module LLaMACpp
  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
  def embeddings: () -> Array[Float]
  def embeddings_ith: (Integer) -> Array[Float]
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
- def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
@@ -216,14 +213,16 @@ module LLaMACpp
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
  def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
  def kv_cache_seq_keep: (Integer) -> void
- def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_pos_max: (Integer) -> Integer
+ def kv_cache_defrag: () -> void
+ def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
  def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -232,7 +231,6 @@ module LLaMACpp
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
  def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
  def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -270,12 +268,12 @@ module LLaMACpp
  def yarn_beta_slow: () -> Float
  def yarn_orig_ctx=: (Integer) -> Integer
  def yarn_orig_ctx: () -> Integer
+ def defrag_thold=: (Float) -> Float
+ def defrag_thold: () -> Float
  def type_k=: (Integer) -> Integer
  def type_k: () -> Integer
  def type_v=: (Integer) -> Integer
  def type_v: () -> Integer
- def mul_mat_q: () -> bool
- def mul_mat_q=: (bool) -> bool
  def logits_all: () -> bool
  def logits_all=: (bool) -> bool
  def embedding: () -> bool
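The RBS changes above declare the new `ContextParams#defrag_thold` accessor and, a little earlier, the added `Context` KV-cache methods. A short hedged sketch of how they might be driven, reusing the `model` and `context` objects from the earlier sketch; note that the RBS and changelog name the methods `kv_cache_defrag` and `kv_cache_update`, while the C++ hunk registers them as "kv_cache_kv_cache_defrag" and "kv_cache_kv_cache_update", so the names actually callable on 0.13.0 may not match these signatures:

```ruby
# Newly exposed Model introspection: integer enum values returned by
# llama_vocab_type/llama_rope_type in the bundled llama.cpp.
puts model.vocab_type
puts model.rope_type

# Highest position stored in the KV cache for sequence 0.
puts context.kv_cache_seq_pos_max(0)

# Trigger defragmentation and apply any pending KV cache updates
# (names as declared in the RBS; see the registration mismatch noted above).
context.kv_cache_defrag
context.kv_cache_update
```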
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+ ifneq ('', '$(wildcard /opt/cuda)')
+ CUDA_PATH ?= /opt/cuda
+ else
+ CUDA_PATH ?= /usr/local/cuda
+ endif
+ MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
  ifdef LLAMA_CUBLAS
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
- CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH