llama_cpp 0.12.7 → 0.13.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
- data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
+ metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
+ data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
  SHA512:
- metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
- data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
+ metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
+ data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
+ ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
+
+ - Bump bundled llama.cpp from b2143 to b2303.
+ - Remove deprecated methods:
+   - `mmap_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
+ - Rename some constants (the `LLAMA_KV_OVERRIDE_*`, `LLAMA_ROPE_SCALING_*`, `LLAMA_POOLING_*`, and `LLAMA_SPLIT_*` families gain `_TYPE`/`_MODE` in their names).
+ - Rename `kv_cache_seq_shift` method to `kv_cache_seq_add`.
+ - Add `defrag_thold` accessor to `ContextParams`.
+ - Add `vocab_type` and `rope_type` methods to `Model`.
+ - Add `kv_cache_seq_pos_max`, `kv_cache_defrag`, and `kv_cache_update` methods to `Context`.
+
  ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24

  - Bump bundled llama.cpp from b2106 to b2143.
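A minimal Ruby sketch of the renamed and newly added API listed above. The model path is a placeholder and the ModelParams/Model constructor keywords are assumed from the gem's README rather than shown in this diff:

require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
# Placeholder path; any GGUF model readable by the bundled llama.cpp works here.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)

puts model.vocab_type # new in 0.13.0, returns the llama.cpp vocab type as an Integer
puts model.rope_type  # new in 0.13.0

context_params = LLaMACpp::ContextParams.new
context_params.defrag_thold = 0.1 # new accessor, forwarded to llama.cpp's defrag_thold context parameter
context = LLaMACpp::Context.new(model: model, params: context_params)

# The deprecated module helpers are gone; only the supports_* variants remain.
puts LLaMACpp.supports_mmap?
puts LLaMACpp.supports_mlock?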
@@ -966,12 +966,12 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold=", RUBY_METHOD_FUNC(_llama_context_params_set_defrag_thold), 1);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold", RUBY_METHOD_FUNC(_llama_context_params_get_defrag_thold), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
  rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
  rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
  rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
@@ -1146,6 +1146,18 @@ private:
  return UINT2NUM(ptr->params.yarn_orig_ctx);
  }

+ // defrag_thold
+ static VALUE _llama_context_params_set_defrag_thold(VALUE self, VALUE defrag_thold) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.defrag_thold = NUM2DBL(defrag_thold);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
+ static VALUE _llama_context_params_get_defrag_thold(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
  static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return UINT2NUM(ptr->params.yarn_orig_ctx);
@@ -1175,18 +1187,6 @@ private:
  return INT2NUM(ptr->params.type_v);
  }

- // mul_mat_q
- static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
  // logits_all
  static VALUE _llama_context_params_set_logits_all(VALUE self, VALUE logits_all) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1433,7 +1433,8 @@ public:
  rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
- rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+ rb_define_method(rb_cLLaMAModel, "vocab_type", RUBY_METHOD_FUNC(_llama_model_get_model_vocab_type), 0);
+ rb_define_method(rb_cLLaMAModel, "rope_type", RUBY_METHOD_FUNC(_llama_model_get_model_rope_type), 0);
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
@@ -1559,41 +1560,14 @@ private:
  return Qnil;
  }

- static VALUE _llama_model_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[4] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads"), rb_intern("scale") };
- VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
-
- if (!RB_TYPE_P(kw_values[0], T_STRING)) {
- rb_raise(rb_eArgError, "lora_path must be a string");
- return Qnil;
- }
- if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
- rb_raise(rb_eArgError, "base_model_path must be a string");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_threads must be an integer");
- return Qnil;
- }
- if (kw_values[3] != Qundef && !RB_FLOAT_TYPE_P(kw_values[3])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- const char* lora_path = StringValueCStr(kw_values[0]);
- const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
- const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
- const float scale = kw_values[3] == Qundef ? 1.0 : NUM2DBL(kw_values[3]);
+ static VALUE _llama_model_get_model_vocab_type(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_vocab_type(ptr->model));
+ }

+ static VALUE _llama_model_get_model_rope_type(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- if (llama_model_apply_lora_from_file(ptr->model, lora_path, scale, base_model_path, n_threads) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
- return Qnil;
- }
- return Qnil;
+ return INT2NUM(llama_rope_type(ptr->model));
  }

  static VALUE _llama_model_get_model_n_vocab(VALUE self) {
@@ -2038,8 +2012,6 @@ public:
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
  rb_define_attr(rb_cLLaMAContext, "model", 1, 0);
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
- rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
- rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -2054,14 +2026,16 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
- rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_add", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_add), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_pos_max", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_pos_max), 1);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
  rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
- rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
  rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
@@ -2122,110 +2095,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("tokens"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<llama_token> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE token = rb_ary_entry(kw_values[0], i);
- if (!RB_INTEGER_TYPE_P(token)) {
- rb_raise(rb_eArgError, "tokens must be an array of integers");
- return Qnil;
- }
- embd[i] = NUM2INT(token);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
- static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval_embd is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<float> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE el = rb_ary_entry(kw_values[0], i);
- if (!RB_FLOAT_TYPE_P(el)) {
- rb_raise(rb_eArgError, "embd must be an array of floats");
- return Qnil;
- }
- embd[i] = NUM2DBL(el);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
  static VALUE _llama_context_decode(VALUE self, VALUE batch) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2430,13 +2299,13 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_kv_cache_seq_shift(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
+ static VALUE _llama_context_kv_cache_seq_add(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eArgError, "LLaMA context is not initialized");
  return Qnil;
  }
- llama_kv_cache_seq_shift(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
+ llama_kv_cache_seq_add(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
  return Qnil;
  }

@@ -2450,6 +2319,35 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_kv_cache_seq_pos_max(VALUE self, VALUE seq_id) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_kv_cache_seq_pos_max(ptr->ctx, NUM2INT(seq_id)));
+ }
+
+ static VALUE _llama_context_kv_cache_defrag(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_defrag(ptr->ctx);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_kv_cache_update(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_update(ptr->ctx);
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
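The new Context bindings above map onto Ruby calls like the following sketch; `context` is assumed to be an already initialized LLaMACpp::Context, and the sequence id and positions are illustrative values:

# Formerly kv_cache_seq_shift: add a delta of 4 to positions 0...32 of sequence 0.
context.kv_cache_seq_add(0, 0, 32, 4)

# Highest position currently stored in the KV cache for sequence 0.
puts context.kv_cache_seq_pos_max(0)

# Request KV-cache defragmentation, then apply any pending cache updates.
context.kv_cache_defrag
context.kv_cache_update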
@@ -2659,46 +2557,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
- VALUE kw_values[2] = { Qundef, Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
- if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
- rb_raise(rb_eArgError, "guidance must be a Context");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
-
- LLaMAContextWrapper* guidance_ptr = get_llama_context(kw_values[0]);
- if (guidance_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "guidance context is not initialized");
- return Qnil;
- }
- const float scale = NUM2DBL(kw_values[1]);
-
- llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_softmax(VALUE self, VALUE candidates) {
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
  rb_raise(rb_eArgError, "argument must be a TokenDataArray");
@@ -2994,42 +2852,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("temperature") };
- VALUE kw_values[1] = { Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
-
- if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
- rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "temperature must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
- const float temperature = NUM2DBL(kw_values[0]);
-
- llama_sample_temperature(ctx_ptr->ctx, &(cnd_ptr->array), temperature);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_token_mirostat(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("tau"), rb_intern("eta"), rb_intern("m"), rb_intern("mu") };
@@ -3307,16 +3129,6 @@ static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
  }

- static VALUE rb_llama_mmap_supported(VALUE self) {
- rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
- return llama_mmap_supported() ? Qtrue : Qfalse;
- }
-
- static VALUE rb_llama_mlock_supported(VALUE self) {
- rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
- return llama_mlock_supported() ? Qtrue : Qfalse;
- }
-
  static VALUE rb_llama_max_devices(VALUE self) {
  return SIZET2NUM(llama_max_devices());
  }
@@ -3355,8 +3167,6 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
- rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
- rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
@@ -3394,16 +3204,16 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
- rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_INT", INT2NUM(LLAMA_KV_OVERRIDE_INT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_FLOAT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_BOOL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
@@ -3413,19 +3223,19 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));

- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_NONE", INT2NUM(LLAMA_ROPE_SCALING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_LINEAR));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_NONE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));

- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));

- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_ROW", INT2NUM(LLAMA_SPLIT_MODE_ROW));

  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
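The renamed enum constants are defined directly on the LLaMACpp module, so code that referred to the old names (LLAMA_ROPE_SCALING_YARN, LLAMA_POOLING_MEAN, LLAMA_SPLIT_ROW, and so on) needs the new spellings; a small Ruby sketch:

require 'llama_cpp'

# 0.13.0 names; the pre-0.13.0 constants without _TYPE/_MODE no longer resolve.
puts LLaMACpp::LLAMA_ROPE_SCALING_TYPE_YARN
puts LLaMACpp::LLAMA_POOLING_TYPE_MEAN
puts LLaMACpp::LLAMA_SPLIT_MODE_LAYER
puts LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT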
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.7'
+ VERSION = '0.13.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2249'
+ LLAMA_CPP_VERSION = 'b2303'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
- LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

- LLAMA_KV_OVERRIDE_INT: Integer
- LLAMA_KV_OVERRIDE_FLOAT: Integer
- LLAMA_KV_OVERRIDE_BOOL: Integer
+ LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer

  LLAMA_GRETYPE_END: Integer
  LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,19 @@ module LLaMACpp
  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
  LLAMA_GRETYPE_CHAR_ALT: Integer

- LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
- LLAMA_ROPE_SCALING_NONE: Integer
- LLAMA_ROPE_SCALING_LINEAR: Integer
- LLAMA_ROPE_SCALING_YARN: Integer
- LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+ LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+ LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+ LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

- LLAMA_POOLING_NONE: Integer
- LLAMA_POOLING_MEAN: Integer
- LLAMA_POOLING_CLS: Integer
+ LLAMA_POOLING_TYPE_NONE: Integer
+ LLAMA_POOLING_TYPE_MEAN: Integer
+ LLAMA_POOLING_TYPE_CLS: Integer

- LLAMA_SPLIT_NONE: Integer
- LLAMA_SPLIT_LAYER: Integer
- LLAMA_SPLIT_ROW: Integer
+ LLAMA_SPLIT_MODE_NONE: Integer
+ LLAMA_SPLIT_MODE_LAYER: Integer
+ LLAMA_SPLIT_MODE_ROW: Integer

  def self?.backend_init: () -> void
  def self?.backend_free: () -> void
@@ -68,8 +68,6 @@ module LLaMACpp
  ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.time_us: () -> Integer
- def self?.mmap_supported?: () -> bool
- def self?.mlock_supported?: () -> bool
  def self?.max_devices: () -> Integer
  def self?.supports_mmap?: () -> bool
  def self?.supports_mlock?: () -> bool
@@ -103,7 +101,8 @@ module LLaMACpp
  def empty?: () -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
- def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
+ def vocab_type: () -> Integer
+ def rope_type: () -> Integer
  def n_vocab: () -> Integer
  def n_ctx_train: () -> Integer
  def n_embd: () -> Integer
@@ -202,8 +201,6 @@ module LLaMACpp
  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
  def embeddings: () -> Array[Float]
  def embeddings_ith: (Integer) -> Array[Float]
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
- def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
@@ -216,14 +213,16 @@ module LLaMACpp
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
  def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
  def kv_cache_seq_keep: (Integer) -> void
- def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_pos_max: (Integer) -> Integer
+ def kv_cache_defrag: () -> void
+ def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
  def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -232,7 +231,6 @@ module LLaMACpp
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
  def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
  def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -270,12 +268,12 @@ module LLaMACpp
  def yarn_beta_slow: () -> Float
  def yarn_orig_ctx=: (Integer) -> Integer
  def yarn_orig_ctx: () -> Integer
+ def defrag_thold=: (Float) -> Float
+ def defrag_thold: () -> Float
  def type_k=: (Integer) -> Integer
  def type_k: () -> Integer
  def type_v=: (Integer) -> Integer
  def type_v: () -> Integer
- def mul_mat_q: () -> bool
- def mul_mat_q=: (bool) -> bool
  def logits_all: () -> bool
  def logits_all=: (bool) -> bool
  def embedding: () -> bool
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+ ifneq ('', '$(wildcard /opt/cuda)')
+ CUDA_PATH ?= /opt/cuda
+ else
+ CUDA_PATH ?= /usr/local/cuda
+ endif
+ MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
  ifdef LLAMA_CUBLAS
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
- CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH