llama_cpp 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
-  data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
+  metadata.gz: c2a192fa17c1d313a93306e415ec27dfb8fb6ce993b9fc78797ed6e1d38ca63f
+  data.tar.gz: f800e54961a8bea5de95373d15f0cda30f7e95edd655cc0504247dfefcff473a
 SHA512:
-  metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
-  data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
+  metadata.gz: 48cefba1491319f82d52a46e8be34b5f0115dbe80bd6a9fdbf4fe0e190581a6b1ff8c3e2b2dfdaefeaa0b7cb11c8b9f5a84bcb60354f64248abbee3d488378ee
+  data.tar.gz: 9c6d75d3818b61192bd5c93a8b091003e2342f28102de1fbc9a1a02955a7c89e2a144b82bbe83e805b3f741261e967469c3ad2f6d347b1b870fb51880b850d89
data/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
+## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
+
+- Bump llama.cpp from b2361 to b2435.
+- Add constants for vocabulary type: `LLAMA_VOCAB_TYPE_NONE`.
+- Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
+- Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
+
+## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09
+
+**Breaking Changes**
+
+- Bump bundled llama.cpp from b2303 to b2361.
+- Rename the `embedding` accessor to `embeddings` in `ContextParams`.
+- Remove the `do_pooling` accessor from `ContextParams`.
+- Add a `pooling_type` accessor to `ContextParams`.
+- Fix the size of the array returned by the `embeddings` method in `Context` from `n_embd` to `n_tokens * n_embd`.
+- Add an `embeddings_seq` method to `Context`.
+
 ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
 
+**Breaking Changes**
+
 - Bump bundled llama.cpp from b2143 to b2303.
 - Remove deprecated methods:
   - `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
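
Taken together, the 0.14.x entries above reshape the embedding-oriented API. The sketch below shows the new surface in Ruby; the GGUF path and the `Model`/`ModelParams` construction are illustrative assumptions, while the accessors and methods are the ones registered in the extension code that follows.

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.embeddings = true # renamed from `embedding` in 0.14.0
params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN # replaces the removed `do_pooling`
params.n_ubatch = 512    # new in 0.14.1: physical micro-batch size
params.n_seq_max = 1     # new in 0.14.1: maximum number of sequences

# Hypothetical model file; construction mirrors the gem's usual style.
model = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: params)

context.set_causal_attn(false) # new in 0.14.1
context.synchronize            # new in 0.14.1: wait for pending computation
```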
@@ -946,12 +946,18 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads_batch), 0);
     rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_scaling_type), 1);
     rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type", RUBY_METHOD_FUNC(_llama_context_params_get_rope_scaling_type), 0);
+    rb_define_method(rb_cLLaMAContextParams, "pooling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_pooling_type), 1);
+    rb_define_method(rb_cLLaMAContextParams, "pooling_type", RUBY_METHOD_FUNC(_llama_context_params_get_pooling_type), 0);
     rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
     rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
     rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
@@ -974,12 +980,10 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
     rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
     rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
-    rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
-    rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
+    rb_define_method(rb_cLLaMAContextParams, "embeddings=", RUBY_METHOD_FUNC(_llama_context_params_set_embeddings), 1);
+    rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
-    rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
-    rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
   }
 
 private:
@@ -1031,6 +1035,30 @@ private:
     return INT2NUM(ptr->params.n_batch);
   }
 
+  // n_ubatch
+  static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_ubatch = NUM2INT(n_ubatch);
+    return INT2NUM(ptr->params.n_ubatch);
+  }
+
+  static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_ubatch);
+  }
+
+  // n_seq_max
+  static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_seq_max = NUM2INT(n_seq_max);
+    return INT2NUM(ptr->params.n_seq_max);
+  }
+
+  static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_seq_max);
+  }
+
   // n_threads
   static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1058,7 +1086,7 @@ private:
   // rope_scaling_type
   static VALUE _llama_context_params_set_rope_scaling_type(VALUE self, VALUE scaling_type) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.rope_scaling_type = NUM2INT(scaling_type);
+    ptr->params.rope_scaling_type = static_cast<enum llama_rope_scaling_type>(NUM2INT(scaling_type));
     return INT2NUM(ptr->params.rope_scaling_type);
   }
 
@@ -1067,6 +1095,18 @@ private:
     return INT2NUM(ptr->params.rope_scaling_type);
   }
 
+  // pooling_type
+  static VALUE _llama_context_params_set_pooling_type(VALUE self, VALUE scaling_type) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.pooling_type = static_cast<enum llama_pooling_type>(NUM2INT(scaling_type));
+    return INT2NUM(ptr->params.pooling_type);
+  }
+
+  static VALUE _llama_context_params_get_pooling_type(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.pooling_type);
+  }
+
   // rope_freq_base
   static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1199,16 +1239,16 @@ private:
     return ptr->params.logits_all ? Qtrue : Qfalse;
   }
 
-  // embedding
-  static VALUE _llama_context_params_set_embedding(VALUE self, VALUE embedding) {
+  // embeddings
+  static VALUE _llama_context_params_set_embeddings(VALUE self, VALUE embeddings) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.embedding = RTEST(embedding) ? true : false;
-    return ptr->params.embedding ? Qtrue : Qfalse;
+    ptr->params.embeddings = RTEST(embeddings) ? true : false;
+    return ptr->params.embeddings ? Qtrue : Qfalse;
   }
 
-  static VALUE _llama_context_params_get_embedding(VALUE self) {
+  static VALUE _llama_context_params_get_embeddings(VALUE self) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return ptr->params.embedding ? Qtrue : Qfalse;
+    return ptr->params.embeddings ? Qtrue : Qfalse;
   }
 
   // offload_kqv
@@ -1222,18 +1262,6 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
-
-  // do_pooling
-  static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
-    return ptr->params.do_pooling ? Qtrue : Qfalse;
-  }
-
-  static VALUE _llama_context_params_get_do_pooling(VALUE self) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return ptr->params.do_pooling ? Qtrue : Qfalse;
-  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -2016,8 +2044,11 @@ public:
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
+    rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
+    rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2032,6 +2063,8 @@ public:
     rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
     rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
+    rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
+    rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
@@ -2151,7 +2184,7 @@ private:
     LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
     VALUE params = rb_iv_get(self, "@params");
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
-    if (!prms_ptr->params.embedding) {
+    if (!prms_ptr->params.embeddings) {
       rb_raise(rb_eRuntimeError, "embedding parameter is false");
       return Qnil;
     }
@@ -2160,10 +2193,11 @@ private:
       return Qnil;
    }
 
+    const int n_tokens = NUM2INT(rb_iv_get(self, "@n_tokens"));
     const int n_embd = llama_n_embd(model_ptr->model);
     const float* embd = llama_get_embeddings(ptr->ctx);
     VALUE output = rb_ary_new();
-    for (int i = 0; i < n_embd; i++) {
+    for (int i = 0; i < n_tokens * n_embd; i++) {
       rb_ary_push(output, DBL2NUM((double)(embd[i])));
     }
 
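
The loop change above is the breaking fix called out in the 0.14.0 changelog: `Context#embeddings` now returns a flat array of `n_tokens * n_embd` floats (one vector per token of the last decoded batch) instead of a single `n_embd`-sized vector. A hedged sketch of regrouping the flat array per token, assuming the `model` and `context` from the earlier snippet:

```ruby
context.decode(batch)                    # batch holding n_tokens tokens
n_embd = model.n_embd                    # embedding width (assumed accessor on Model)
flat = context.embeddings                # n_tokens * n_embd floats as of 0.14.0
per_token = flat.each_slice(n_embd).to_a # one n_embd-sized Array[Float] per token
```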
@@ -2182,7 +2216,7 @@ private:
     }
     VALUE params = rb_iv_get(self, "@params");
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
-    if (!prms_ptr->params.embedding) {
+    if (!prms_ptr->params.embeddings) {
       rb_raise(rb_eRuntimeError, "embedding parameter is false");
       return Qnil;
     }
@@ -2200,6 +2234,36 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_embeddings_seq(VALUE self, VALUE seq_id) {
+    if (!RB_INTEGER_TYPE_P(seq_id)) {
+      rb_raise(rb_eArgError, "seq_id must be an integer");
+      return Qnil;
+    }
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    VALUE params = rb_iv_get(self, "@params");
+    LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+    if (!prms_ptr->params.embeddings) {
+      rb_raise(rb_eRuntimeError, "embedding parameter is false");
+      return Qnil;
+    }
+
+    VALUE model = rb_iv_get(self, "@model");
+    LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+    const int n_embd = llama_n_embd(model_ptr->model);
+
+    VALUE output = rb_ary_new();
+    const float* embd = llama_get_embeddings_seq(ptr->ctx, NUM2INT(seq_id));
+    for (int i = 0; i < n_embd; i++) {
+      rb_ary_push(output, DBL2NUM((double)(embd[i])));
+    }
+
+    return output;
+  }
+
   static VALUE _llama_context_n_ctx(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
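
`embeddings_seq` complements `embeddings_ith`: it returns the pooled embedding for a whole sequence ID, always `n_embd` floats, which makes it the natural accessor when a `pooling_type` is set. A brief usage sketch (the sequence ID 0 and the preceding decode are illustrative assumptions):

```ruby
# After decoding a batch whose tokens were tagged with seq_id 0:
vec = context.embeddings_seq(0) # Array[Float] of length n_embd
```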
@@ -2218,6 +2282,24 @@ private:
     return UINT2NUM(llama_n_batch(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ubatch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_ubatch(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_seq_max(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_seq_max(ptr->ctx));
+  }
+
   static VALUE _llama_context_get_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2363,6 +2445,26 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
+    return Qnil;
+  }
+
+  static VALUE _llama_context_synchronize(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_synchronize(ptr->ctx);
+    return Qnil;
+  }
+
   static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[1] = { rb_intern("session_path") };
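
These two wrappers surface llama.cpp's `llama_set_causal_attn` and `llama_synchronize` one-to-one, raising if the context is uninitialized; the `n_ubatch`/`n_seq_max` getters above behave the same way. A small usage sketch, assuming an embedding-style workload:

```ruby
context.set_causal_attn(false) # non-causal attention, e.g. for embedding models
context.decode(batch)
context.synchronize            # block until pending backend work completes
puts context.n_ubatch          # micro-batch size in effect for this context
puts context.n_seq_max         # maximum number of distinct sequence IDs
```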
@@ -3172,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
   rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
@@ -3229,6 +3332,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.13.0'
+  VERSION = '0.14.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2303'
+  LLAMA_CPP_VERSION = 'b2435'
 end
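
After upgrading, the two constants above give a quick sanity check of which gem version and bundled llama.cpp build are loaded:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.14.1"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2435"
```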
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_VOCAB_TYPE_NONE: Integer
   LLAMA_VOCAB_TYPE_SPM: Integer
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
@@ -50,6 +51,7 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_TYPE_YARN: Integer
   LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
 
+  LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
   LLAMA_POOLING_TYPE_NONE: Integer
   LLAMA_POOLING_TYPE_MEAN: Integer
   LLAMA_POOLING_TYPE_CLS: Integer
@@ -201,10 +203,13 @@ module LLaMACpp
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
     def embeddings_ith: (Integer) -> Array[Float]
+    def embeddings_seq: (Integer) -> Array[Float]
     def decode: (::LLaMACpp::Batch) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_batch: () -> Integer
+    def n_ubatch: () -> Integer
+    def n_seq_max: () -> Integer
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
@@ -219,6 +224,8 @@ module LLaMACpp
     def kv_cache_defrag: () -> void
     def kv_cache_update: () -> void
     def set_rng_seed: (Integer) -> void
+    def set_causal_attn: (bool) -> void
+    def synchronize: () -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
@@ -248,12 +255,18 @@ module LLaMACpp
     def n_ctx=: (Integer) -> Integer
     def n_batch: () -> Integer
     def n_batch=: (Integer) -> Integer
+    def n_ubatch: () -> Integer
+    def n_ubatch=: (Integer) -> Integer
+    def n_seq_max: () -> Integer
+    def n_seq_max=: (Integer) -> Integer
     def n_threads: () -> Integer
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
     def n_threads_batch=: (Integer) -> Integer
     def rope_scaling_type=: (Integer) -> Integer
     def rope_scaling_type: () -> Integer
+    def pooling_type=: (Integer) -> Integer
+    def pooling_type: () -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
@@ -276,12 +289,10 @@ module LLaMACpp
     def type_v: () -> Integer
     def logits_all: () -> bool
     def logits_all=: (bool) -> bool
-    def embedding: () -> bool
-    def embedding=: (bool) -> bool
+    def embeddings: () -> bool
+    def embeddings=: (bool) -> bool
     def offload_kqv: () -> bool
     def offload_kqv=: (bool) -> bool
-    def do_pooling: () -> bool
-    def do_pooling=: (bool) -> bool
   end
 
   class ModelQuantizeParams
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
 	MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
+ifdef LLAMA_SCHED_MAX_COPIES
+	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
 	MK_CFLAGS += -O0 -g
 	MK_CXXFLAGS += -O0 -g
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
 	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
 
+ifdef LLAMA_SERVER_SSL
+	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+	MK_LDFLAGS += -lssl -lcrypto
+endif
 
 ifdef LLAMA_CODE_COVERAGE
 	MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
 ifdef JETSON_EOL_MODULE_DETECT
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
+ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+unicode.o: unicode.cpp unicode.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+
+llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
@@ -725,14 +737,17 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)