llama_cpp 0.13.0 → 0.14.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
- data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
+ metadata.gz: c2a192fa17c1d313a93306e415ec27dfb8fb6ce993b9fc78797ed6e1d38ca63f
+ data.tar.gz: f800e54961a8bea5de95373d15f0cda30f7e95edd655cc0504247dfefcff473a
  SHA512:
- metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
- data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
+ metadata.gz: 48cefba1491319f82d52a46e8be34b5f0115dbe80bd6a9fdbf4fe0e190581a6b1ff8c3e2b2dfdaefeaa0b7cb11c8b9f5a84bcb60354f64248abbee3d488378ee
+ data.tar.gz: 9c6d75d3818b61192bd5c93a8b091003e2342f28102de1fbc9a1a02955a7c89e2a144b82bbe83e805b3f741261e967469c3ad2f6d347b1b870fb51880b850d89
data/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
+ ## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
+
+ - Bump bundled llama.cpp from b2361 to b2435.
+ - Add constant for vocabulary type: `LLAMA_VOCAB_TYPE_NONE`.
+ - Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
+ - Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
+
+ ## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09
+
+ **Breaking Changes**
+
+ - Bump bundled llama.cpp from b2303 to b2361.
+ - Rename the `embedding` accessor to `embeddings` in `ContextParams`.
+ - Remove the `do_pooling` accessor from `ContextParams`.
+ - Add a `pooling_type` accessor to `ContextParams`.
+ - Fix the size of the array returned by the `embeddings` method in `Context` from `n_embd` to `n_tokens * n_embd`.
+ - Add an `embeddings_seq` method to `Context`.
+
  ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
 
+ **Breaking Changes**
+
  - Bump bundled llama.cpp from b2143 to b2303.
  - Remove deprecated methods:
  - `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
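Taken together, the two releases reshape the embedding setup. A minimal sketch of the updated API, assuming a local GGUF model and the `Model.new(model_path:, params:)` signature carried over from 0.13.0 (the file name and parameter values are illustrative, not part of this diff):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.embeddings = true                                 # renamed from `embedding` in 0.14.0
params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN  # replaces `do_pooling` in 0.14.0
params.n_ubatch = 512                                    # new in 0.14.1
params.n_seq_max = 1                                     # new in 0.14.1

model = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: params)
```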
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -946,12 +946,18 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
  rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+ rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
+ rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
+ rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
+ rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
  rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
  rb_define_method(rb_cLLaMAContextParams, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads_batch), 0);
  rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_scaling_type), 1);
  rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type", RUBY_METHOD_FUNC(_llama_context_params_get_rope_scaling_type), 0);
+ rb_define_method(rb_cLLaMAContextParams, "pooling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_pooling_type), 1);
+ rb_define_method(rb_cLLaMAContextParams, "pooling_type", RUBY_METHOD_FUNC(_llama_context_params_get_pooling_type), 0);
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
@@ -974,12 +980,10 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
  rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
  rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
- rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
- rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
+ rb_define_method(rb_cLLaMAContextParams, "embeddings=", RUBY_METHOD_FUNC(_llama_context_params_set_embeddings), 1);
+ rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
- rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
- rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
  }
 
  private:
@@ -1031,6 +1035,30 @@ private:
  return INT2NUM(ptr->params.n_batch);
  }
 
+ // n_ubatch
+ static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.n_ubatch = NUM2INT(n_ubatch);
+ return INT2NUM(ptr->params.n_ubatch);
+ }
+
+ static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.n_ubatch);
+ }
+
+ // n_seq_max
+ static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.n_seq_max = NUM2INT(n_seq_max);
+ return INT2NUM(ptr->params.n_seq_max);
+ }
+
+ static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.n_seq_max);
+ }
+
  // n_threads
  static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
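The new accessors are thin wrappers over the corresponding `llama_context_params` fields, and the setters return the stored value, so the Ruby side simply round-trips whatever integer is assigned. A small sketch (values illustrative):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ubatch = 256  # physical (micro-)batch size handed through to llama.cpp
params.n_seq_max = 4   # maximum number of sequences the context will track

puts params.n_ubatch   # => 256
puts params.n_seq_max  # => 4
```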
@@ -1058,7 +1086,7 @@ private:
  // rope_scaling_type
  static VALUE _llama_context_params_set_rope_scaling_type(VALUE self, VALUE scaling_type) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.rope_scaling_type = NUM2INT(scaling_type);
+ ptr->params.rope_scaling_type = static_cast<enum llama_rope_scaling_type>(NUM2INT(scaling_type));
  return INT2NUM(ptr->params.rope_scaling_type);
  }
 
@@ -1067,6 +1095,18 @@ private:
  return INT2NUM(ptr->params.rope_scaling_type);
  }
 
+ // pooling_type
+ static VALUE _llama_context_params_set_pooling_type(VALUE self, VALUE pooling_type) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.pooling_type = static_cast<enum llama_pooling_type>(NUM2INT(pooling_type));
+ return INT2NUM(ptr->params.pooling_type);
+ }
+
+ static VALUE _llama_context_params_get_pooling_type(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.pooling_type);
+ }
+
  // rope_freq_base
  static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
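As with `rope_scaling_type`, the setter casts the given integer to the C enum, so the intended inputs are the `LLAMA_POOLING_TYPE_*` constants registered later in this diff. A hedged sketch:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
# MEAN averages per-token embeddings into one vector per sequence.
params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN
puts params.pooling_type == LLaMACpp::LLAMA_POOLING_TYPE_MEAN # => true
```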
@@ -1199,16 +1239,16 @@ private:
  return ptr->params.logits_all ? Qtrue : Qfalse;
  }
 
- // embedding
- static VALUE _llama_context_params_set_embedding(VALUE self, VALUE embedding) {
+ // embeddings
+ static VALUE _llama_context_params_set_embeddings(VALUE self, VALUE embeddings) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.embedding = RTEST(embedding) ? true : false;
- return ptr->params.embedding ? Qtrue : Qfalse;
+ ptr->params.embeddings = RTEST(embeddings) ? true : false;
+ return ptr->params.embeddings ? Qtrue : Qfalse;
  }
 
- static VALUE _llama_context_params_get_embedding(VALUE self) {
+ static VALUE _llama_context_params_get_embeddings(VALUE self) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.embedding ? Qtrue : Qfalse;
+ return ptr->params.embeddings ? Qtrue : Qfalse;
  }
 
  // offload_kqv
@@ -1222,18 +1262,6 @@ private:
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return ptr->params.offload_kqv ? Qtrue : Qfalse;
  }
-
- // do_pooling
- static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
- return ptr->params.do_pooling ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_do_pooling(VALUE self) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.do_pooling ? Qtrue : Qfalse;
- }
  };
 
  const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -2016,8 +2044,11 @@ public:
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
+ rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
+ rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
+ rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2032,6 +2063,8 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
  rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
+ rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
+ rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
@@ -2151,7 +2184,7 @@ private:
  LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
  VALUE params = rb_iv_get(self, "@params");
  LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
- if (!prms_ptr->params.embedding) {
+ if (!prms_ptr->params.embeddings) {
  rb_raise(rb_eRuntimeError, "embedding parameter is false");
  return Qnil;
  }
@@ -2160,10 +2193,11 @@ private:
  return Qnil;
  }
 
+ const int n_tokens = NUM2INT(rb_iv_get(self, "@n_tokens"));
  const int n_embd = llama_n_embd(model_ptr->model);
  const float* embd = llama_get_embeddings(ptr->ctx);
  VALUE output = rb_ary_new();
- for (int i = 0; i < n_embd; i++) {
+ for (int i = 0; i < n_tokens * n_embd; i++) {
  rb_ary_push(output, DBL2NUM((double)(embd[i])));
  }
 
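With this fix, `Context#embeddings` returns a flat array of `n_tokens * n_embd` floats rather than a single vector. A hedged sketch of regrouping it per token, reusing the `context` and `model` from the earlier setup and assuming `Model#n_embd` (present in the gem's RBS, though outside this diff):

```ruby
# `embeddings` is now a flat Array of n_tokens * n_embd Floats;
# each_slice regroups it into one n_embd-length vector per token.
n_embd = model.n_embd
per_token = context.embeddings.each_slice(n_embd).to_a
puts per_token.size        # number of tokens in the last decoded batch
puts per_token.first.size  # => n_embd
```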
@@ -2182,7 +2216,7 @@ private:
  }
  VALUE params = rb_iv_get(self, "@params");
  LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
- if (!prms_ptr->params.embedding) {
+ if (!prms_ptr->params.embeddings) {
  rb_raise(rb_eRuntimeError, "embedding parameter is false");
  return Qnil;
  }
@@ -2200,6 +2234,36 @@ private:
  return output;
  }
 
+ static VALUE _llama_context_embeddings_seq(VALUE self, VALUE seq_id) {
+ if (!RB_INTEGER_TYPE_P(seq_id)) {
+ rb_raise(rb_eArgError, "seq_id must be an integer");
+ return Qnil;
+ }
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ VALUE params = rb_iv_get(self, "@params");
+ LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+ if (!prms_ptr->params.embeddings) {
+ rb_raise(rb_eRuntimeError, "embedding parameter is false");
+ return Qnil;
+ }
+
+ VALUE model = rb_iv_get(self, "@model");
+ LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+ const int n_embd = llama_n_embd(model_ptr->model);
+
+ VALUE output = rb_ary_new();
+ const float* embd = llama_get_embeddings_seq(ptr->ctx, NUM2INT(seq_id));
+ for (int i = 0; i < n_embd; i++) {
+ rb_ary_push(output, DBL2NUM((double)(embd[i])));
+ }
+
+ return output;
+ }
+
  static VALUE _llama_context_n_ctx(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
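Unlike `embeddings`, `embeddings_seq` returns a single pooled `n_embd`-length vector for one sequence, so it pairs naturally with a non-NONE `pooling_type`. A sketch, again assuming the `context` built earlier with `embeddings = true` and mean pooling:

```ruby
# One pooled n_embd-length vector for the given sequence id;
# 0 is the usual id when a batch carries a single sequence.
vec = context.embeddings_seq(0)
puts vec.size # => n_embd

# Non-integer ids are rejected before reaching llama.cpp:
# context.embeddings_seq('0') # => ArgumentError: seq_id must be an integer
```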
@@ -2218,6 +2282,24 @@ private:
  return UINT2NUM(llama_n_batch(ptr->ctx));
  }
 
+ static VALUE _llama_context_n_ubatch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_ubatch(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_seq_max(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_seq_max(ptr->ctx));
+ }
+
  static VALUE _llama_context_get_timings(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2363,6 +2445,26 @@ private:
  return Qnil;
  }
 
+ static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_synchronize(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_synchronize(ptr->ctx);
+ return Qnil;
+ }
+
  static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("session_path") };
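Both new `Context` methods return `nil`: `set_causal_attn` toggles the causal attention mask on the live context, and `synchronize` blocks until queued backend work has finished. A hedged usage sketch with the `context` from earlier:

```ruby
# Disable the causal mask for a bidirectional embedding pass,
# then restore it before autoregressive generation.
context.set_causal_attn(false)
# ... decode an embedding batch here ...
context.set_causal_attn(true)

# Block until all pending computation for this context completes.
context.synchronize
```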
@@ -3172,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
@@ -3229,6 +3332,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.13.0'
+ VERSION = '0.14.1'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2303'
+ LLAMA_CPP_VERSION = 'b2435'
  end
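After upgrading, a quick sanity check that the installed gem and bundled llama.cpp build are the expected ones; both constants come straight from this file:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.14.1"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2435"
```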
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String
 
+ LLAMA_VOCAB_TYPE_NONE: Integer
  LLAMA_VOCAB_TYPE_SPM: Integer
  LLAMA_VOCAB_TYPE_BPE: Integer
  LLAMA_VOCAB_TYPE_WPM: Integer
@@ -50,6 +51,7 @@ module LLaMACpp
  LLAMA_ROPE_SCALING_TYPE_YARN: Integer
  LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
 
+ LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
  LLAMA_POOLING_TYPE_NONE: Integer
  LLAMA_POOLING_TYPE_MEAN: Integer
  LLAMA_POOLING_TYPE_CLS: Integer
@@ -201,10 +203,13 @@ module LLaMACpp
  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
  def embeddings: () -> Array[Float]
  def embeddings_ith: (Integer) -> Array[Float]
+ def embeddings_seq: (Integer) -> Array[Float]
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
+ def n_ubatch: () -> Integer
+ def n_seq_max: () -> Integer
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
@@ -219,6 +224,8 @@ module LLaMACpp
  def kv_cache_defrag: () -> void
  def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
+ def set_causal_attn: (bool) -> void
+ def synchronize: () -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
@@ -248,12 +255,18 @@ module LLaMACpp
  def n_ctx=: (Integer) -> Integer
  def n_batch: () -> Integer
  def n_batch=: (Integer) -> Integer
+ def n_ubatch: () -> Integer
+ def n_ubatch=: (Integer) -> Integer
+ def n_seq_max: () -> Integer
+ def n_seq_max=: (Integer) -> Integer
  def n_threads: () -> Integer
  def n_threads=: (Integer) -> Integer
  def n_threads_batch: () -> Integer
  def n_threads_batch=: (Integer) -> Integer
  def rope_scaling_type=: (Integer) -> Integer
  def rope_scaling_type: () -> Integer
+ def pooling_type=: (Integer) -> Integer
+ def pooling_type: () -> Integer
  def rope_freq_base=: (Float) -> Float
  def rope_freq_base: () -> Float
  def rope_freq_scale=: (Float) -> Float
@@ -276,12 +289,10 @@ module LLaMACpp
  def type_v: () -> Integer
  def logits_all: () -> bool
  def logits_all=: (bool) -> bool
- def embedding: () -> bool
- def embedding=: (bool) -> bool
+ def embeddings: () -> bool
+ def embeddings=: (bool) -> bool
  def offload_kqv: () -> bool
  def offload_kqv=: (bool) -> bool
- def do_pooling: () -> bool
- def do_pooling=: (bool) -> bool
  end
 
  class ModelQuantizeParams
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -2,7 +2,7 @@
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
  simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
  # Binaries only useful for tests
  TEST_TARGETS = \
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
  MK_CPPFLAGS += -D_BSD_SOURCE
  endif
 
+ ifdef LLAMA_SCHED_MAX_COPIES
+ MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+ endif
+
  ifdef LLAMA_DEBUG
  MK_CFLAGS += -O0 -g
  MK_CXXFLAGS += -O0 -g
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
  MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
  endif
 
+ ifdef LLAMA_SERVER_SSL
+ MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+ MK_LDFLAGS += -lssl -lcrypto
+ endif
 
  ifdef LLAMA_CODE_COVERAGE
  MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
  ifdef JETSON_EOL_MODULE_DETECT
  $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
  else
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
  $(CC) $(CFLAGS) -c $< -o $@
 
  ifdef LLAMA_METAL_EMBED_LIBRARY
- ggml-metal-embed.o: ggml-metal.metal
+ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
  @echo "Embedding Metal library"
+ @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
  $(eval TEMP_ASSEMBLY=$(shell mktemp))
- @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
- @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
- @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
- @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
- @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
- @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+ @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+ @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
  @$(AS) $(TEMP_ASSEMBLY) -o $@
  @rm -f ${TEMP_ASSEMBLY}
  endif
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
  ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
  $(CC) $(CFLAGS) -c $< -o $@
 
- ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
  $(CC) $(CFLAGS) -c $< -o $@
 
- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+ unicode.o: unicode.cpp unicode.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
 
- llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+
+ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
 
  COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
@@ -725,14 +737,17 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+ gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
- server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
- $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)