llama_cpp 0.13.0 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2a192fa17c1d313a93306e415ec27dfb8fb6ce993b9fc78797ed6e1d38ca63f
|
4
|
+
data.tar.gz: f800e54961a8bea5de95373d15f0cda30f7e95edd655cc0504247dfefcff473a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48cefba1491319f82d52a46e8be34b5f0115dbe80bd6a9fdbf4fe0e190581a6b1ff8c3e2b2dfdaefeaa0b7cb11c8b9f5a84bcb60354f64248abbee3d488378ee
|
7
|
+
data.tar.gz: 9c6d75d3818b61192bd5c93a8b091003e2342f28102de1fbc9a1a02955a7c89e2a144b82bbe83e805b3f741261e967469c3ad2f6d347b1b870fb51880b850d89
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,25 @@
|
|
1
|
+
## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2361 to b2435.
|
4
|
+
- Add constants for vocabulary type: `LLAMA_VOCAB_TYPE_NONE`.
|
5
|
+
- Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
|
6
|
+
- Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
|
7
|
+
|
8
|
+
## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09
|
9
|
+
|
10
|
+
**Breaking Changes**
|
11
|
+
|
12
|
+
- Bump bundled llama.cpp from b2303 to b2361.
|
13
|
+
- Rename embedding accessor to `embeddings` in `ContextParams`.
|
14
|
+
- Remove `do_pooling` accessor from `ContextParams`.
|
15
|
+
- Add `pooling_type` accessor to `ContextParams`.
|
16
|
+
- Fix the size of array returned by `embedding` method in `Context` from `n_embd` to `n_tokens * n_embd`.
|
17
|
+
- Add `embeddings_seq` method to `Context`.
|
18
|
+
|
1
19
|
## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
|
2
20
|
|
21
|
+
**Breaking Changes**
|
22
|
+
|
3
23
|
- Bump bundled llama.cpp from b2143 to b2303.
|
4
24
|
- Remove deprecated methods:
|
5
25
|
- `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -946,12 +946,18 @@ public:
|
|
946
946
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
|
947
947
|
rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
|
948
948
|
rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
|
949
|
+
rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
|
950
|
+
rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
|
951
|
+
rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
|
952
|
+
rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
|
949
953
|
rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
|
950
954
|
rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
|
951
955
|
rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
|
952
956
|
rb_define_method(rb_cLLaMAContextParams, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads_batch), 0);
|
953
957
|
rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_scaling_type), 1);
|
954
958
|
rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type", RUBY_METHOD_FUNC(_llama_context_params_get_rope_scaling_type), 0);
|
959
|
+
rb_define_method(rb_cLLaMAContextParams, "pooling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_pooling_type), 1);
|
960
|
+
rb_define_method(rb_cLLaMAContextParams, "pooling_type", RUBY_METHOD_FUNC(_llama_context_params_get_pooling_type), 0);
|
955
961
|
rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
|
956
962
|
rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
|
957
963
|
rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
|
@@ -974,12 +980,10 @@ public:
|
|
974
980
|
rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
|
975
981
|
rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
|
976
982
|
rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
|
977
|
-
rb_define_method(rb_cLLaMAContextParams, "
|
978
|
-
rb_define_method(rb_cLLaMAContextParams, "
|
983
|
+
rb_define_method(rb_cLLaMAContextParams, "embeddings=", RUBY_METHOD_FUNC(_llama_context_params_set_embeddings), 1);
|
984
|
+
rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
|
979
985
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
|
980
986
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
|
981
|
-
rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
|
982
|
-
rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
|
983
987
|
}
|
984
988
|
|
985
989
|
private:
|
@@ -1031,6 +1035,30 @@ private:
|
|
1031
1035
|
return INT2NUM(ptr->params.n_batch);
|
1032
1036
|
}
|
1033
1037
|
|
1038
|
+
// n_ubatch
|
1039
|
+
static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
|
1040
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1041
|
+
ptr->params.n_ubatch = NUM2INT(n_ubatch);
|
1042
|
+
return INT2NUM(ptr->params.n_ubatch);
|
1043
|
+
}
|
1044
|
+
|
1045
|
+
static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
|
1046
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1047
|
+
return INT2NUM(ptr->params.n_ubatch);
|
1048
|
+
}
|
1049
|
+
|
1050
|
+
// n_seq_max
|
1051
|
+
static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
|
1052
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1053
|
+
ptr->params.n_seq_max = NUM2INT(n_seq_max);
|
1054
|
+
return INT2NUM(ptr->params.n_seq_max);
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
|
1058
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1059
|
+
return INT2NUM(ptr->params.n_seq_max);
|
1060
|
+
}
|
1061
|
+
|
1034
1062
|
// n_threads
|
1035
1063
|
static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
|
1036
1064
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
@@ -1058,7 +1086,7 @@ private:
|
|
1058
1086
|
// rope_scaling_type
|
1059
1087
|
static VALUE _llama_context_params_set_rope_scaling_type(VALUE self, VALUE scaling_type) {
|
1060
1088
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1061
|
-
ptr->params.rope_scaling_type = NUM2INT(scaling_type);
|
1089
|
+
ptr->params.rope_scaling_type = static_cast<enum llama_rope_scaling_type>(NUM2INT(scaling_type));
|
1062
1090
|
return INT2NUM(ptr->params.rope_scaling_type);
|
1063
1091
|
}
|
1064
1092
|
|
@@ -1067,6 +1095,18 @@ private:
|
|
1067
1095
|
return INT2NUM(ptr->params.rope_scaling_type);
|
1068
1096
|
}
|
1069
1097
|
|
1098
|
+
// pooling_type
|
1099
|
+
static VALUE _llama_context_params_set_pooling_type(VALUE self, VALUE scaling_type) {
|
1100
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1101
|
+
ptr->params.pooling_type = static_cast<enum llama_pooling_type>(NUM2INT(scaling_type));
|
1102
|
+
return INT2NUM(ptr->params.pooling_type);
|
1103
|
+
}
|
1104
|
+
|
1105
|
+
static VALUE _llama_context_params_get_pooling_type(VALUE self) {
|
1106
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1107
|
+
return INT2NUM(ptr->params.pooling_type);
|
1108
|
+
}
|
1109
|
+
|
1070
1110
|
// rope_freq_base
|
1071
1111
|
static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
|
1072
1112
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
@@ -1199,16 +1239,16 @@ private:
|
|
1199
1239
|
return ptr->params.logits_all ? Qtrue : Qfalse;
|
1200
1240
|
}
|
1201
1241
|
|
1202
|
-
//
|
1203
|
-
static VALUE
|
1242
|
+
// embeddings
|
1243
|
+
static VALUE _llama_context_params_set_embeddings(VALUE self, VALUE embeddings) {
|
1204
1244
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1205
|
-
ptr->params.
|
1206
|
-
return ptr->params.
|
1245
|
+
ptr->params.embeddings = RTEST(embeddings) ? true : false;
|
1246
|
+
return ptr->params.embeddings ? Qtrue : Qfalse;
|
1207
1247
|
}
|
1208
1248
|
|
1209
|
-
static VALUE
|
1249
|
+
static VALUE _llama_context_params_get_embeddings(VALUE self) {
|
1210
1250
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1211
|
-
return ptr->params.
|
1251
|
+
return ptr->params.embeddings ? Qtrue : Qfalse;
|
1212
1252
|
}
|
1213
1253
|
|
1214
1254
|
// offload_kqv
|
@@ -1222,18 +1262,6 @@ private:
|
|
1222
1262
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1223
1263
|
return ptr->params.offload_kqv ? Qtrue : Qfalse;
|
1224
1264
|
}
|
1225
|
-
|
1226
|
-
// do_pooling
|
1227
|
-
static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
|
1228
|
-
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1229
|
-
ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
|
1230
|
-
return ptr->params.do_pooling ? Qtrue : Qfalse;
|
1231
|
-
}
|
1232
|
-
|
1233
|
-
static VALUE _llama_context_params_get_do_pooling(VALUE self) {
|
1234
|
-
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1235
|
-
return ptr->params.do_pooling ? Qtrue : Qfalse;
|
1236
|
-
}
|
1237
1265
|
};
|
1238
1266
|
|
1239
1267
|
const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
|
@@ -2016,8 +2044,11 @@ public:
|
|
2016
2044
|
rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
|
2017
2045
|
rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
|
2018
2046
|
rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
|
2047
|
+
rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
|
2019
2048
|
rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
|
2020
2049
|
rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
|
2050
|
+
rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
|
2051
|
+
rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
|
2021
2052
|
rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
|
2022
2053
|
rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
|
2023
2054
|
rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
|
@@ -2032,6 +2063,8 @@ public:
|
|
2032
2063
|
rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
|
2033
2064
|
rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
|
2034
2065
|
rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
|
2066
|
+
rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
|
2067
|
+
rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
|
2035
2068
|
rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
|
2036
2069
|
rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
|
2037
2070
|
rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
|
@@ -2151,7 +2184,7 @@ private:
|
|
2151
2184
|
LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
|
2152
2185
|
VALUE params = rb_iv_get(self, "@params");
|
2153
2186
|
LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
|
2154
|
-
if (!prms_ptr->params.
|
2187
|
+
if (!prms_ptr->params.embeddings) {
|
2155
2188
|
rb_raise(rb_eRuntimeError, "embedding parameter is false");
|
2156
2189
|
return Qnil;
|
2157
2190
|
}
|
@@ -2160,10 +2193,11 @@ private:
|
|
2160
2193
|
return Qnil;
|
2161
2194
|
}
|
2162
2195
|
|
2196
|
+
const int n_tokens = NUM2INT(rb_iv_get(self, "@n_tokens"));
|
2163
2197
|
const int n_embd = llama_n_embd(model_ptr->model);
|
2164
2198
|
const float* embd = llama_get_embeddings(ptr->ctx);
|
2165
2199
|
VALUE output = rb_ary_new();
|
2166
|
-
for (int i = 0; i < n_embd; i++) {
|
2200
|
+
for (int i = 0; i < n_tokens * n_embd; i++) {
|
2167
2201
|
rb_ary_push(output, DBL2NUM((double)(embd[i])));
|
2168
2202
|
}
|
2169
2203
|
|
@@ -2182,7 +2216,7 @@ private:
|
|
2182
2216
|
}
|
2183
2217
|
VALUE params = rb_iv_get(self, "@params");
|
2184
2218
|
LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
|
2185
|
-
if (!prms_ptr->params.
|
2219
|
+
if (!prms_ptr->params.embeddings) {
|
2186
2220
|
rb_raise(rb_eRuntimeError, "embedding parameter is false");
|
2187
2221
|
return Qnil;
|
2188
2222
|
}
|
@@ -2200,6 +2234,36 @@ private:
|
|
2200
2234
|
return output;
|
2201
2235
|
}
|
2202
2236
|
|
2237
|
+
static VALUE _llama_context_embeddings_seq(VALUE self, VALUE seq_id) {
|
2238
|
+
if (!RB_INTEGER_TYPE_P(seq_id)) {
|
2239
|
+
rb_raise(rb_eArgError, "seq_id must be an integer");
|
2240
|
+
return Qnil;
|
2241
|
+
}
|
2242
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2243
|
+
if (ptr->ctx == NULL) {
|
2244
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2245
|
+
return Qnil;
|
2246
|
+
}
|
2247
|
+
VALUE params = rb_iv_get(self, "@params");
|
2248
|
+
LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
|
2249
|
+
if (!prms_ptr->params.embeddings) {
|
2250
|
+
rb_raise(rb_eRuntimeError, "embedding parameter is false");
|
2251
|
+
return Qnil;
|
2252
|
+
}
|
2253
|
+
|
2254
|
+
VALUE model = rb_iv_get(self, "@model");
|
2255
|
+
LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
|
2256
|
+
const int n_embd = llama_n_embd(model_ptr->model);
|
2257
|
+
|
2258
|
+
VALUE output = rb_ary_new();
|
2259
|
+
const float* embd = llama_get_embeddings_seq(ptr->ctx, NUM2INT(seq_id));
|
2260
|
+
for (int i = 0; i < n_embd; i++) {
|
2261
|
+
rb_ary_push(output, DBL2NUM((double)(embd[i])));
|
2262
|
+
}
|
2263
|
+
|
2264
|
+
return output;
|
2265
|
+
}
|
2266
|
+
|
2203
2267
|
static VALUE _llama_context_n_ctx(VALUE self) {
|
2204
2268
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2205
2269
|
if (ptr->ctx == NULL) {
|
@@ -2218,6 +2282,24 @@ private:
|
|
2218
2282
|
return UINT2NUM(llama_n_batch(ptr->ctx));
|
2219
2283
|
}
|
2220
2284
|
|
2285
|
+
static VALUE _llama_context_n_ubatch(VALUE self) {
|
2286
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2287
|
+
if (ptr->ctx == NULL) {
|
2288
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2289
|
+
return Qnil;
|
2290
|
+
}
|
2291
|
+
return UINT2NUM(llama_n_ubatch(ptr->ctx));
|
2292
|
+
}
|
2293
|
+
|
2294
|
+
static VALUE _llama_context_n_seq_max(VALUE self) {
|
2295
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2296
|
+
if (ptr->ctx == NULL) {
|
2297
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2298
|
+
return Qnil;
|
2299
|
+
}
|
2300
|
+
return UINT2NUM(llama_n_seq_max(ptr->ctx));
|
2301
|
+
}
|
2302
|
+
|
2221
2303
|
static VALUE _llama_context_get_timings(VALUE self) {
|
2222
2304
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2223
2305
|
if (ptr->ctx == NULL) {
|
@@ -2363,6 +2445,26 @@ private:
|
|
2363
2445
|
return Qnil;
|
2364
2446
|
}
|
2365
2447
|
|
2448
|
+
static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
|
2449
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2450
|
+
if (ptr->ctx == NULL) {
|
2451
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2452
|
+
return Qnil;
|
2453
|
+
}
|
2454
|
+
llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
|
2455
|
+
return Qnil;
|
2456
|
+
}
|
2457
|
+
|
2458
|
+
static VALUE _llama_context_synchronize(VALUE self) {
|
2459
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2460
|
+
if (ptr->ctx == NULL) {
|
2461
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2462
|
+
return Qnil;
|
2463
|
+
}
|
2464
|
+
llama_synchronize(ptr->ctx);
|
2465
|
+
return Qnil;
|
2466
|
+
}
|
2467
|
+
|
2366
2468
|
static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
|
2367
2469
|
VALUE kw_args = Qnil;
|
2368
2470
|
ID kw_table[1] = { rb_intern("session_path") };
|
@@ -3172,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
|
|
3172
3274
|
rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
|
3173
3275
|
rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
|
3174
3276
|
|
3277
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
|
3175
3278
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
|
3176
3279
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
|
3177
3280
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
|
@@ -3229,6 +3332,7 @@ extern "C" void Init_llama_cpp(void) {
|
|
3229
3332
|
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
|
3230
3333
|
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));
|
3231
3334
|
|
3335
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED));
|
3232
3336
|
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
|
3233
3337
|
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
|
3234
3338
|
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.13.0'
|
6
|
+
VERSION = '0.14.1'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2303'
|
9
|
+
LLAMA_CPP_VERSION = 'b2435'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
|
|
3
3
|
LLAMA_CPP_VERSION: String
|
4
4
|
LLAMA_DEFALUT_SEED: String
|
5
5
|
|
6
|
+
LLAMA_VOCAB_TYPE_NONE: Integer
|
6
7
|
LLAMA_VOCAB_TYPE_SPM: Integer
|
7
8
|
LLAMA_VOCAB_TYPE_BPE: Integer
|
8
9
|
LLAMA_VOCAB_TYPE_WPM: Integer
|
@@ -50,6 +51,7 @@ module LLaMACpp
|
|
50
51
|
LLAMA_ROPE_SCALING_TYPE_YARN: Integer
|
51
52
|
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
|
52
53
|
|
54
|
+
LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
|
53
55
|
LLAMA_POOLING_TYPE_NONE: Integer
|
54
56
|
LLAMA_POOLING_TYPE_MEAN: Integer
|
55
57
|
LLAMA_POOLING_TYPE_CLS: Integer
|
@@ -201,10 +203,13 @@ module LLaMACpp
|
|
201
203
|
def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
|
202
204
|
def embeddings: () -> Array[Float]
|
203
205
|
def embeddings_ith: (Integer) -> Array[Float]
|
206
|
+
def embeddings_seq: (Integer) -> Array[Float]
|
204
207
|
def decode: (::LLaMACpp::Batch) -> void
|
205
208
|
def logits: () -> Array[Float]
|
206
209
|
def n_ctx: () -> Integer
|
207
210
|
def n_batch: () -> Integer
|
211
|
+
def n_ubatch: () -> Integer
|
212
|
+
def n_seq_max: () -> Integer
|
208
213
|
def timings: () -> ::LLaMACpp::Timings
|
209
214
|
def print_timings: () -> void
|
210
215
|
def reset_timings: () -> void
|
@@ -219,6 +224,8 @@ module LLaMACpp
|
|
219
224
|
def kv_cache_defrag: () -> void
|
220
225
|
def kv_cache_update: () -> void
|
221
226
|
def set_rng_seed: (Integer) -> void
|
227
|
+
def set_causal_attn: (bool) -> void
|
228
|
+
def synchronize: () -> void
|
222
229
|
def load_session_file: (session_path: String) -> void
|
223
230
|
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
224
231
|
def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
|
@@ -248,12 +255,18 @@ module LLaMACpp
|
|
248
255
|
def n_ctx=: (Integer) -> Integer
|
249
256
|
def n_batch: () -> Integer
|
250
257
|
def n_batch=: (Integer) -> Integer
|
258
|
+
def n_ubatch: () -> Integer
|
259
|
+
def n_ubatch=: (Integer) -> Integer
|
260
|
+
def n_seq_max: () -> Integer
|
261
|
+
def n_seq_max=: (Integer) -> Integer
|
251
262
|
def n_threads: () -> Integer
|
252
263
|
def n_threads=: (Integer) -> Integer
|
253
264
|
def n_threads_batch: () -> Integer
|
254
265
|
def n_threads_batch=: (Integer) -> Integer
|
255
266
|
def rope_scaling_type=: (Integer) -> Integer
|
256
267
|
def rope_scaling_type: () -> Integer
|
268
|
+
def pooling_type=: (Integer) -> Integer
|
269
|
+
def pooling_type: () -> Integer
|
257
270
|
def rope_freq_base=: (Float) -> Float
|
258
271
|
def rope_freq_base: () -> Float
|
259
272
|
def rope_freq_scale=: (Float) -> Float
|
@@ -276,12 +289,10 @@ module LLaMACpp
|
|
276
289
|
def type_v: () -> Integer
|
277
290
|
def logits_all: () -> bool
|
278
291
|
def logits_all=: (bool) -> bool
|
279
|
-
def
|
280
|
-
def
|
292
|
+
def embeddings: () -> bool
|
293
|
+
def embeddings=: (bool) -> bool
|
281
294
|
def offload_kqv: () -> bool
|
282
295
|
def offload_kqv=: (bool) -> bool
|
283
|
-
def do_pooling: () -> bool
|
284
|
-
def do_pooling=: (bool) -> bool
|
285
296
|
end
|
286
297
|
|
287
298
|
class ModelQuantizeParams
|
@@ -2,7 +2,7 @@
|
|
2
2
|
BUILD_TARGETS = \
|
3
3
|
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
4
4
|
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
|
-
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
|
5
|
+
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
8
8
|
TEST_TARGETS = \
|
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
|
|
167
167
|
MK_CPPFLAGS += -D_BSD_SOURCE
|
168
168
|
endif
|
169
169
|
|
170
|
+
ifdef LLAMA_SCHED_MAX_COPIES
|
171
|
+
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
|
172
|
+
endif
|
173
|
+
|
170
174
|
ifdef LLAMA_DEBUG
|
171
175
|
MK_CFLAGS += -O0 -g
|
172
176
|
MK_CXXFLAGS += -O0 -g
|
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
|
|
201
205
|
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
202
206
|
endif
|
203
207
|
|
208
|
+
ifdef LLAMA_SERVER_SSL
|
209
|
+
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
210
|
+
MK_LDFLAGS += -lssl -lcrypto
|
211
|
+
endif
|
204
212
|
|
205
213
|
ifdef LLAMA_CODE_COVERAGE
|
206
214
|
MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
|
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
|
451
459
|
ifdef LLAMA_CUDA_CCBIN
|
452
460
|
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
453
461
|
endif
|
454
|
-
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
462
|
+
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
|
455
463
|
ifdef JETSON_EOL_MODULE_DETECT
|
456
464
|
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
457
465
|
else
|
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
|
|
551
559
|
$(CC) $(CFLAGS) -c $< -o $@
|
552
560
|
|
553
561
|
ifdef LLAMA_METAL_EMBED_LIBRARY
|
554
|
-
ggml-metal-embed.o: ggml-metal.metal
|
562
|
+
ggml-metal-embed.o: ggml-metal.metal ggml-common.h
|
555
563
|
@echo "Embedding Metal library"
|
564
|
+
@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
|
556
565
|
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
557
|
-
@echo ".section __DATA, __ggml_metallib"
|
558
|
-
@echo ".globl _ggml_metallib_start"
|
559
|
-
@echo "_ggml_metallib_start:"
|
560
|
-
@echo ".incbin \"
|
561
|
-
@echo ".globl _ggml_metallib_end"
|
562
|
-
@echo "_ggml_metallib_end:"
|
566
|
+
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
567
|
+
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
568
|
+
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
569
|
+
@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
|
570
|
+
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
571
|
+
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
563
572
|
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
564
573
|
@rm -f ${TEMP_ASSEMBLY}
|
565
574
|
endif
|
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
|
628
637
|
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
629
638
|
$(CC) $(CFLAGS) -c $< -o $@
|
630
639
|
|
631
|
-
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
|
640
|
+
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
|
632
641
|
$(CC) $(CFLAGS) -c $< -o $@
|
633
642
|
|
634
|
-
|
643
|
+
unicode.o: unicode.cpp unicode.h
|
644
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
635
645
|
|
636
|
-
|
646
|
+
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
|
647
|
+
|
648
|
+
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
637
649
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
638
650
|
|
639
651
|
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
@@ -725,14 +737,17 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
|
|
725
737
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
726
738
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
727
739
|
|
740
|
+
gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
741
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
742
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
743
|
+
|
728
744
|
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
729
745
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
730
746
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
731
747
|
|
732
|
-
server: examples/server/server.cpp examples/server/
|
748
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
733
749
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
734
|
-
$(CXX) $(CXXFLAGS) -
|
735
|
-
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
750
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
736
751
|
|
737
752
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
738
753
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|