llama_cpp 0.13.0 → 0.14.1
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c2a192fa17c1d313a93306e415ec27dfb8fb6ce993b9fc78797ed6e1d38ca63f
+  data.tar.gz: f800e54961a8bea5de95373d15f0cda30f7e95edd655cc0504247dfefcff473a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48cefba1491319f82d52a46e8be34b5f0115dbe80bd6a9fdbf4fe0e190581a6b1ff8c3e2b2dfdaefeaa0b7cb11c8b9f5a84bcb60354f64248abbee3d488378ee
+  data.tar.gz: 9c6d75d3818b61192bd5c93a8b091003e2342f28102de1fbc9a1a02955a7c89e2a144b82bbe83e805b3f741261e967469c3ad2f6d347b1b870fb51880b850d89
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,25 @@
+## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
+
+- Bump llama.cpp from b2361 to b2435.
+- Add constants for vocabulary type: `LLAMA_VOCAB_TYPE_NONE`.
+- Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
+- Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
+
+## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09
+
+**Breaking Changes**
+
+- Bump bundled llama.cpp from b2303 to b2361.
+- Rename `embedding` accessor to `embeddings` in `ContextParams`.
+- Remove `do_pooling` accessor from `ContextParams`.
+- Add `pooling_type` accessor to `ContextParams`.
+- Fix the size of the array returned by the `embeddings` method in `Context` from `n_embd` to `n_tokens * n_embd`.
+- Add `embeddings_seq` method to `Context`.
+
 ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
 
+**Breaking Changes**
+
 - Bump bundled llama.cpp from b2143 to b2303.
 - Remove deprecated methods:
   - `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
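Taken together, the 0.14.x entries describe a small Ruby API surface. A minimal usage sketch of the new accessors and methods follows; the method names come from the registrations and signatures in the diffs below, while the model path is a placeholder and the `Model.new` keyword arguments follow the gem's README convention, so treat the setup lines as assumptions:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ubatch = 512                                    # new in 0.14.1
params.n_seq_max = 1                                     # new in 0.14.1
params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN  # new in 0.14.0
params.embeddings = true                                 # renamed from embedding= in 0.14.0

# Hypothetical model setup; path and ModelParams defaults are illustrative.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: params)

context.set_causal_attn(false)  # new in 0.14.1: toggle causal attention masking
context.synchronize             # new in 0.14.1: block until pending computation finishes
p context.n_ubatch              # new in 0.14.1
p context.n_seq_max             # new in 0.14.1
```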
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -946,12 +946,18 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads_batch), 0);
     rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_scaling_type), 1);
     rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type", RUBY_METHOD_FUNC(_llama_context_params_get_rope_scaling_type), 0);
+    rb_define_method(rb_cLLaMAContextParams, "pooling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_pooling_type), 1);
+    rb_define_method(rb_cLLaMAContextParams, "pooling_type", RUBY_METHOD_FUNC(_llama_context_params_get_pooling_type), 0);
     rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
     rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
     rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
@@ -974,12 +980,10 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
     rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
     rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
-    rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
-    rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
+    rb_define_method(rb_cLLaMAContextParams, "embeddings=", RUBY_METHOD_FUNC(_llama_context_params_set_embeddings), 1);
+    rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
-    rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
-    rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
   }
 
 private:
@@ -1031,6 +1035,30 @@ private:
     return INT2NUM(ptr->params.n_batch);
   }
 
+  // n_ubatch
+  static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_ubatch = NUM2INT(n_ubatch);
+    return INT2NUM(ptr->params.n_ubatch);
+  }
+
+  static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_ubatch);
+  }
+
+  // n_seq_max
+  static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_seq_max = NUM2INT(n_seq_max);
+    return INT2NUM(ptr->params.n_seq_max);
+  }
+
+  static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_seq_max);
+  }
+
   // n_threads
   static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1058,7 +1086,7 @@ private:
   // rope_scaling_type
   static VALUE _llama_context_params_set_rope_scaling_type(VALUE self, VALUE scaling_type) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.rope_scaling_type = NUM2INT(scaling_type);
+    ptr->params.rope_scaling_type = static_cast<enum llama_rope_scaling_type>(NUM2INT(scaling_type));
     return INT2NUM(ptr->params.rope_scaling_type);
   }
 
@@ -1067,6 +1095,18 @@ private:
     return INT2NUM(ptr->params.rope_scaling_type);
   }
 
+  // pooling_type
+  static VALUE _llama_context_params_set_pooling_type(VALUE self, VALUE scaling_type) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.pooling_type = static_cast<enum llama_pooling_type>(NUM2INT(scaling_type));
+    return INT2NUM(ptr->params.pooling_type);
+  }
+
+  static VALUE _llama_context_params_get_pooling_type(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.pooling_type);
+  }
+
   // rope_freq_base
   static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1199,16 +1239,16 @@ private:
     return ptr->params.logits_all ? Qtrue : Qfalse;
   }
 
-  // embedding
-  static VALUE _llama_context_params_set_embedding(VALUE self, VALUE embedding) {
+  // embeddings
+  static VALUE _llama_context_params_set_embeddings(VALUE self, VALUE embeddings) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.embedding = RTEST(embedding) ? true : false;
-    return ptr->params.embedding ? Qtrue : Qfalse;
+    ptr->params.embeddings = RTEST(embeddings) ? true : false;
+    return ptr->params.embeddings ? Qtrue : Qfalse;
   }
 
-  static VALUE _llama_context_params_get_embedding(VALUE self) {
+  static VALUE _llama_context_params_get_embeddings(VALUE self) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return ptr->params.embedding ? Qtrue : Qfalse;
+    return ptr->params.embeddings ? Qtrue : Qfalse;
   }
 
   // offload_kqv
@@ -1222,18 +1262,6 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
-
-  // do_pooling
-  static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
-    return ptr->params.do_pooling ? Qtrue : Qfalse;
-  }
-
-  static VALUE _llama_context_params_get_do_pooling(VALUE self) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return ptr->params.do_pooling ? Qtrue : Qfalse;
-  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -2016,8 +2044,11 @@ public:
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
+    rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
+    rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2032,6 +2063,8 @@ public:
     rb_define_method(rb_cLLaMAContext, "kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
     rb_define_method(rb_cLLaMAContext, "kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
+    rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
+    rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
@@ -2151,7 +2184,7 @@ private:
     LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
     VALUE params = rb_iv_get(self, "@params");
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
-    if (!prms_ptr->params.embedding) {
+    if (!prms_ptr->params.embeddings) {
       rb_raise(rb_eRuntimeError, "embedding parameter is false");
       return Qnil;
     }
@@ -2160,10 +2193,11 @@ private:
       return Qnil;
     }
 
+    const int n_tokens = NUM2INT(rb_iv_get(self, "@n_tokens"));
     const int n_embd = llama_n_embd(model_ptr->model);
     const float* embd = llama_get_embeddings(ptr->ctx);
     VALUE output = rb_ary_new();
-    for (int i = 0; i < n_embd; i++) {
+    for (int i = 0; i < n_tokens * n_embd; i++) {
       rb_ary_push(output, DBL2NUM((double)(embd[i])));
     }
 
@@ -2182,7 +2216,7 @@ private:
     }
     VALUE params = rb_iv_get(self, "@params");
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
-    if (!prms_ptr->params.embedding) {
+    if (!prms_ptr->params.embeddings) {
       rb_raise(rb_eRuntimeError, "embedding parameter is false");
       return Qnil;
     }
@@ -2200,6 +2234,36 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_embeddings_seq(VALUE self, VALUE seq_id) {
+    if (!RB_INTEGER_TYPE_P(seq_id)) {
+      rb_raise(rb_eArgError, "seq_id must be an integer");
+      return Qnil;
+    }
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    VALUE params = rb_iv_get(self, "@params");
+    LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+    if (!prms_ptr->params.embeddings) {
+      rb_raise(rb_eRuntimeError, "embedding parameter is false");
+      return Qnil;
+    }
+
+    VALUE model = rb_iv_get(self, "@model");
+    LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+    const int n_embd = llama_n_embd(model_ptr->model);
+
+    VALUE output = rb_ary_new();
+    const float* embd = llama_get_embeddings_seq(ptr->ctx, NUM2INT(seq_id));
+    for (int i = 0; i < n_embd; i++) {
+      rb_ary_push(output, DBL2NUM((double)(embd[i])));
+    }
+
+    return output;
+  }
+
   static VALUE _llama_context_n_ctx(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2218,6 +2282,24 @@ private:
     return UINT2NUM(llama_n_batch(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ubatch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_ubatch(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_seq_max(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_seq_max(ptr->ctx));
+  }
+
   static VALUE _llama_context_get_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2363,6 +2445,26 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
+    return Qnil;
+  }
+
+  static VALUE _llama_context_synchronize(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_synchronize(ptr->ctx);
+    return Qnil;
+  }
+
   static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[1] = { rb_intern("session_path") };
@@ -3172,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
   rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
@@ -3229,6 +3332,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
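Two behavior changes in this file are easy to miss among the registrations: `Context#embeddings` now returns `n_tokens * n_embd` floats instead of `n_embd` (the 0.14.0 fix noted in the changelog), and the new `Context#embeddings_seq` returns one pooled `n_embd`-sized vector for a sequence id; both raise unless the `embeddings` context parameter is true. A hedged sketch of the difference, assuming a decoded batch and that the gem exposes `Model#n_embd`:

```ruby
# After context.decode(batch) with params.embeddings = true:
n_embd = model.n_embd                     # assumed accessor on Model
flat = context.embeddings                 # n_tokens * n_embd floats since 0.14.0
per_token = flat.each_slice(n_embd).to_a  # one n_embd-sized vector per token

first = context.embeddings_ith(0)   # embedding of token 0
pooled = context.embeddings_seq(0)  # new in 0.14.0: pooled embedding of sequence 0
```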
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.13.0'
+  VERSION = '0.14.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2303'
+  LLAMA_CPP_VERSION = 'b2435'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_VOCAB_TYPE_NONE: Integer
   LLAMA_VOCAB_TYPE_SPM: Integer
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
@@ -50,6 +51,7 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_TYPE_YARN: Integer
   LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
 
+  LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
   LLAMA_POOLING_TYPE_NONE: Integer
   LLAMA_POOLING_TYPE_MEAN: Integer
   LLAMA_POOLING_TYPE_CLS: Integer
@@ -201,10 +203,13 @@ module LLaMACpp
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
     def embeddings_ith: (Integer) -> Array[Float]
+    def embeddings_seq: (Integer) -> Array[Float]
     def decode: (::LLaMACpp::Batch) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_batch: () -> Integer
+    def n_ubatch: () -> Integer
+    def n_seq_max: () -> Integer
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
@@ -219,6 +224,8 @@ module LLaMACpp
     def kv_cache_defrag: () -> void
     def kv_cache_update: () -> void
     def set_rng_seed: (Integer) -> void
+    def set_causal_attn: (bool) -> void
+    def synchronize: () -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
@@ -248,12 +255,18 @@ module LLaMACpp
     def n_ctx=: (Integer) -> Integer
     def n_batch: () -> Integer
     def n_batch=: (Integer) -> Integer
+    def n_ubatch: () -> Integer
+    def n_ubatch=: (Integer) -> Integer
+    def n_seq_max: () -> Integer
+    def n_seq_max=: (Integer) -> Integer
     def n_threads: () -> Integer
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
     def n_threads_batch=: (Integer) -> Integer
     def rope_scaling_type=: (Integer) -> Integer
     def rope_scaling_type: () -> Integer
+    def pooling_type=: (Integer) -> Integer
+    def pooling_type: () -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
@@ -276,12 +289,10 @@ module LLaMACpp
     def type_v: () -> Integer
     def logits_all: () -> bool
     def logits_all=: (bool) -> bool
-    def embedding: () -> bool
-    def embedding=: (bool) -> bool
+    def embeddings: () -> bool
+    def embeddings=: (bool) -> bool
     def offload_kqv: () -> bool
     def offload_kqv=: (bool) -> bool
-    def do_pooling: () -> bool
-    def do_pooling=: (bool) -> bool
   end
 
   class ModelQuantizeParams
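For callers upgrading from 0.13.0, the signature changes above reduce to a rename plus an enum swap; a before/after sketch (the constant choice is illustrative):

```ruby
# 0.13.0
params.embedding = true
params.do_pooling = true

# 0.14.x
params.embeddings = true
params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN  # or _NONE, _CLS, _UNSPECIFIED
```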
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
 MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
+ifdef LLAMA_SCHED_MAX_COPIES
+MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
 MK_CFLAGS += -O0 -g
 MK_CXXFLAGS += -O0 -g
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
 MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
 
+ifdef LLAMA_SERVER_SSL
+MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+MK_LDFLAGS += -lssl -lcrypto
+endif
 
 ifdef LLAMA_CODE_COVERAGE
 MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 ifdef LLAMA_CUDA_CCBIN
 MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
 ifdef JETSON_EOL_MODULE_DETECT
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
+ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib"
-	@echo ".globl _ggml_metallib_start"
-	@echo "_ggml_metallib_start:"
-	@echo ".incbin \"
-	@echo ".globl _ggml_metallib_end"
-	@echo "_ggml_metallib_end:"
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+unicode.o: unicode.cpp unicode.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+
+llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
@@ -725,14 +737,17 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)