llama_cpp 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f1fcd28849baae5e90c466665aff4fe5da1d848193ebcf74c3fe333c5674191c
+  data.tar.gz: fcb0c64528d24c5cfad677f17bfd6e1e817a4b8279317ca5b2113302735598b9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c70b5f919feb7a585efbe21b3360254c2f5789504cd73fecee12fd686483c77eeb763ed91a8e7434d5852208555a78f168b358d0895f15b1ea7e774d36d6910a
+  data.tar.gz: f554ad58fc9d68c39b80995b7f424468386b32a5847dbdefbceb1cba53ff7182da35be8599523d82a6daa8fee23667d07e06faedc4c727d52e8fc594d0bc7d3f
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,31 @@
+## [[0.3.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.1...v0.3.2)] - 2023-07-08
+
+- Bump bundled llama.cpp from master-b8c8dda to master-481f793.
+- Add `Timings` class and `timings` method to `Context`:
+  ```ruby
+  require 'llama_cpp'
+
+  # ...
+
+  context = LLaMACpp::Context.new(model: model)
+  timings = context.timings
+
+  puts timings.class
+  # => LLaMACpp::Timings
+  puts timings.t_load_ms
+  # => 79.61
+  ```
+- Expose sampling options as the arguments of the `generate` module function:
+  ```ruby
+  require 'llama_cpp'
+
+  # ...
+
+  LLaMACpp.generate(context, 'Hello, world.', top_k: 30, top_p: 0.8, temperature: 0.9)
+  ```
+- Add `ModelQuantizeParams` class; this class had not been published because the author forgot to call `rb_define_class`.
+- Minor updates to example scripts, configuration files, and documentation.
+
 ## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
 
 - Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
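The re-published `ModelQuantizeParams` class pairs with the `model_quantize` module function that `Init_llama_cpp` registers (see the llama_cpp.cpp diff below). A minimal usage sketch follows; the keyword argument names, the `ftype` accessor, and the `LLAMA_FTYPE_MOSTLY_Q4_0` constant are assumptions not shown in this diff:

```ruby
require 'llama_cpp'

# Hypothetical sketch: quantize a full-precision model file.
# Argument names and the ftype constant below are assumptions.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0 # target quantization type

LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                        output_path: 'ggml-model-q4_0.bin',
                        params: params)
```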
data/README.md
CHANGED
@@ -68,6 +68,15 @@ User:
 
 [chat example screenshot]
 
+Japanese chat is also possible using the [Vicuna model on Hugging Face](https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized).
+
+```sh
+$ wget https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_0.bin
+$ ruby chat.rb --model ggml-vicuna-7b-1.1-q4_0.bin --file prompt_jp.txt
+```
+
+[Japanese chat example screenshot]
+
 ## Contributing
 
 Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/llama_cpp.rb.
data/examples/chat.rb
CHANGED
@@ -33,7 +33,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
     params = LLaMACpp::ContextParams.new
-    params.seed = options[:seed]
+    params.seed = options[:seed] if options[:seed] != -1
     params.n_gpu_layers = options[:n_gpu_layers]
     model = LLaMACpp::Model.new(model_path: options[:model], params: params)
     context = LLaMACpp::Context.new(model: model)
data/examples/embedding.rb
CHANGED
@@ -18,7 +18,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
   def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
     params = LLaMACpp::ContextParams.new
-    params.seed = options[:seed]
+    params.seed = options[:seed] if options[:seed] != -1
     params.n_gpu_layers = options[:n_gpu_layers]
     params.embedding = true
     model = LLaMACpp::Model.new(model_path: options[:model], params: params)
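Both example scripts now leave `ContextParams#seed` untouched when the `--seed` option keeps its default. A minimal sketch of the intent, assuming the option defaults to -1 and that an unset seed lets llama.cpp choose one itself:

```ruby
params = LLaMACpp::ContextParams.new

seed = options[:seed]              # -1 unless the user passes --seed explicitly
params.seed = seed if seed != -1   # only fix the RNG seed when requested
```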
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -7,8 +7,8 @@ abort 'libstdc++ is not found.' unless have_library('stdc++')
 
 $srcs = %w[ggml.c llama.cpp llama_cpp.cpp]
 $srcs << 'ggml-opencl.cpp' if with_config('clblast')
-$CFLAGS << ' -w'
-$CXXFLAGS << ' -std=c++11'
+$CFLAGS << ' -w -DNDEBUG'
+$CXXFLAGS << ' -std=c++11 -DNDEBUG'
 $INCFLAGS << ' -I$(srcdir)/src'
 $VPATH << '$(srcdir)/src'
 
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1,8 +1,8 @@
-
 #include "llama_cpp.h"
 
 VALUE rb_mLLaMACpp;
 VALUE rb_cLLaMAModel;
+VALUE rb_cLLaMATimings;
 VALUE rb_cLLaMAContext;
 VALUE rb_cLLaMAContextParams;
 VALUE rb_cLLaMAModelQuantizeParams;
@@ -256,6 +256,111 @@ const rb_data_type_t RbLLaMATokenDataArray::llama_token_data_array_type = {
   RUBY_TYPED_FREE_IMMEDIATELY
 };
 
+class LLaMATimingsWrapper {
+public:
+  struct llama_timings timings;
+
+  LLaMATimingsWrapper(){};
+
+  ~LLaMATimingsWrapper(){};
+};
+
+class RbLLaMATimings {
+public:
+  static VALUE llama_timings_alloc(VALUE self) {
+    LLaMATimingsWrapper* ptr = (LLaMATimingsWrapper*)ruby_xmalloc(sizeof(LLaMATimingsWrapper));
+    new (ptr) LLaMATimingsWrapper();
+    return TypedData_Wrap_Struct(self, &llama_timings_type, ptr);
+  }
+
+  static void llama_timings_free(void* ptr) {
+    ((LLaMATimingsWrapper*)ptr)->~LLaMATimingsWrapper();
+    ruby_xfree(ptr);
+  }
+
+  static size_t llama_timings_size(const void* ptr) {
+    return sizeof(*((LLaMATimingsWrapper*)ptr));
+  }
+
+  static LLaMATimingsWrapper* get_llama_timings(VALUE self) {
+    LLaMATimingsWrapper* ptr;
+    TypedData_Get_Struct(self, LLaMATimingsWrapper, &llama_timings_type, ptr);
+    return ptr;
+  }
+
+  static void define_class(VALUE outer) {
+    rb_cLLaMATimings = rb_define_class_under(outer, "Timings", rb_cObject);
+    rb_define_alloc_func(rb_cLLaMATimings, llama_timings_alloc);
+    rb_define_method(rb_cLLaMATimings, "t_start_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_start_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_end_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_end_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_load_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_load_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_sample_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_sample_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_p_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_p_eval_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_eval_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "n_sample", RUBY_METHOD_FUNC(_llama_timings_get_n_sample), 0);
+    rb_define_method(rb_cLLaMATimings, "n_p_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_p_eval), 0);
+    rb_define_method(rb_cLLaMATimings, "n_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_eval), 0);
+  }
+
+private:
+  static const rb_data_type_t llama_timings_type;
+
+  static VALUE _llama_timings_get_t_start_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_start_ms);
+  }
+
+  static VALUE _llama_timings_get_t_end_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_end_ms);
+  }
+
+  static VALUE _llama_timings_get_t_load_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_load_ms);
+  }
+
+  static VALUE _llama_timings_get_t_sample_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_sample_ms);
+  }
+
+  static VALUE _llama_timings_get_t_p_eval_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_p_eval_ms);
+  }
+
+  static VALUE _llama_timings_get_t_eval_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_eval_ms);
+  }
+
+  static VALUE _llama_timings_get_n_sample(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return INT2NUM(ptr->timings.n_sample);
+  }
+
+  static VALUE _llama_timings_get_n_p_eval(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return INT2NUM(ptr->timings.n_p_eval);
+  }
+
+  static VALUE _llama_timings_get_n_eval(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return INT2NUM(ptr->timings.n_eval);
+  }
+};
+
+const rb_data_type_t RbLLaMATimings::llama_timings_type = {
+  "RbLLaMATimings",
+  { NULL,
+    RbLLaMATimings::llama_timings_free,
+    RbLLaMATimings::llama_timings_size },
+  NULL,
+  NULL,
+  RUBY_TYPED_FREE_IMMEDIATELY
+};
+
 class LLaMAContextParamsWrapper {
 public:
   struct llama_context_params params;
@@ -866,6 +971,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
+    rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
     rb_define_method(rb_cLLaMAContext, "kv_cache_token_count", RUBY_METHOD_FUNC(_llama_context_kv_cache_token_count), 0);
@@ -1227,6 +1333,18 @@ private:
     return INT2NUM(llama_n_embd(ptr->ctx));
   };
 
+  static VALUE _llama_context_get_timings(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    VALUE tm_obj = rb_funcall(rb_cLLaMATimings, rb_intern("new"), 0);
+    LLaMATimingsWrapper* tm_ptr = RbLLaMATimings::get_llama_timings(tm_obj);
+    tm_ptr->timings = llama_get_timings(ptr->ctx);
+    return tm_obj;
+  }
+
   static VALUE _llama_context_print_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -1898,8 +2016,10 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMATokenData::define_class(rb_mLLaMACpp);
   RbLLaMATokenDataArray::define_class(rb_mLLaMACpp);
   RbLLaMAModel::define_class(rb_mLLaMACpp);
+  RbLLaMATimings::define_class(rb_mLLaMACpp);
  RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
+  RbLLaMAModelQuantizeParams::define_class(rb_mLLaMACpp);
 
   rb_define_module_function(rb_mLLaMACpp, "init_backend", rb_llama_llama_init_backend, -1);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);