llama_cpp 0.3.1 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f1fcd28849baae5e90c466665aff4fe5da1d848193ebcf74c3fe333c5674191c
+  data.tar.gz: fcb0c64528d24c5cfad677f17bfd6e1e817a4b8279317ca5b2113302735598b9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c70b5f919feb7a585efbe21b3360254c2f5789504cd73fecee12fd686483c77eeb763ed91a8e7434d5852208555a78f168b358d0895f15b1ea7e774d36d6910a
+  data.tar.gz: f554ad58fc9d68c39b80995b7f424468386b32a5847dbdefbceb1cba53ff7182da35be8599523d82a6daa8fee23667d07e06faedc4c727d52e8fc594d0bc7d3f
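To check a downloaded package against the new values, a minimal verification sketch; the local `data.tar.gz` path assumes the gem has been fetched and unpacked first (e.g. with `gem fetch llama_cpp -v 0.3.2` and `tar xf`):

```ruby
require 'digest'

# Compare the unpacked data.tar.gz against the SHA256 recorded above.
expected = 'fcb0c64528d24c5cfad677f17bfd6e1e817a4b8279317ca5b2113302735598b9'
actual = Digest::SHA256.file('data.tar.gz').hexdigest
puts(actual == expected ? 'checksum OK' : 'checksum mismatch')
```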
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,31 @@
+## [[0.3.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.1...v0.3.2)] - 2023-07-08
+
+- Bump bundled llama.cpp from master-b8c8dda to master-481f793.
+- Add `Timings` class and `timings` method to `Context`:
+  ```ruby
+  require 'llama_cpp'
+
+  # ...
+
+  context = LLaMACpp::Context.new(model: model)
+  timings = context.timings
+
+  puts timings.class
+  # => LLaMACpp::Timings
+  puts timings.t_load_ms
+  # => 79.61
+  ```
+- Expose sampling options as arguments of the `generate` module function:
+  ```ruby
+  require 'llama_cpp'
+
+  # ...
+
+  LLaMACpp.generate(context, 'Hello, world.', top_k: 30, top_p: 0.8, temperature: 0.9)
+  ```
+- Add `ModelQuantizeParams` class; it had not been published before because the author forgot to call `rb_define_class`.
+- Minor updates to example scripts, configuration files, and documentation.
+
 ## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
 
 - Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
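With `ModelQuantizeParams` now registered, it can be combined with the existing `model_quantize` module function. A hypothetical sketch follows; the `ftype` accessor, the `LLAMA_FTYPE_MOSTLY_Q4_0` constant, and the keyword arguments are assumptions modeled on llama.cpp's `llama_model_quantize_params` struct, not confirmed API:

```ruby
require 'llama_cpp'

# Hypothetical usage: quantize an f16 model file to q4_0.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0 # target quantization type (assumed name)
LLaMACpp.model_quantize(input_path: 'model-f16.bin',
                        output_path: 'model-q4_0.bin',
                        params: params)
```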
data/README.md
CHANGED
@@ -68,6 +68,15 @@ User:
 
 ![llama_cpp_chat_example](https://github.com/yoshoku/llama_cpp.rb/assets/5562409/374ae3d8-63a6-498f-ae6e-5552b464bdda)
 
+Japanese chat is also possible using the [Vicuna model on Hugging Face](https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized).
+
+```sh
+$ wget https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_0.bin
+$ ruby chat.rb --model ggml-vicuna-7b-1.1-q4_0.bin --file prompt_jp.txt
+```
+
+![llama_cpp rb-jpchat](https://github.com/yoshoku/llama_cpp.rb/assets/5562409/526ff18c-2bb2-4b06-8933-f72960024033)
+
 ## Contributing
 
 Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/llama_cpp.rb.
data/examples/chat.rb
CHANGED
@@ -33,7 +33,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
     params = LLaMACpp::ContextParams.new
-    params.seed = options[:seed]
+    params.seed = options[:seed] if options[:seed] != -1
     params.n_gpu_layers = options[:n_gpu_layers]
    model = LLaMACpp::Model.new(model_path: options[:model], params: params)
     context = LLaMACpp::Context.new(model: model)
data/examples/embedding.rb
CHANGED
@@ -18,7 +18,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
   def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
     params = LLaMACpp::ContextParams.new
-    params.seed = options[:seed]
+    params.seed = options[:seed] if options[:seed] != -1
     params.n_gpu_layers = options[:n_gpu_layers]
     params.embedding = true
     model = LLaMACpp::Model.new(model_path: options[:model], params: params)
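In both example scripts, the new guard follows llama.cpp's convention that a seed of -1 means "pick a random seed": rather than passing -1 through, the scripts now leave `ContextParams#seed` at its default unless the user supplies an explicit value. A minimal sketch of the effect (values are illustrative):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
seed = -1                        # the examples' CLI default: "use a random seed"
params.seed = seed if seed != -1 # -1 keeps the default, randomizing behavior
```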
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -7,8 +7,8 @@ abort 'libstdc++ is not found.' unless have_library('stdc++')
 
 $srcs = %w[ggml.c llama.cpp llama_cpp.cpp]
 $srcs << 'ggml-opencl.cpp' if with_config('clblast')
-$CFLAGS << ' -w'
-$CXXFLAGS << ' -std=c++11'
+$CFLAGS << ' -w -DNDEBUG'
+$CXXFLAGS << ' -std=c++11 -DNDEBUG'
 $INCFLAGS << ' -I$(srcdir)/src'
 $VPATH << '$(srcdir)/src'
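Adding `-DNDEBUG` compiles out standard `assert()` checks in the bundled C/C++ sources, in line with how upstream llama.cpp builds release binaries. For local debugging of the native extension, one hypothetical variant makes the define opt-out via mkmf's `with_config` (the `--with-debug` flag name is an assumption, not part of the gem):

```ruby
require 'mkmf'

# Hypothetical sketch: `gem install llama_cpp -- --with-debug` would keep
# assertions enabled; with_config('debug') reads that install-time flag.
unless with_config('debug')
  $CFLAGS << ' -DNDEBUG'
  $CXXFLAGS << ' -DNDEBUG'
end
```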
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1,8 +1,8 @@
-
 #include "llama_cpp.h"
 
 VALUE rb_mLLaMACpp;
 VALUE rb_cLLaMAModel;
+VALUE rb_cLLaMATimings;
 VALUE rb_cLLaMAContext;
 VALUE rb_cLLaMAContextParams;
 VALUE rb_cLLaMAModelQuantizeParams;
@@ -256,6 +256,111 @@ const rb_data_type_t RbLLaMATokenDataArray::llama_token_data_array_type = {
   RUBY_TYPED_FREE_IMMEDIATELY
 };
 
+class LLaMATimingsWrapper {
+public:
+  struct llama_timings timings;
+
+  LLaMATimingsWrapper(){};
+
+  ~LLaMATimingsWrapper(){};
+};
+
+class RbLLaMATimings {
+public:
+  static VALUE llama_timings_alloc(VALUE self) {
+    LLaMATimingsWrapper* ptr = (LLaMATimingsWrapper*)ruby_xmalloc(sizeof(LLaMATimingsWrapper));
+    new (ptr) LLaMATimingsWrapper();
+    return TypedData_Wrap_Struct(self, &llama_timings_type, ptr);
+  }
+
+  static void llama_timings_free(void* ptr) {
+    ((LLaMATimingsWrapper*)ptr)->~LLaMATimingsWrapper();
+    ruby_xfree(ptr);
+  }
+
+  static size_t llama_timings_size(const void* ptr) {
+    return sizeof(*((LLaMATimingsWrapper*)ptr));
+  }
+
+  static LLaMATimingsWrapper* get_llama_timings(VALUE self) {
+    LLaMATimingsWrapper* ptr;
+    TypedData_Get_Struct(self, LLaMATimingsWrapper, &llama_timings_type, ptr);
+    return ptr;
+  }
+
+  static void define_class(VALUE outer) {
+    rb_cLLaMATimings = rb_define_class_under(outer, "Timings", rb_cObject);
+    rb_define_alloc_func(rb_cLLaMATimings, llama_timings_alloc);
+    rb_define_method(rb_cLLaMATimings, "t_start_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_start_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_end_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_end_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_load_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_load_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_sample_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_sample_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_p_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_p_eval_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "t_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_eval_ms), 0);
+    rb_define_method(rb_cLLaMATimings, "n_sample", RUBY_METHOD_FUNC(_llama_timings_get_n_sample), 0);
+    rb_define_method(rb_cLLaMATimings, "n_p_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_p_eval), 0);
+    rb_define_method(rb_cLLaMATimings, "n_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_eval), 0);
+  }
+
+private:
+  static const rb_data_type_t llama_timings_type;
+
+  static VALUE _llama_timings_get_t_start_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_start_ms);
+  }
+
+  static VALUE _llama_timings_get_t_end_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_end_ms);
+  }
+
+  static VALUE _llama_timings_get_t_load_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_load_ms);
+  }
+
+  static VALUE _llama_timings_get_t_sample_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_sample_ms);
+  }
+
+  static VALUE _llama_timings_get_t_p_eval_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_p_eval_ms);
+  }
+
+  static VALUE _llama_timings_get_t_eval_ms(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return DBL2NUM(ptr->timings.t_eval_ms);
+  }
+
+  static VALUE _llama_timings_get_n_sample(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return INT2NUM(ptr->timings.n_sample);
+  }
+
+  static VALUE _llama_timings_get_n_p_eval(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return INT2NUM(ptr->timings.n_p_eval);
+  }
+
+  static VALUE _llama_timings_get_n_eval(VALUE self) {
+    LLaMATimingsWrapper* ptr = get_llama_timings(self);
+    return INT2NUM(ptr->timings.n_eval);
+  }
+};
+
+const rb_data_type_t RbLLaMATimings::llama_timings_type = {
+  "RbLLaMATimings",
+  { NULL,
+    RbLLaMATimings::llama_timings_free,
+    RbLLaMATimings::llama_timings_size },
+  NULL,
+  NULL,
+  RUBY_TYPED_FREE_IMMEDIATELY
+};
+
 class LLaMAContextParamsWrapper {
 public:
   struct llama_context_params params;
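On the Ruby side, the wrapper above exposes each `llama_timings` field as a reader on `LLaMACpp::Timings`. A minimal sketch of reading them, assuming `context` is an already-initialized `LLaMACpp::Context` (output values are illustrative):

```ruby
require 'llama_cpp'

# `context` is assumed to be an initialized LLaMACpp::Context.
timings = context.timings
puts format('load %.2f ms, sampling %.2f ms over %d samples',
            timings.t_load_ms, timings.t_sample_ms, timings.n_sample)
puts format('prompt eval %.2f ms (%d tokens), eval %.2f ms (%d runs)',
            timings.t_p_eval_ms, timings.n_p_eval, timings.t_eval_ms, timings.n_eval)
```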
@@ -866,6 +971,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
+    rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
     rb_define_method(rb_cLLaMAContext, "kv_cache_token_count", RUBY_METHOD_FUNC(_llama_context_kv_cache_token_count), 0);
@@ -1227,6 +1333,18 @@ private:
     return INT2NUM(llama_n_embd(ptr->ctx));
   };
 
+  static VALUE _llama_context_get_timings(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    VALUE tm_obj = rb_funcall(rb_cLLaMATimings, rb_intern("new"), 0);
+    LLaMATimingsWrapper* tm_ptr = RbLLaMATimings::get_llama_timings(tm_obj);
+    tm_ptr->timings = llama_get_timings(ptr->ctx);
+    return tm_obj;
+  }
+
   static VALUE _llama_context_print_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -1898,8 +2016,10 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMATokenData::define_class(rb_mLLaMACpp);
   RbLLaMATokenDataArray::define_class(rb_mLLaMACpp);
   RbLLaMAModel::define_class(rb_mLLaMACpp);
+  RbLLaMATimings::define_class(rb_mLLaMACpp);
   RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
+  RbLLaMAModelQuantizeParams::define_class(rb_mLLaMACpp);
 
   rb_define_module_function(rb_mLLaMACpp, "init_backend", rb_llama_llama_init_backend, -1);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);