llama_cpp 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +5 -4
- data/ext/llama_cpp/extconf.rb +38 -0
- data/ext/llama_cpp/llama_cpp.cpp +118 -2
- data/ext/llama_cpp/src/ggml.c +1740 -658
- data/ext/llama_cpp/src/ggml.h +84 -16
- data/ext/llama_cpp/src/llama.cpp +1108 -756
- data/ext/llama_cpp/src/llama.h +37 -1
- data/ext/llama_cpp/src/llama_util.h +396 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +6 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2df0c858faac117b7317683fb7b9a52fc0eb4f7329f728ac6a209085af487142
|
4
|
+
data.tar.gz: 6b5c5d5d5d4e9020b92c7d76c12086fd77089ecc9c2181fb9d8157df5267da96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e9d3ccdb8cdc9d4cb7b60f32a709c874953c357fdaccc057502e5761efdec62a0fc0b39929448203ffc4210dbf0ca2f6019dc13f88cf0db84b754f44fd77bea
|
7
|
+
data.tar.gz: 75fc1d6674c8d509ae0557308277d6d3d7e05f5a6fbea512c2472c46bea1de6e2541a67ec3dda43f874d7f64e6981b720aa1c722d3ec7ea3b96ae9084a4d201b
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,41 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
|
4
|
+
|
5
|
+
- Bump bundled llama.cpp from master-c85e03d to master-315a95a.
|
6
|
+
- Add `apply_lora_from_file` method to LLaMACpp::Context.
|
7
|
+
- Add `mlock_supported?` module function to LLaMACpp.
|
8
|
+
- Add `mmap_supported?` module function to LLaMACpp.
|
9
|
+
- Fix to not destroy original prompt in `LLaMACpp.generate` module function.
|
10
|
+
- Add check for context initialization.
|
11
|
+
- Add blas config options:
|
12
|
+
```
|
13
|
+
$ gem install llama_cpp -- --with-openblas
|
14
|
+
```
|
15
|
+
macOS:
|
16
|
+
```
|
17
|
+
$ gem install llama_cpp -- --with-openblas --with-opt-dir=/opt/homebrew/opt/openblas
|
18
|
+
$ gem install llama_cpp -- --with-accelerate
|
19
|
+
```
|
20
|
+
|
21
|
+
## [[0.0.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.3...v0.0.4)] - 2023-04-15
|
22
|
+
|
23
|
+
- Bump bundled llama.cpp from master-698f7b5 to master-c85e03d.
|
24
|
+
- Add parameterless constructor to LLaMACpp::Context.
|
25
|
+
- Add free and load methods to LLaMACpp::Context.
|
26
|
+
```ruby
|
27
|
+
require 'llama_cpp'
|
28
|
+
|
29
|
+
context = LLaMACpp::Context.new
|
30
|
+
|
31
|
+
params = LLaMACpp::ContextParams.new
|
32
|
+
context.load(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
|
33
|
+
|
34
|
+
# ...
|
35
|
+
|
36
|
+
context.free
|
37
|
+
```
|
38
|
+
|
3
39
|
## [[0.0.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.2...v0.0.3)] - 2023-04-08
|
4
40
|
|
5
41
|
- Bump bundled llama.cpp from master-5b70e7d to master-698f7b5.
|
data/README.md
CHANGED
@@ -20,17 +20,18 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
Prepare
|
23
|
+
Prepare the quantized model by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage) or
|
24
|
+
download the quantized model, for example [ggml-vicuna-7b-4bit](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5541351), from Hugging Face.
|
24
25
|
|
25
26
|
```ruby
|
26
27
|
require 'llama_cpp'
|
27
28
|
|
28
29
|
params = LLaMACpp::ContextParams.new
|
29
|
-
params.seed =
|
30
|
+
params.seed = 12
|
30
31
|
|
31
|
-
context = LLaMACpp::Context.new(model_path: '/path/to/
|
32
|
+
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
|
32
33
|
|
33
|
-
puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.')
|
34
|
+
puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)
|
34
35
|
# => "There are two major cities in Japan, Tokyo and Osaka, which have about 30 million populations."
|
35
36
|
```
|
36
37
|
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -10,4 +10,42 @@ $CXXFLAGS << ' -std=c++11'
|
|
10
10
|
$INCFLAGS << ' -I$(srcdir)/src'
|
11
11
|
$VPATH << '$(srcdir)/src'
|
12
12
|
|
13
|
+
# Enable pthread on POSIX-like platforms, but only when the compiler accepts the flag.
if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>', '-pthread')
  [$CFLAGS, $CXXFLAGS].each { |flags| flags << ' -pthread' }
end

# --with-openblas: both the library and its header must be present before
# the GGML_USE_OPENBLAS define is added.
if with_config('openblas')
  abort 'libopenblas is not found.' unless have_library('openblas')
  abort 'cblas.h is not found.' unless have_header('cblas.h')

  $CFLAGS << ' -DGGML_USE_OPENBLAS'
end

# --with-accelerate: add the GGML_USE_ACCELERATE define and link the Accelerate framework.
if with_config('accelerate')
  $CFLAGS << ' -DGGML_USE_ACCELERATE'
  $LDFLAGS << ' -framework Accelerate'
end

UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']

# rubocop:disable Layout/LineLength
# CPU-specific tuning flags, tried in order. The first entry whose pattern
# matches UNAME_M and whose flags pass a trial compile is applied to both
# the C and C++ flag sets; if none qualifies, nothing is added.
arch_flags = [
  [/x86_64|i686/, '-march=native -mtune=native'],
  [/aarch64/, '-mcpu=native'],
  [/armv6/, '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access'],
  [/armv7/, '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations'],
  [/armv8/, '-mfp16-format=ieee -mno-unaligned-access']
]

_, tuning = arch_flags.find do |pattern, flags|
  UNAME_M.match?(pattern) && try_compile('#include <stdio.h>', flags)
end
[$CFLAGS, $CXXFLAGS].each { |flags| flags << " #{tuning}" } if tuning
# rubocop:enable Layout/LineLength
|
50
|
+
|
13
51
|
create_makefile('llama_cpp/llama_cpp')
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -226,6 +226,9 @@ public:
|
|
226
226
|
rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
|
227
227
|
rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
|
228
228
|
rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
|
229
|
+
rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
|
230
|
+
rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
|
231
|
+
rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
|
229
232
|
};
|
230
233
|
|
231
234
|
private:
|
@@ -236,7 +239,13 @@ private:
|
|
236
239
|
ID kw_table[2] = { rb_intern("model_path"), rb_intern("params") };
|
237
240
|
VALUE kw_values[2] = { Qundef, Qundef };
|
238
241
|
rb_scan_args(argc, argv, ":", &kw_args);
|
239
|
-
rb_get_kwargs(kw_args, kw_table,
|
242
|
+
rb_get_kwargs(kw_args, kw_table, 0, 2, kw_values);
|
243
|
+
|
244
|
+
if (kw_values[0] == Qundef && kw_values[1] == Qundef) {
|
245
|
+
rb_iv_set(self, "@params", Qnil);
|
246
|
+
rb_iv_set(self, "@has_evaluated", Qfalse);
|
247
|
+
return Qnil;
|
248
|
+
}
|
240
249
|
|
241
250
|
if (!RB_TYPE_P(kw_values[0], T_STRING)) {
|
242
251
|
rb_raise(rb_eArgError, "model_path must be a string");
|
@@ -260,7 +269,7 @@ private:
|
|
260
269
|
rb_iv_set(self, "@has_evaluated", Qfalse);
|
261
270
|
|
262
271
|
RB_GC_GUARD(filename);
|
263
|
-
return
|
272
|
+
return Qnil;
|
264
273
|
};
|
265
274
|
|
266
275
|
static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
|
@@ -303,6 +312,10 @@ private:
|
|
303
312
|
const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
|
304
313
|
|
305
314
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
315
|
+
if (ptr->ctx == NULL) {
|
316
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
317
|
+
return Qnil;
|
318
|
+
}
|
306
319
|
if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
|
307
320
|
rb_raise(rb_eRuntimeError, "Failed to evaluate");
|
308
321
|
return Qnil;
|
@@ -341,6 +354,10 @@ private:
|
|
341
354
|
|
342
355
|
std::vector<llama_token> tokens(n_max_tokens);
|
343
356
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
357
|
+
if (ptr->ctx == NULL) {
|
358
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
359
|
+
return Qnil;
|
360
|
+
}
|
344
361
|
const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
|
345
362
|
if (n < 0) {
|
346
363
|
rb_raise(rb_eRuntimeError, "Failed to tokenize");
|
@@ -441,6 +458,10 @@ private:
|
|
441
458
|
}
|
442
459
|
|
443
460
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
461
|
+
if (ptr->ctx == NULL) {
|
462
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
463
|
+
return Qnil;
|
464
|
+
}
|
444
465
|
llama_token token = llama_sample_top_p_top_k(ptr->ctx, last_n_tokens_data.data(), last_n_tokens_size, top_k, top_p, temp, penalty);
|
445
466
|
|
446
467
|
return INT2NUM(token);
|
@@ -492,6 +513,91 @@ private:
|
|
492
513
|
llama_reset_timings(ptr->ctx);
|
493
514
|
return Qnil;
|
494
515
|
};
|
516
|
+
|
517
|
+
// Releases the native llama context held by this object, if there is one.
// When a context is released, the wrapper pointer is cleared, @params is
// reset to nil and @has_evaluated to false. Calling this with no context
// held does nothing. Always returns nil.
static VALUE _llama_context_free(VALUE self) {
  LLaMAContextWrapper* wrapper = get_llama_context(self);
  if (wrapper->ctx == NULL) {
    return Qnil;
  }

  llama_free(wrapper->ctx);
  wrapper->ctx = NULL;  // prevents a second llama_free on the same pointer
  rb_iv_set(self, "@params", Qnil);
  rb_iv_set(self, "@has_evaluated", Qfalse);
  return Qnil;
}
|
527
|
+
|
528
|
+
// Loads a model file into a context object that does not hold one yet.
//
// Keyword arguments (both required):
//   model_path: String path handed to llama_init_from_file.
//   params:     an instance of the context-params class (rb_cLLaMAContextParams).
//
// Raises ArgumentError when either argument has the wrong type, and
// RuntimeError when a context is already loaded or llama_init_from_file
// returns NULL. On success stores params in @params, resets @has_evaluated
// to false and returns nil.
static VALUE _llama_context_load(int argc, VALUE* argv, VALUE self) {
  ID keywords[2] = { rb_intern("model_path"), rb_intern("params") };
  VALUE values[2] = { Qundef, Qundef };
  VALUE options = Qnil;
  rb_scan_args(argc, argv, ":", &options);
  rb_get_kwargs(options, keywords, 2, 0, values);

  VALUE filename = values[0];
  VALUE params = values[1];

  if (!RB_TYPE_P(filename, T_STRING)) {
    rb_raise(rb_eArgError, "model_path must be a string");
    return Qnil;
  }
  if (!rb_obj_is_kind_of(params, rb_cLLaMAContextParams)) {
    rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
    return Qnil;
  }

  LLaMAContextWrapper* wrapper = get_llama_context(self);
  if (wrapper->ctx != NULL) {
    rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
    return Qnil;
  }

  LLaMAContextParamsWrapper* params_wrapper = RbLLaMAContextParams::get_llama_context_params(params);
  wrapper->ctx = llama_init_from_file(StringValueCStr(filename), params_wrapper->params);
  if (wrapper->ctx == NULL) {
    rb_raise(rb_eRuntimeError, "Failed to initialize LLaMA context");
    return Qnil;
  }

  rb_iv_set(self, "@params", params);
  rb_iv_set(self, "@has_evaluated", Qfalse);

  // Keep the path String alive while its C pointer was in use above.
  RB_GC_GUARD(filename);
  return Qnil;
}
|
564
|
+
|
565
|
+
// Applies a LoRA adapter file to the model held by this context.
//
// Keyword arguments:
//   lora_path:       String, required. Path of the LoRA adapter file.
//   base_model_path: String, optional. NULL is passed to llama.cpp when omitted.
//   n_threads:       Integer, optional. Defaults to 1.
//
// Raises ArgumentError when an argument has the wrong type, and RuntimeError
// when no context has been loaded yet or llama_apply_lora_from_file reports
// failure (non-zero return). Returns nil on success.
static VALUE _llama_context_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads") };
  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "lora_path must be a string");
    return Qnil;
  }
  if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "base_model_path must be a string");
    return Qnil;
  }
  if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
    rb_raise(rb_eArgError, "n_threads must be an integer");
    return Qnil;
  }

  const char* lora_path = StringValueCStr(kw_values[0]);
  const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
  const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);

  // A LoRA adapter can only be applied to an existing context. The previous
  // check was inverted: it rejected loaded contexts and passed a NULL
  // context to llama_apply_lora_from_file otherwise.
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
    rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
    return Qnil;
  }

  if (llama_apply_lora_from_file(ptr->ctx, lora_path, base_model_path, n_threads) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
    return Qnil;
  }
  return Qnil;
}
|
495
601
|
};
|
496
602
|
|
497
603
|
const rb_data_type_t RbLLaMAContext::llama_context_type = {
|
@@ -519,6 +625,14 @@ static VALUE rb_llama_print_system_info(VALUE self) {
|
|
519
625
|
return rb_utf8_str_new_cstr(result);
|
520
626
|
}
|
521
627
|
|
628
|
+
// Exposes llama_mmap_supported() to Ruby: returns true when it reports a
// truthy value, false otherwise.
static VALUE rb_llama_mmap_supported(VALUE self) {
  if (llama_mmap_supported()) {
    return Qtrue;
  }
  return Qfalse;
}
|
631
|
+
|
632
|
+
// Exposes llama_mlock_supported() to Ruby: returns true when it reports a
// truthy value, false otherwise.
static VALUE rb_llama_mlock_supported(VALUE self) {
  if (llama_mlock_supported()) {
    return Qtrue;
  }
  return Qfalse;
}
|
635
|
+
|
522
636
|
extern "C" void Init_llama_cpp(void) {
|
523
637
|
rb_mLLaMACpp = rb_define_module("LLaMACpp");
|
524
638
|
RbLLaMAContext::define_class(rb_mLLaMACpp);
|
@@ -527,6 +641,8 @@ extern "C" void Init_llama_cpp(void) {
|
|
527
641
|
rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
|
528
642
|
rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
|
529
643
|
rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
|
644
|
+
rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
|
645
|
+
rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
|
530
646
|
|
531
647
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
|
532
648
|
std::stringstream ss_magic;
|