llama_cpp 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +5 -4
- data/ext/llama_cpp/extconf.rb +38 -0
- data/ext/llama_cpp/llama_cpp.cpp +118 -2
- data/ext/llama_cpp/src/ggml.c +1740 -658
- data/ext/llama_cpp/src/ggml.h +84 -16
- data/ext/llama_cpp/src/llama.cpp +1108 -756
- data/ext/llama_cpp/src/llama.h +37 -1
- data/ext/llama_cpp/src/llama_util.h +396 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +6 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2df0c858faac117b7317683fb7b9a52fc0eb4f7329f728ac6a209085af487142
+  data.tar.gz: 6b5c5d5d5d4e9020b92c7d76c12086fd77089ecc9c2181fb9d8157df5267da96
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8e9d3ccdb8cdc9d4cb7b60f32a709c874953c357fdaccc057502e5761efdec62a0fc0b39929448203ffc4210dbf0ca2f6019dc13f88cf0db84b754f44fd77bea
+  data.tar.gz: 75fc1d6674c8d509ae0557308277d6d3d7e05f5a6fbea512c2472c46bea1de6e2541a67ec3dda43f874d7f64e6981b720aa1c722d3ec7ea3b96ae9084a4d201b
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,41 @@
 ## [Unreleased]

+## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
+
+- Bump bundled llama.cpp from master-c85e03d to master-315a95a.
+- Add `apply_lora_from_file` method to LLaMACpp::Context.
+- Add `mlock_supported?` module function to LLaMACpp.
+- Add `mmap_supported?` module function to LLaMACpp.
+- Fix to not destroy original prompt in `LLaMACpp.generate` module function.
+- Add check for context initialization.
+- Add blas config options:
+```
+$ gem install llama_cpp -- --with-openblas
+```
+macOS:
+```
+$ gem install llama_cpp -- --with-openblas --with-opt-dir=/opt/homebrew/opt/openblas
+$ gem install llama_cpp -- --with-accelerate
+```
+
+## [[0.0.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.3...v0.0.4)] - 2023-04-15
+
+- Bump bundled llama.cpp from master-698f7b5 to master-c85e03d.
+- Add parameterless constructor to LLaMACpp::Context.
+- Add free and load methods to LLaMACpp::Context.
+```ruby
+require 'llama_cpp'
+
+context = LLaMACpp::Context.new
+
+params = LLaMACpp::ContextParams.new
+context.load(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
+
+# ...
+
+context.free
+```
+
 ## [[0.0.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.2...v0.0.3)] - 2023-04-08

 - Bump bundled llama.cpp from master-5b70e7d to master-698f7b5.
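As a quick orientation for the 0.0.5 entries above: the sketch below is not taken from the gem's documentation; it shows how the two new feature probes can be called, and the commented `apply_lora_from_file` call simply restates the keyword arguments visible in the binding code further down (`lora_path` is required, `base_model_path` and `n_threads` are optional), with placeholder paths.

```ruby
require 'llama_cpp'

# Feature probes added in 0.0.5; each returns true or false depending on
# how the bundled llama.cpp was compiled.
puts LLaMACpp.mmap_supported?
puts LLaMACpp.mlock_supported?

# The new LoRA binding is a Context instance method with keyword arguments
# (names taken from the kw_table in llama_cpp.cpp below); paths are placeholders:
#   context.apply_lora_from_file(lora_path: '/path/to/lora-adapter.bin',
#                                base_model_path: '/path/to/base-model.bin',
#                                n_threads: 4)
```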
data/README.md
CHANGED
@@ -20,17 +20,18 @@ If bundler is not being used to manage dependencies, install the gem by executing

 ## Usage

-Prepare
+Prepare the quantized model by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage) or
+download the quantized model, for example [ggml-vicuna-7b-4bit](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5541351), from Hugging Face.

 ```ruby
 require 'llama_cpp'

 params = LLaMACpp::ContextParams.new
-params.seed =
+params.seed = 12

-context = LLaMACpp::Context.new(model_path: '/path/to/
+context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

-puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.')
+puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)
 # => "There are two major cities in Japan, Tokyo and Osaka, which have about 30 million populations."
 ```

data/ext/llama_cpp/extconf.rb
CHANGED
@@ -10,4 +10,42 @@ $CXXFLAGS << ' -std=c++11'
 $INCFLAGS << ' -I$(srcdir)/src'
 $VPATH << '$(srcdir)/src'

+if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>', '-pthread')
+  $CFLAGS << ' -pthread'
+  $CXXFLAGS << ' -pthread'
+end
+
+if with_config('openblas')
+  abort 'libopenblas is not found.' unless have_library('openblas')
+  abort 'cblas.h is not found.' unless have_header('cblas.h')
+
+  $CFLAGS << ' -DGGML_USE_OPENBLAS'
+end
+
+if with_config('accelerate')
+  $CFLAGS << ' -DGGML_USE_ACCELERATE'
+  $LDFLAGS << ' -framework Accelerate'
+end
+
+UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
+
+# rubocop:disable Layout/LineLength
+if UNAME_M.match?(/x86_64|i686/) && try_compile('#include <stdio.h>', '-march=native -mtune=native')
+  $CFLAGS << ' -march=native -mtune=native'
+  $CXXFLAGS << ' -march=native -mtune=native'
+elsif UNAME_M.match?(/aarch64/) && try_compile('#include <stdio.h>', '-mcpu=native')
+  $CFLAGS << ' -mcpu=native'
+  $CXXFLAGS << ' -mcpu=native'
+elsif UNAME_M.match?(/armv6/) && try_compile('#include <stdio.h>', '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access')
+  $CFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access'
+  $CXXFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access'
+elsif UNAME_M.match?(/armv7/) && try_compile('#include <stdio.h>', '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations')
+  $CFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations'
+  $CXXFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations'
+elsif UNAME_M.match?(/armv8/) && try_compile('#include <stdio.h>', '-mfp16-format=ieee -mno-unaligned-access')
+  $CFLAGS << ' -mfp16-format=ieee -mno-unaligned-access'
+  $CXXFLAGS << ' -mfp16-format=ieee -mno-unaligned-access'
+end
+# rubocop:enable Layout/LineLength
+
 create_makefile('llama_cpp/llama_cpp')
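The `--with-openblas` and `--with-accelerate` switches consumed by `with_config` above only take effect at build time. A minimal sketch, assuming the gem built successfully, for checking what the compiled extension reports at runtime; the exact contents of the string come from the bundled llama.cpp:

```ruby
require 'llama_cpp'

# Prints the feature string of the bundled llama.cpp build, which reports
# the CPU extensions and BLAS backend the native extension was compiled with.
puts LLaMACpp.print_system_info
```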
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -226,6 +226,9 @@ public:
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
+    rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
+    rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
   };

 private:

@@ -236,7 +239,13 @@ private:
     ID kw_table[2] = { rb_intern("model_path"), rb_intern("params") };
     VALUE kw_values[2] = { Qundef, Qundef };
     rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table,
+    rb_get_kwargs(kw_args, kw_table, 0, 2, kw_values);
+
+    if (kw_values[0] == Qundef && kw_values[1] == Qundef) {
+      rb_iv_set(self, "@params", Qnil);
+      rb_iv_set(self, "@has_evaluated", Qfalse);
+      return Qnil;
+    }

     if (!RB_TYPE_P(kw_values[0], T_STRING)) {
       rb_raise(rb_eArgError, "model_path must be a string");

@@ -260,7 +269,7 @@
     rb_iv_set(self, "@has_evaluated", Qfalse);

     RB_GC_GUARD(filename);
-    return
+    return Qnil;
   };

   static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {

@@ -303,6 +312,10 @@
     const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
       rb_raise(rb_eRuntimeError, "Failed to evaluate");
       return Qnil;

@@ -341,6 +354,10 @@

     std::vector<llama_token> tokens(n_max_tokens);
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
     if (n < 0) {
       rb_raise(rb_eRuntimeError, "Failed to tokenize");

@@ -441,6 +458,10 @@
     }

     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     llama_token token = llama_sample_top_p_top_k(ptr->ctx, last_n_tokens_data.data(), last_n_tokens_size, top_k, top_p, temp, penalty);

     return INT2NUM(token);

@@ -492,6 +513,91 @@
     llama_reset_timings(ptr->ctx);
     return Qnil;
   };
+
+  static VALUE _llama_context_free(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx != NULL) {
+      llama_free(ptr->ctx);
+      ptr->ctx = NULL;
+      rb_iv_set(self, "@params", Qnil);
+      rb_iv_set(self, "@has_evaluated", Qfalse);
+    }
+    return Qnil;
+  }
+
+  static VALUE _llama_context_load(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[2] = { rb_intern("model_path"), rb_intern("params") };
+    VALUE kw_values[2] = { Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "model_path must be a string");
+      return Qnil;
+    }
+    if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
+      rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx != NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
+      return Qnil;
+    }
+
+    VALUE filename = kw_values[0];
+    LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
+    ctx_ptr->ctx = llama_init_from_file(StringValueCStr(filename), prms_ptr->params);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "Failed to initialize LLaMA context");
+      return Qnil;
+    }
+
+    rb_iv_set(self, "@params", kw_values[1]);
+    rb_iv_set(self, "@has_evaluated", Qfalse);
+
+    RB_GC_GUARD(filename);
+    return Qnil;
+  };
+
+  static VALUE _llama_context_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "lora_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
+      rb_raise(rb_eArgError, "base_model_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const char* lora_path = StringValueCStr(kw_values[0]);
+    const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
+    const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx != NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
+      return Qnil;
+    }
+
+    if (llama_apply_lora_from_file(ptr->ctx, lora_path, base_model_path, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
+      return Qnil;
+    }
+    return Qnil;
+  };
 };

 const rb_data_type_t RbLLaMAContext::llama_context_type = {

@@ -519,6 +625,14 @@ static VALUE rb_llama_print_system_info(VALUE self) {
   return rb_utf8_str_new_cstr(result);
 }

+static VALUE rb_llama_mmap_supported(VALUE self) {
+  return llama_mmap_supported() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_mlock_supported(VALUE self) {
+  return llama_mlock_supported() ? Qtrue : Qfalse;
+}
+
 extern "C" void Init_llama_cpp(void) {
   rb_mLLaMACpp = rb_define_module("LLaMACpp");
   RbLLaMAContext::define_class(rb_mLLaMACpp);

@@ -527,6 +641,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);

   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
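Taken together, the NULL-context guards and the new `free`/`load` bindings above give the Ruby API an explicit lifecycle. A minimal sketch of the two-step construction, using a placeholder model path (the one-step `Context.new(model_path: ..., params: ...)` form from the README still applies):

```ruby
require 'llama_cpp'

# 0.0.4 added the parameterless constructor plus load/free; the guards added
# in 0.0.5 raise a RuntimeError ("LLaMA context is not initialized") if
# evaluation, tokenization, or sampling is attempted before a model is loaded.
context = LLaMACpp::Context.new

params = LLaMACpp::ContextParams.new
context.load(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)

context.free
```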