llama_cpp 0.0.3 → 0.0.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 3ce894c9b013134688dffb18229c6f18073cdc8aceafa7d8a519803ae8ffc8a4
-   data.tar.gz: b9a09f3b7217c120d0eae5e89ecf15a4ccbedcdef92db7d5c4508d03ecd65d3c
+   metadata.gz: 2df0c858faac117b7317683fb7b9a52fc0eb4f7329f728ac6a209085af487142
+   data.tar.gz: 6b5c5d5d5d4e9020b92c7d76c12086fd77089ecc9c2181fb9d8157df5267da96
  SHA512:
-   metadata.gz: a979c8a488ec410f214873664288f618af9363d60b6ef6b3ef44de9bd7486bd223b8b38704eab09c1cec1f210c55e5d08ba03af8d6ddc87c10d8836da983c1de
-   data.tar.gz: 47228be684c3ce577b066b2255482c42f6979c4cce5852c22e85a9f0b66bdcaea58d667c56f2eefef6cfc121822a2761406cd2911abccd754c07e8568bb8550e
+   metadata.gz: 8e9d3ccdb8cdc9d4cb7b60f32a709c874953c357fdaccc057502e5761efdec62a0fc0b39929448203ffc4210dbf0ca2f6019dc13f88cf0db84b754f44fd77bea
+   data.tar.gz: 75fc1d6674c8d509ae0557308277d6d3d7e05f5a6fbea512c2472c46bea1de6e2541a67ec3dda43f874d7f64e6981b720aa1c722d3ec7ea3b96ae9084a4d201b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,41 @@
  ## [Unreleased]

+ ## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
+
+ - Bump bundled llama.cpp from master-c85e03d to master-315a95a.
+ - Add `apply_lora_from_file` method to LLaMACpp::Context.
+ - Add `mlock_supported?` module function to LLaMACpp.
+ - Add `mmap_supported?` module function to LLaMACpp.
+ - Fix to not destroy original prompt in `LLaMACpp.generate` module function.
+ - Add check for context initialization.
+ - Add blas config options:
+   ```
+   $ gem install llama_cpp -- --with-openblas
+   ```
+   macOS:
+   ```
+   $ gem install llama_cpp -- --with-openblas --with-opt-dir=/opt/homebrew/opt/openblas
+   $ gem install llama_cpp -- --with-accelerate
+   ```
+
+ ## [[0.0.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.3...v0.0.4)] - 2023-04-15
+
+ - Bump bundled llama.cpp from master-698f7b5 to master-c85e03d.
+ - Add parameterless constructor to LLaMACpp::Context.
+ - Add free and load methods to LLaMACpp::Context.
+   ```ruby
+   require 'llama_cpp'
+
+   context = LLaMACpp::Context.new
+
+   params = LLaMACpp::ContextParams.new
+   context.load(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
+
+   # ...
+
+   context.free
+   ```
+
  ## [[0.0.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.2...v0.0.3)] - 2023-04-08

  - Bump bundled llama.cpp from master-5b70e7d to master-698f7b5.
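Taken together, the 0.0.5 entries above add feature probes and LoRA support on top of the existing generation API. Below is a minimal sketch of how the new probes fit into the usage shown in the README; the model path is a placeholder and the seed/prompt values are arbitrary, so adapt them to your setup.

```ruby
require 'llama_cpp'

# Feature probes added in 0.0.5: report whether the bundled llama.cpp
# build supports mmap / mlock on this platform.
puts "mmap supported?:  #{LLaMACpp.mmap_supported?}"
puts "mlock supported?: #{LLaMACpp.mlock_supported?}"

params = LLaMACpp::ContextParams.new
params.seed = 12

# Placeholder path; point this at your own quantized model file.
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)
```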
data/README.md CHANGED
@@ -20,17 +20,18 @@ If bundler is not being used to manage dependencies, install the gem by executin

  ## Usage

- Prepare a quantized model file by refering to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage).
+ Prepare the quantized model by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage) or
+ download a quantized model, for example [ggml-vicuna-7b-4bit](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5541351), from Hugging Face.

  ```ruby
  require 'llama_cpp'

  params = LLaMACpp::ContextParams.new
- params.seed = 123456
+ params.seed = 12

- context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
+ context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

- puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.')
+ puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)

  # => "There are two major cities in Japan, Tokyo and Osaka, which have about 30 million populations."
  ```

@@ -10,4 +10,42 @@ $CXXFLAGS << ' -std=c++11'
  $INCFLAGS << ' -I$(srcdir)/src'
  $VPATH << '$(srcdir)/src'

+ if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>', '-pthread')
+   $CFLAGS << ' -pthread'
+   $CXXFLAGS << ' -pthread'
+ end
+
+ if with_config('openblas')
+   abort 'libopenblas is not found.' unless have_library('openblas')
+   abort 'cblas.h is not found.' unless have_header('cblas.h')
+
+   $CFLAGS << ' -DGGML_USE_OPENBLAS'
+ end
+
+ if with_config('accelerate')
+   $CFLAGS << ' -DGGML_USE_ACCELERATE'
+   $LDFLAGS << ' -framework Accelerate'
+ end
+
+ UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
+
+ # rubocop:disable Layout/LineLength
+ if UNAME_M.match?(/x86_64|i686/) && try_compile('#include <stdio.h>', '-march=native -mtune=native')
+   $CFLAGS << ' -march=native -mtune=native'
+   $CXXFLAGS << ' -march=native -mtune=native'
+ elsif UNAME_M.match?(/aarch64/) && try_compile('#include <stdio.h>', '-mcpu=native')
+   $CFLAGS << ' -mcpu=native'
+   $CXXFLAGS << ' -mcpu=native'
+ elsif UNAME_M.match?(/armv6/) && try_compile('#include <stdio.h>', '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access')
+   $CFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access'
+   $CXXFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access'
+ elsif UNAME_M.match?(/armv7/) && try_compile('#include <stdio.h>', '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations')
+   $CFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations'
+   $CXXFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations'
+ elsif UNAME_M.match?(/armv8/) && try_compile('#include <stdio.h>', '-mfp16-format=ieee -mno-unaligned-access')
+   $CFLAGS << ' -mfp16-format=ieee -mno-unaligned-access'
+   $CXXFLAGS << ' -mfp16-format=ieee -mno-unaligned-access'
+ end
+ # rubocop:enable Layout/LineLength
+
  create_makefile('llama_cpp/llama_cpp')
@@ -226,6 +226,9 @@ public:
    rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
    rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
    rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
+   rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
+   rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
+   rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
  };

  private:
@@ -236,7 +239,13 @@ private:
    ID kw_table[2] = { rb_intern("model_path"), rb_intern("params") };
    VALUE kw_values[2] = { Qundef, Qundef };
    rb_scan_args(argc, argv, ":", &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+   rb_get_kwargs(kw_args, kw_table, 0, 2, kw_values);
+
+   if (kw_values[0] == Qundef && kw_values[1] == Qundef) {
+     rb_iv_set(self, "@params", Qnil);
+     rb_iv_set(self, "@has_evaluated", Qfalse);
+     return Qnil;
+   }

    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
      rb_raise(rb_eArgError, "model_path must be a string");
@@ -260,7 +269,7 @@ private:
    rb_iv_set(self, "@has_evaluated", Qfalse);

    RB_GC_GUARD(filename);
-   return self;
+   return Qnil;
  };

  static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
@@ -303,6 +312,10 @@ private:
    const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

    LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
    if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
      rb_raise(rb_eRuntimeError, "Failed to evaluate");
      return Qnil;
@@ -341,6 +354,10 @@ private:

    std::vector<llama_token> tokens(n_max_tokens);
    LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
    const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
    if (n < 0) {
      rb_raise(rb_eRuntimeError, "Failed to tokenize");
@@ -441,6 +458,10 @@ private:
    }

    LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
    llama_token token = llama_sample_top_p_top_k(ptr->ctx, last_n_tokens_data.data(), last_n_tokens_size, top_k, top_p, temp, penalty);

    return INT2NUM(token);
@@ -492,6 +513,91 @@ private:
    llama_reset_timings(ptr->ctx);
    return Qnil;
  };
+
+ static VALUE _llama_context_free(VALUE self) {
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx != NULL) {
+     llama_free(ptr->ctx);
+     ptr->ctx = NULL;
+     rb_iv_set(self, "@params", Qnil);
+     rb_iv_set(self, "@has_evaluated", Qfalse);
+   }
+   return Qnil;
+ }
+
+ static VALUE _llama_context_load(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[2] = { rb_intern("model_path"), rb_intern("params") };
+   VALUE kw_values[2] = { Qundef, Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+     rb_raise(rb_eArgError, "model_path must be a string");
+     return Qnil;
+   }
+   if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
+     rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
+     return Qnil;
+   }
+
+   LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+   if (ctx_ptr->ctx != NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
+     return Qnil;
+   }
+
+   VALUE filename = kw_values[0];
+   LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
+   ctx_ptr->ctx = llama_init_from_file(StringValueCStr(filename), prms_ptr->params);
+   if (ctx_ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "Failed to initialize LLaMA context");
+     return Qnil;
+   }
+
+   rb_iv_set(self, "@params", kw_values[1]);
+   rb_iv_set(self, "@has_evaluated", Qfalse);
+
+   RB_GC_GUARD(filename);
+   return Qnil;
+ };
+
+ static VALUE _llama_context_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[3] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads") };
+   VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+     rb_raise(rb_eArgError, "lora_path must be a string");
+     return Qnil;
+   }
+   if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
+     rb_raise(rb_eArgError, "base_model_path must be a string");
+     return Qnil;
+   }
+   if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+     rb_raise(rb_eArgError, "n_threads must be an integer");
+     return Qnil;
+   }
+
+   const char* lora_path = StringValueCStr(kw_values[0]);
+   const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
+   const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
+
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx != NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
+     return Qnil;
+   }
+
+   if (llama_apply_lora_from_file(ptr->ctx, lora_path, base_model_path, n_threads) != 0) {
+     rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
+     return Qnil;
+   }
+   return Qnil;
+ };
  };

  const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -519,6 +625,14 @@ static VALUE rb_llama_print_system_info(VALUE self) {
    return rb_utf8_str_new_cstr(result);
  }

+ static VALUE rb_llama_mmap_supported(VALUE self) {
+   return llama_mmap_supported() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_mlock_supported(VALUE self) {
+   return llama_mlock_supported() ? Qtrue : Qfalse;
+ }
+
  extern "C" void Init_llama_cpp(void) {
    rb_mLLaMACpp = rb_define_module("LLaMACpp");
    RbLLaMAContext::define_class(rb_mLLaMACpp);
@@ -527,6 +641,8 @@ extern "C" void Init_llama_cpp(void) {
    rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
    rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
    rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
+   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);

    rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
    std::stringstream ss_magic;
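
The `free` and `load` bindings registered above make the native context lifecycle explicit on the Ruby side. The following is a minimal sketch of that lifecycle, assuming a quantized model at a placeholder path; per the guards in the bindings, `load` raises "LLaMA context is already loaded" unless `free` has been called first.

```ruby
require 'llama_cpp'

# Parameterless constructor (added in 0.0.4): no native context is created yet.
context = LLaMACpp::Context.new

params = LLaMACpp::ContextParams.new
# Placeholder path; substitute your own quantized model file.
context.load(model_path: '/path/to/quantized-model.bin', params: params)

puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)

# free releases the native llama_context; after that, load may be called again.
context.free
context.load(model_path: '/path/to/quantized-model.bin', params: params)
context.free
```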