llama_cpp 0.0.4 → 0.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 0e659b4cc27e9ab45b524ec3d341892b72cebdf84ccad823f24ff7e472f2ffa8
- data.tar.gz: 790f4d2f6dc9ddf211701f6014ae91ca19e0492efd7c64eb881e66729f929544
+ metadata.gz: 00d1390f6d91449d27050952aa9dd80572fb18d6809dc64098e2e4fce79bc91e
+ data.tar.gz: 49f4422e5c8903bb83fbd69a7901ff8ed0cbfae248d6652bb93b43feac331821
  SHA512:
- metadata.gz: 54eb4dd6c4ca9affc7b6f2da1c9dc719f60c8dc3841645cf47b8f0310ff31ad16a5bc841051663f03e962bc7424f56f1d7a1c5c5eabd03e3f5e7b706467bb0ba
- data.tar.gz: b2a16c6be3d7f117fabea5cc837b9306b0768d9ad99104a6fa2752932d1e1a034312983a87ebfe9e3ccb1bf83257d5ce40520e049a40291c64fb2fab8663882a
+ metadata.gz: 0633f8565d940b618eb1637fa095a7bc940618100698986971274ffb943fca9fab32eb04e5a0e22bd930f18f998b7f4ca59fb0e4f7ac6210efdf7d4c44e8cc9c
+ data.tar.gz: 2e79f037ec38c415cbe9a64485b986c64e3de73c26580b0e6e8be577cec38abcc4eb9506de2bd1d0b84646f4f44b7422aafe83c0ee867028c2a223f345a294ce
data/CHANGELOG.md CHANGED
@@ -1,5 +1,33 @@
  ## [Unreleased]

+ ## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+
+ - Bump bundled llama.cpp from master-315a95a to master-12b5900.
+ - Add model file type constants.
+ - Add `model_quantize` module function to LLaMACpp.
+ - Add cublas config option:
+   ```
+   $ gem install llama_cpp -- --with-cublas
+   ```
+
+ ## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
+
+ - Bump bundled llama.cpp from master-c85e03d to master-315a95a.
+ - Add `apply_lora_from_file` method to LLaMACpp::Context.
+ - Add `mlock_supported?` module function to LLaMACpp.
+ - Add `mmap_supported?` module function to LLaMACpp.
+ - Fix `LLaMACpp.generate` module function so that it does not destroy the original prompt.
+ - Add a check for context initialization.
+ - Add blas config options:
+   ```
+   $ gem install llama_cpp -- --with-openblas
+   ```
+   On macOS:
+   ```
+   $ gem install llama_cpp -- --with-openblas --with-opt-dir=/opt/homebrew/opt/openblas
+   $ gem install llama_cpp -- --with-accelerate
+   ```
+
  ## [[0.0.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.3...v0.0.4)] - 2023-04-15

  - Bump bundled llama.cpp from master-698f7b5 to master-c85e03d.
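To make the 0.0.6 additions concrete, here is a minimal usage sketch of the new `model_quantize` module function. The file paths are placeholders, and the keyword arguments (`input_path`, `output_path`, `ftype`, optional `n_threads`) follow the binding shown further down in this diff.

```ruby
require 'llama_cpp'

# Quantize an f16 GGML model down to 4-bit (q4_0).
# Paths are hypothetical; ftype takes one of the new LLAMA_FTYPE_* constants.
LLaMACpp.model_quantize(
  input_path: '/path/to/ggml-model-f16.bin',
  output_path: '/path/to/ggml-model-q4_0.bin',
  ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
  n_threads: 4
)
```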
data/README.md CHANGED
@@ -20,7 +20,8 @@ If bundler is not being used to manage dependencies, install the gem by executin

  ## Usage

- Prepare a quantized model file by refering to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage).
+ Prepare a quantized model by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage) or
+ download a quantized model, for example [ggml-vicuna-7b-4bit](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5541351), from Hugging Face.

  ```ruby
  require 'llama_cpp'
@@ -28,7 +29,7 @@ require 'llama_cpp'
  params = LLaMACpp::ContextParams.new
  params.seed = 12

- context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
+ context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

  puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)
  # => "There are two major cities in Japan, Tokyo and Osaka, which have about 30 million populations."
data/ext/llama_cpp/extconf.rb CHANGED
@@ -15,6 +15,25 @@ if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>',
    $CXXFLAGS << ' -pthread'
  end

+ if with_config('openblas')
+   abort 'libopenblas is not found.' unless have_library('openblas')
+   abort 'cblas.h is not found.' unless have_header('cblas.h')
+
+   $CFLAGS << ' -DGGML_USE_OPENBLAS'
+ end
+
+ if with_config('accelerate')
+   abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
+
+   $CFLAGS << ' -DGGML_USE_ACCELERATE'
+ end
+
+ if with_config('cublas')
+   $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+   $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+   $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+ end
+
  UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']

  # rubocop:disable Layout/LineLength
@@ -37,3 +56,10 @@ end
  # rubocop:enable Layout/LineLength

  create_makefile('llama_cpp/llama_cpp')
+
+ if with_config('cublas')
+   File.open('Makefile', 'a') do |f|
+     f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+     f.puts "\tnvcc -arch=native -c -o $@ $<"
+   end
+ end
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -228,6 +228,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
  rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
  rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
+ rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
  };

  private:
@@ -311,6 +312,10 @@ private:
  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

  LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+   rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+   return Qnil;
+ }
  if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
  rb_raise(rb_eRuntimeError, "Failed to evaluate");
  return Qnil;
@@ -349,6 +354,10 @@ private:

  std::vector<llama_token> tokens(n_max_tokens);
  LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+   rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+   return Qnil;
+ }
  const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
  if (n < 0) {
  rb_raise(rb_eRuntimeError, "Failed to tokenize");
@@ -449,6 +458,10 @@ private:
  }

  LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+   rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+   return Qnil;
+ }
  llama_token token = llama_sample_top_p_top_k(ptr->ctx, last_n_tokens_data.data(), last_n_tokens_size, top_k, top_p, temp, penalty);

  return INT2NUM(token);
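The guards added in the hunks above mean that calling into a context whose underlying pointer is NULL (for example after `free`, or if initialization failed) now raises a `RuntimeError` instead of crashing the process. A minimal sketch of that behavior, assuming `free` releases the underlying context and that `tokenize` accepts `text:` and `n_max_tokens:` keywords in this gem version:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
context.free # release the underlying llama_context

begin
  # Any call that needs the native context now fails fast.
  context.tokenize(text: 'Hello', n_max_tokens: 8)
rescue RuntimeError => e
  puts e.message # => "LLaMA context is not initialized"
end
```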
@@ -548,6 +561,43 @@ private:
  RB_GC_GUARD(filename);
  return Qnil;
  };
+
+ static VALUE _llama_context_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[3] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads") };
+   VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+     rb_raise(rb_eArgError, "lora_path must be a string");
+     return Qnil;
+   }
+   if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
+     rb_raise(rb_eArgError, "base_model_path must be a string");
+     return Qnil;
+   }
+   if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+     rb_raise(rb_eArgError, "n_threads must be an integer");
+     return Qnil;
+   }
+
+   const char* lora_path = StringValueCStr(kw_values[0]);
+   const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
+   const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
+
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
+
+   if (llama_apply_lora_from_file(ptr->ctx, lora_path, base_model_path, n_threads) != 0) {
+     rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
+     return Qnil;
+   }
+   return Qnil;
+ };
  };

  const rb_data_type_t RbLLaMAContext::llama_context_type = {
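A short usage sketch for the new `apply_lora_from_file` binding defined above; the model and adapter paths are placeholders, while the keyword names (`lora_path`, optional `base_model_path` and `n_threads`) come from the kwargs table in the diff.

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

# Apply a LoRA adapter on top of the loaded model.
# base_model_path may point at a higher-precision base; n_threads defaults to 1.
context.apply_lora_from_file(lora_path: '/path/to/lora-adapter.bin', n_threads: 4)
```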
@@ -562,6 +612,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {

  // module functions

+ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+   VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+
+   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+     rb_raise(rb_eArgError, "input_path must be a string");
+     return Qnil;
+   }
+   if (!RB_TYPE_P(kw_values[1], T_STRING)) {
+     rb_raise(rb_eArgError, "output_path must be a string");
+     return Qnil;
+   }
+   if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+     rb_raise(rb_eArgError, "ftype must be an integer");
+     return Qnil;
+   }
+   if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+     rb_raise(rb_eArgError, "n_threads must be an integer");
+     return Qnil;
+   }
+
+   const char* input_path = StringValueCStr(kw_values[0]);
+   const char* output_path = StringValueCStr(kw_values[1]);
+   const int ftype = NUM2INT(kw_values[2]);
+   const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+   if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+     rb_raise(rb_eRuntimeError, "Failed to quantize model");
+     return Qnil;
+   }
+
+   return Qnil;
+ }
+
  static VALUE rb_llama_token_bos(VALUE self) {
  return INT2NUM(llama_token_bos());
  }
@@ -575,14 +662,33 @@ static VALUE rb_llama_print_system_info(VALUE self) {
  return rb_utf8_str_new_cstr(result);
  }

+ static VALUE rb_llama_mmap_supported(VALUE self) {
+   return llama_mmap_supported() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_mlock_supported(VALUE self) {
+   return llama_mlock_supported() ? Qtrue : Qfalse;
+ }
+
  extern "C" void Init_llama_cpp(void) {
  rb_mLLaMACpp = rb_define_module("LLaMACpp");
  RbLLaMAContext::define_class(rb_mLLaMACpp);
  RbLLaMAContextParams::define_class(rb_mLLaMACpp);

+ rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
  rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+ rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
+ rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
  std::stringstream ss_magic;
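The newly registered module functions and constants can be checked from Ruby as sketched below; the printed values are platform dependent and only indicative.

```ruby
require 'llama_cpp'

# Capability checks added in 0.0.5.
puts LLaMACpp.mmap_supported?  # e.g. true
puts LLaMACpp.mlock_supported? # platform dependent

# Model file type constants added in 0.0.6; usable as the ftype
# argument of LLaMACpp.model_quantize.
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0
```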
data/ext/llama_cpp/src/ggml-cuda.h ADDED
@@ -0,0 +1,12 @@
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+ #ifdef __cplusplus
+ }
+ #endif