llama_cpp 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0e659b4cc27e9ab45b524ec3d341892b72cebdf84ccad823f24ff7e472f2ffa8
-  data.tar.gz: 790f4d2f6dc9ddf211701f6014ae91ca19e0492efd7c64eb881e66729f929544
+  metadata.gz: 00d1390f6d91449d27050952aa9dd80572fb18d6809dc64098e2e4fce79bc91e
+  data.tar.gz: 49f4422e5c8903bb83fbd69a7901ff8ed0cbfae248d6652bb93b43feac331821
 SHA512:
-  metadata.gz: 54eb4dd6c4ca9affc7b6f2da1c9dc719f60c8dc3841645cf47b8f0310ff31ad16a5bc841051663f03e962bc7424f56f1d7a1c5c5eabd03e3f5e7b706467bb0ba
-  data.tar.gz: b2a16c6be3d7f117fabea5cc837b9306b0768d9ad99104a6fa2752932d1e1a034312983a87ebfe9e3ccb1bf83257d5ce40520e049a40291c64fb2fab8663882a
+  metadata.gz: 0633f8565d940b618eb1637fa095a7bc940618100698986971274ffb943fca9fab32eb04e5a0e22bd930f18f998b7f4ca59fb0e4f7ac6210efdf7d4c44e8cc9c
+  data.tar.gz: 2e79f037ec38c415cbe9a64485b986c64e3de73c26580b0e6e8be577cec38abcc4eb9506de2bd1d0b84646f4f44b7422aafe83c0ee867028c2a223f345a294ce
data/CHANGELOG.md CHANGED
@@ -1,5 +1,33 @@
 ## [Unreleased]
 
+## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+
+- Bump bundled llama.cpp from master-315a95a to master-12b5900.
+- Add model file type constants.
+- Add `model_quantize` module function to LLaMACpp.
+- Add cublas config option:
+  ```
+  $ gem install llama_cpp -- --with-cublas
+  ```
+
+## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
+
+- Bump bundled llama.cpp from master-c85e03d to master-315a95a.
+- Add `apply_lora_from_file` method to LLaMACpp::Context.
+- Add `mlock_supported?` module function to LLaMACpp.
+- Add `mmap_supported?` module function to LLaMACpp.
+- Fix `LLaMACpp.generate` module function so that it does not destroy the original prompt.
+- Add check for context initialization.
+- Add blas config options:
+  ```
+  $ gem install llama_cpp -- --with-openblas
+  ```
+  macOS:
+  ```
+  $ gem install llama_cpp -- --with-openblas --with-opt-dir=/opt/homebrew/opt/openblas
+  $ gem install llama_cpp -- --with-accelerate
+  ```
+
 ## [[0.0.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.3...v0.0.4)] - 2023-04-15
 
 - Bump bundled llama.cpp from master-698f7b5 to master-c85e03d.
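For context, the 0.0.6 additions listed above can be exercised from Ruby roughly as follows. This is a minimal sketch based on the changelog entries and the bindings shown further down; the model paths are placeholders.

```ruby
require 'llama_cpp'

# Quantize an f16 GGML model to 4-bit using the new module function and
# one of the new file type constants (paths are placeholders).
LLaMACpp.model_quantize(input_path: '/path/to/ggml-model-f16.bin',
                        output_path: '/path/to/ggml-model-q4_0.bin',
                        ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0)

# The new capability checks report what the bundled llama.cpp supports.
puts LLaMACpp.mmap_supported?
puts LLaMACpp.mlock_supported?
```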
data/README.md CHANGED
@@ -20,7 +20,8 @@ If bundler is not being used to manage dependencies, install the gem by executin
 
 ## Usage
 
-Prepare a quantized model file by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage).
+Prepare a quantized model by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage) or
+download a quantized model, for example [ggml-vicuna-7b-4bit](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5541351), from Hugging Face.
 
 ```ruby
 require 'llama_cpp'
@@ -28,7 +29,7 @@ require 'llama_cpp'
 params = LLaMACpp::ContextParams.new
 params.seed = 12
 
-context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
+context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
 
 puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)
 # => "There are two major cities in Japan, Tokyo and Osaka, which have about 30 million populations."
data/ext/llama_cpp/extconf.rb CHANGED
@@ -15,6 +15,25 @@ if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>',
   $CXXFLAGS << ' -pthread'
 end
 
+if with_config('openblas')
+  abort 'libopenblas is not found.' unless have_library('openblas')
+  abort 'cblas.h is not found.' unless have_header('cblas.h')
+
+  $CFLAGS << ' -DGGML_USE_OPENBLAS'
+end
+
+if with_config('accelerate')
+  abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
+
+  $CFLAGS << ' -DGGML_USE_ACCELERATE'
+end
+
+if with_config('cublas')
+  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+end
+
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
 
 # rubocop:disable Layout/LineLength
@@ -37,3 +56,10 @@ end
 # rubocop:enable Layout/LineLength
 
 create_makefile('llama_cpp/llama_cpp')
+
+if with_config('cublas')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+    f.puts "\tnvcc -arch=native -c -o $@ $<"
+  end
+end
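After installing with one of these flags, a quick way to confirm that a BLAS or cuBLAS backend was actually compiled in is the system info string that the gem already exposes. A minimal sketch; the exact contents of the string come from the bundled llama.cpp, not from this gem:

```ruby
require 'llama_cpp'

# Prints the feature flags reported by the bundled llama.cpp,
# e.g. a "BLAS = 1" entry when a BLAS backend was enabled at build time.
puts LLaMACpp.print_system_info
```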
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -228,6 +228,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
     rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
     rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
   };
 
 private:
@@ -311,6 +312,10 @@ private:
     const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
 
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
       rb_raise(rb_eRuntimeError, "Failed to evaluate");
       return Qnil;
@@ -349,6 +354,10 @@ private:
 
     std::vector<llama_token> tokens(n_max_tokens);
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
     if (n < 0) {
       rb_raise(rb_eRuntimeError, "Failed to tokenize");
@@ -449,6 +458,10 @@ private:
     }
 
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     llama_token token = llama_sample_top_p_top_k(ptr->ctx, last_n_tokens_data.data(), last_n_tokens_size, top_k, top_p, temp, penalty);
 
     return INT2NUM(token);
@@ -548,6 +561,43 @@ private:
     RB_GC_GUARD(filename);
     return Qnil;
   };
+
+  static VALUE _llama_context_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "lora_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
+      rb_raise(rb_eArgError, "base_model_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const char* lora_path = StringValueCStr(kw_values[0]);
+    const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
+    const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    if (llama_apply_lora_from_file(ptr->ctx, lora_path, base_model_path, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
+      return Qnil;
+    }
+    return Qnil;
+  };
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -562,6 +612,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
+static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+  VALUE kw_args = Qnil;
+  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+  rb_scan_args(argc, argv, ":", &kw_args);
+  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+
+  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+    rb_raise(rb_eArgError, "input_path must be a string");
+    return Qnil;
+  }
+  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
+    rb_raise(rb_eArgError, "output_path must be a string");
+    return Qnil;
+  }
+  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+    rb_raise(rb_eArgError, "ftype must be an integer");
+    return Qnil;
+  }
+  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+    rb_raise(rb_eArgError, "n_threads must be an integer");
+    return Qnil;
+  }
+
+  const char* input_path = StringValueCStr(kw_values[0]);
+  const char* output_path = StringValueCStr(kw_values[1]);
+  const int ftype = NUM2INT(kw_values[2]);
+  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+    rb_raise(rb_eRuntimeError, "Failed to quantize model");
+    return Qnil;
+  }
+
+  return Qnil;
+}
+
 static VALUE rb_llama_token_bos(VALUE self) {
   return INT2NUM(llama_token_bos());
 }
@@ -575,14 +662,33 @@ static VALUE rb_llama_print_system_info(VALUE self) {
   return rb_utf8_str_new_cstr(result);
 }
 
+static VALUE rb_llama_mmap_supported(VALUE self) {
+  return llama_mmap_supported() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_mlock_supported(VALUE self) {
+  return llama_mlock_supported() ? Qtrue : Qfalse;
+}
+
 extern "C" void Init_llama_cpp(void) {
   rb_mLLaMACpp = rb_define_module("LLaMACpp");
   RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
 
+  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
+
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
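The new `apply_lora_from_file` binding above takes keyword arguments that mirror `llama_apply_lora_from_file`. A minimal usage sketch, with placeholder file paths; `base_model_path` and `n_threads` are optional, as in the argument parsing above:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

# Apply a LoRA adapter to the loaded context; raises RuntimeError on failure.
context.apply_lora_from_file(lora_path: '/path/to/lora-adapter.bin', n_threads: 4)
```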
@@ -0,0 +1,12 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif