llama_cpp 0.0.5 → 0.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 2df0c858faac117b7317683fb7b9a52fc0eb4f7329f728ac6a209085af487142
- data.tar.gz: 6b5c5d5d5d4e9020b92c7d76c12086fd77089ecc9c2181fb9d8157df5267da96
+ metadata.gz: 00d1390f6d91449d27050952aa9dd80572fb18d6809dc64098e2e4fce79bc91e
+ data.tar.gz: 49f4422e5c8903bb83fbd69a7901ff8ed0cbfae248d6652bb93b43feac331821
  SHA512:
- metadata.gz: 8e9d3ccdb8cdc9d4cb7b60f32a709c874953c357fdaccc057502e5761efdec62a0fc0b39929448203ffc4210dbf0ca2f6019dc13f88cf0db84b754f44fd77bea
- data.tar.gz: 75fc1d6674c8d509ae0557308277d6d3d7e05f5a6fbea512c2472c46bea1de6e2541a67ec3dda43f874d7f64e6981b720aa1c722d3ec7ea3b96ae9084a4d201b
+ metadata.gz: 0633f8565d940b618eb1637fa095a7bc940618100698986971274ffb943fca9fab32eb04e5a0e22bd930f18f998b7f4ca59fb0e4f7ac6210efdf7d4c44e8cc9c
+ data.tar.gz: 2e79f037ec38c415cbe9a64485b986c64e3de73c26580b0e6e8be577cec38abcc4eb9506de2bd1d0b84646f4f44b7422aafe83c0ee867028c2a223f345a294ce
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
  ## [Unreleased]

+ ## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+
+ - Bump bundled llama.cpp from master-315a95a to master-12b5900.
+ - Add model file type constants.
+ - Add `model_quantize` module function to LLaMACpp.
+ - Add cublas config option:
+ ```
+ $ gem install llama_cpp -- --with-cublas
+ ```
+
  ## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20

  - Bump bundled llama.cpp from master-c85e03d to master-315a95a.
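Taken together, the 0.0.6 entries add a quantization API and the file type constants it consumes. A minimal usage sketch, assuming a local f16 GGML model (the file paths are placeholders; the keyword names match the `rb_llama_model_quantize` binding shown further down):

```
require 'llama_cpp'

# Quantize an f16 GGML model down to Q4_0 with the new module function.
# input_path/output_path are placeholder file names; ftype takes one of the
# new LLAMA_FTYPE_* constants; n_threads is optional and defaults to 1.
LLaMACpp.model_quantize(
  input_path: 'models/7B/ggml-model-f16.bin',
  output_path: 'models/7B/ggml-model-q4_0.bin',
  ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
  n_threads: 4
)
```

On a CUDA machine, installing with `gem install llama_cpp -- --with-cublas` enables the cuBLAS build described in the extconf.rb changes that follow.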
data/ext/llama_cpp/extconf.rb CHANGED
@@ -23,8 +23,15 @@ if with_config('openblas')
  end

  if with_config('accelerate')
+   abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
+
    $CFLAGS << ' -DGGML_USE_ACCELERATE'
-   $LDFLAGS << ' -framework Accelerate'
+ end
+
+ if with_config('cublas')
+   $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+   $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+   $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
  end

  UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -49,3 +56,10 @@ end
  # rubocop:enable Layout/LineLength

  create_makefile('llama_cpp/llama_cpp')
+
+ if with_config('cublas')
+   File.open('Makefile', 'a') do |f|
+     f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+     f.puts "\tnvcc -arch=native -c -o $@ $<"
+   end
+ end
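Note that the cuBLAS branch hardcodes the CUDA paths to /usr/local/cuda and leaves nvcc discovery to make, which only fails when the appended ggml-cuda.o rule runs. A sketch of an optional configure-time guard using mkmf's find_executable (hypothetical, not part of this diff):

```
require 'mkmf'

# Hypothetical guard (not in the gem's extconf.rb): abort at configure time
# when nvcc is missing, instead of failing later during make.
if with_config('cublas')
  abort 'nvcc is not found.' unless find_executable('nvcc')
end
```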
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -612,6 +612,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {

  // module functions

+ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+   VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+
+   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+     rb_raise(rb_eArgError, "input_path must be a string");
+     return Qnil;
+   }
+   if (!RB_TYPE_P(kw_values[1], T_STRING)) {
+     rb_raise(rb_eArgError, "output_path must be a string");
+     return Qnil;
+   }
+   if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+     rb_raise(rb_eArgError, "ftype must be an integer");
+     return Qnil;
+   }
+   if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+     rb_raise(rb_eArgError, "n_threads must be an integer");
+     return Qnil;
+   }
+
+   const char* input_path = StringValueCStr(kw_values[0]);
+   const char* output_path = StringValueCStr(kw_values[1]);
+   const int ftype = NUM2INT(kw_values[2]);
+   const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+   if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+     rb_raise(rb_eRuntimeError, "Failed to quantize model");
+     return Qnil;
+   }
+
+   return Qnil;
+ }
+
  static VALUE rb_llama_token_bos(VALUE self) {
    return INT2NUM(llama_token_bos());
  }
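From Ruby, the argument checks above surface as ArgumentError, and a non-zero return from llama_model_quantize surfaces as RuntimeError. A hedged calling sketch with error handling (file names are placeholders; n_threads is omitted and defaults to 1):

```
require 'llama_cpp'

begin
  # Placeholder paths; passing a non-String path or a non-Integer ftype raises
  # ArgumentError before llama_model_quantize is called.
  LLaMACpp.model_quantize(
    input_path: 'ggml-model-f16.bin',
    output_path: 'ggml-model-q4_2.bin',
    ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_2
  )
rescue ArgumentError, RuntimeError => e
  warn "model_quantize failed: #{e.message}"
end
```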
@@ -638,12 +675,21 @@ extern "C" void Init_llama_cpp(void) {
  RbLLaMAContext::define_class(rb_mLLaMACpp);
  RbLLaMAContextParams::define_class(rb_mLLaMACpp);

+ rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
  rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);

+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
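The file type constants registered here become plain Integer constants on the LLaMACpp module, so any of them can be passed directly as ftype. An illustrative way to list them from irb:

```
require 'llama_cpp'

# The LLAMA_FTYPE_* constants are plain Integers on the LLaMACpp module.
LLaMACpp.constants.map(&:to_s).grep(/\ALLAMA_FTYPE_/).sort.each do |name|
  puts format('%-32s %d', name, LLaMACpp.const_get(name))
end
```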
data/ext/llama_cpp/src/ggml-cuda.h ADDED
@@ -0,0 +1,12 @@
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+ #ifdef __cplusplus
+ }
+ #endif