llama_cpp 0.0.5 → 0.0.6

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2df0c858faac117b7317683fb7b9a52fc0eb4f7329f728ac6a209085af487142
-  data.tar.gz: 6b5c5d5d5d4e9020b92c7d76c12086fd77089ecc9c2181fb9d8157df5267da96
+  metadata.gz: 00d1390f6d91449d27050952aa9dd80572fb18d6809dc64098e2e4fce79bc91e
+  data.tar.gz: 49f4422e5c8903bb83fbd69a7901ff8ed0cbfae248d6652bb93b43feac331821
 SHA512:
-  metadata.gz: 8e9d3ccdb8cdc9d4cb7b60f32a709c874953c357fdaccc057502e5761efdec62a0fc0b39929448203ffc4210dbf0ca2f6019dc13f88cf0db84b754f44fd77bea
-  data.tar.gz: 75fc1d6674c8d509ae0557308277d6d3d7e05f5a6fbea512c2472c46bea1de6e2541a67ec3dda43f874d7f64e6981b720aa1c722d3ec7ea3b96ae9084a4d201b
+  metadata.gz: 0633f8565d940b618eb1637fa095a7bc940618100698986971274ffb943fca9fab32eb04e5a0e22bd930f18f998b7f4ca59fb0e4f7ac6210efdf7d4c44e8cc9c
+  data.tar.gz: 2e79f037ec38c415cbe9a64485b986c64e3de73c26580b0e6e8be577cec38abcc4eb9506de2bd1d0b84646f4f44b7422aafe83c0ee867028c2a223f345a294ce
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
 ## [Unreleased]
 
+## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+
+- Bump bundled llama.cpp from master-315a95a to master-12b5900.
+- Add model file type constants.
+- Add `model_quantize` module function to LLaMACpp.
+- Add cublas config option:
+  ```
+  $ gem install llama_cpp -- --with-cublas
+  ```
+
 ## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
 
 - Bump bundled llama.cpp from master-c85e03d to master-315a95a.
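
The `model_quantize` function introduced in this release takes keyword arguments, as the binding code later in this diff shows. A minimal usage sketch (the model paths are hypothetical placeholders):

```ruby
require 'llama_cpp'

# Quantize an f16 GGML model down to 4-bit weights.
# input_path, output_path, and ftype are required keywords;
# n_threads is optional and defaults to 1 in the C binding.
LLaMACpp.model_quantize(
  input_path: 'models/7B/ggml-model-f16.bin',   # hypothetical path
  output_path: 'models/7B/ggml-model-q4_0.bin', # hypothetical path
  ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
  n_threads: 4
)
```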
data/ext/llama_cpp/extconf.rb CHANGED
@@ -23,8 +23,15 @@ if with_config('openblas')
 end
 
 if with_config('accelerate')
+  abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
+
   $CFLAGS << ' -DGGML_USE_ACCELERATE'
-  $LDFLAGS << ' -framework Accelerate'
+end
+
+if with_config('cublas')
+  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
 end
 
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
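
For context on the new `with_config('cublas')` branch: mkmf parses `--with-<name>` flags (passed after `--` during `gem install`) from ARGV when it is required, which is what makes the cublas block conditional. A standalone sketch, not part of the gem:

```ruby
require 'mkmf'

# Run as `ruby sketch.rb --with-cublas`: mkmf picks the flag up from
# ARGV and with_config('cublas') returns true; without the flag it
# returns nil, so the CUDA-specific $CFLAGS/$LDFLAGS additions above
# only happen on request.
puts with_config('cublas') ? 'building with cuBLAS support' : 'CPU-only build'
```

Note that the CUDA include and library paths are hardcoded to `/usr/local/cuda`, so a non-default CUDA install location would require editing extconf.rb.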
@@ -49,3 +56,10 @@ end
 # rubocop:enable Layout/LineLength
 
 create_makefile('llama_cpp/llama_cpp')
+
+if with_config('cublas')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+    f.puts "\tnvcc -arch=native -c -o $@ $<"
+  end
+end
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -612,6 +612,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
+static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+  VALUE kw_args = Qnil;
+  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+  rb_scan_args(argc, argv, ":", &kw_args);
+  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+
+  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+    rb_raise(rb_eArgError, "input_path must be a string");
+    return Qnil;
+  }
+  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
+    rb_raise(rb_eArgError, "output_path must be a string");
+    return Qnil;
+  }
+  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+    rb_raise(rb_eArgError, "ftype must be an integer");
+    return Qnil;
+  }
+  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+    rb_raise(rb_eArgError, "n_threads must be an integer");
+    return Qnil;
+  }
+
+  const char* input_path = StringValueCStr(kw_values[0]);
+  const char* output_path = StringValueCStr(kw_values[1]);
+  const int ftype = NUM2INT(kw_values[2]);
+  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+    rb_raise(rb_eRuntimeError, "Failed to quantize model");
+    return Qnil;
+  }
+
+  return Qnil;
+}
+
 static VALUE rb_llama_token_bos(VALUE self) {
   return INT2NUM(llama_token_bos());
 }
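
In the function above, `rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values)` marks the first three keywords as required and `n_threads` as optional; the type checks then raise `ArgumentError` with the messages shown in the code. A sketch of how that surfaces in Ruby (exact missing-keyword wording depends on the Ruby version):

```ruby
require 'llama_cpp'

# Omitting a required keyword is rejected by rb_get_kwargs:
LLaMACpp.model_quantize(input_path: 'in.bin', output_path: 'out.bin')
# => ArgumentError (missing keyword: ftype)

# A wrongly typed keyword is rejected by the binding's own checks:
LLaMACpp.model_quantize(input_path: 'in.bin', output_path: 'out.bin', ftype: '2')
# => ArgumentError (ftype must be an integer)
```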
@@ -638,12 +675,21 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
 
+  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
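
With the file-type constants registered on the module, callers can select a quantization target by name. A hypothetical helper (the `FTYPES` map and argument handling are illustrative, not part of the gem):

```ruby
require 'llama_cpp'

# Map short names to the newly exposed file-type constants.
FTYPES = {
  'f16'  => LLaMACpp::LLAMA_FTYPE_MOSTLY_F16,
  'q4_0' => LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
  'q4_1' => LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_1,
  'q4_2' => LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_2,
  'q4_3' => LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_3
}.freeze

input, output, type = ARGV
LLaMACpp.model_quantize(
  input_path: input,
  output_path: output,
  ftype: FTYPES.fetch(type, LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0)
)
```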
data/ext/llama_cpp/src/ggml-cuda.h ADDED
@@ -0,0 +1,12 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif