llama_cpp 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ed569e816938dfca79c345228cf592eb81827c04acfeae3c8e26e0635bbc518b
4
- data.tar.gz: f4a899df0cf450370d7dc75e486a17617f1af0cbcacd9d9a8c7d3bde10016441
3
+ metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
4
+ data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
5
5
  SHA512:
6
- metadata.gz: 0f3d38eed6628e8d68efc741fe00024fb0c5199fb2e1a33d6f04d9299e1c59deb969e3eafe36190ade84522e70ddca50956fbee9b6406edc5d613f654889a83a
7
- data.tar.gz: 0b1705a8d70564a59ad6472b03dc0241727766d4121e26a2e9c3c0d4725ddf2ccf65cb8f4a862688661ea9fa2b1c8858cd6e5e722821e6c2c30c91401475ef74
6
+ metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
7
+ data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
data/CHANGELOG.md CHANGED
@@ -1,25 +1,56 @@
1
- ## [Unreleased]
1
+ ## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
2
+
3
+ - Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
4
+ - Fix installation files for CUDA.
5
+ - Add metal config option:
6
+ ```
7
+ $ gem install llama_cpp -- --with-metal
8
+ ```
9
+ ```ruby
10
+ require 'llama_cpp'
11
+
12
+ params = LLaMACpp::ContextParams.new
13
+ params.n_gpu_layers = 1
14
+
15
+ context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
16
+ LLaMACpp.generate(context, 'Hello, world.')
17
+ ```
18
+
19
+ **Breaking Changes**
20
+
21
+ - Add ModelQuantizeParams class.
22
+ - Change the argument of the `model_quantize` module function in LLaMACpp.
23
+ ```ruby
24
+ require 'llama_cpp'
25
+
26
+ params = LLaMACpp::ModelQuantizeParams.new
27
+ LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
28
+ ```
29
+
30
+ ## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
31
+
32
+ - Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
2
33
 
3
34
  ## [[0.1.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.2...v0.1.3)] - 2023-05-27
4
35
 
5
- - Bump bundled llama.cpp from master-265db98 to master-66874d4
36
+ - Bump bundled llama.cpp from master-265db98 to master-66874d4.
6
37
 
7
38
  ## [[0.1.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.1...v0.1.2)] - 2023-05-22
8
39
 
9
40
  **Breaking Changes**
10
41
 
11
- - Bump bundled llama.cpp from master-6986c78 to master-265db98
12
- - bump LLAMA_FILE_VERSION to 3
42
+ - Bump bundled llama.cpp from master-6986c78 to master-265db98.
43
+ - Bump LLAMA_FILE_VERSION to 3.
13
44
 
14
45
  ## [[0.1.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.0...v0.1.1)] - 2023-05-21
15
46
 
16
- - Add load_session_file method to Context
17
- - Add save_session_file method to Context
47
+ - Add load_session_file method to Context.
48
+ - Add save_session_file method to Context.
18
49
 
19
50
  **Breaking Changes**
20
51
 
21
- - Bump bundled llama.cpp from master-173d0e6 to master-6986c78
22
- - bump LLAMA_FILE_VERSION to 2
52
+ - Bump bundled llama.cpp from master-173d0e6 to master-6986c78.
53
+ - Bump LLAMA_FILE_VERSION to 2.
23
54
 
24
55
  ## [[0.1.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.7...v0.1.0)] - 2023-05-20
25
56
 
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'mkmf'
4
+ require 'fileutils'
4
5
 
5
6
  abort 'libstdc++ is not found.' unless have_library('stdc++')
6
7
 
@@ -36,17 +37,30 @@ if with_config('accelerate')
36
37
  $CFLAGS << ' -DGGML_USE_ACCELERATE'
37
38
  end
38
39
 
40
+ if with_config('metal')
41
+ $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
42
+ $CXXFLAGS << ' -DGGML_USE_METAL'
43
+ $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
44
+ $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
45
+ end
46
+
39
47
  if with_config('cublas')
40
48
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
49
+ $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
41
50
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
42
51
  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
43
52
  end
44
53
 
45
54
  if with_config('clblast')
46
55
  abort 'libclblast is not found.' unless have_library('clblast')
47
- abort 'libOpenCL is not found.' unless have_library('OpenCL')
48
56
 
49
57
  $CFLAGS << ' -DGGML_USE_CLBLAST'
58
+ $CXXFLAGS << ' -DGGML_USE_CLBLAST'
59
+ if RUBY_PLATFORM.match?(/darwin/)
60
+ $LDFLAGS << ' -framework OpenCL'
61
+ else
62
+ abort 'libOpenCL is not found.' unless have_library('OpenCL')
63
+ end
50
64
  end
51
65
 
52
66
  UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -78,3 +92,14 @@ if with_config('cublas')
78
92
  f.puts "\tnvcc -arch=native -c -o $@ $<"
79
93
  end
80
94
  end
95
+
96
+ if with_config('metal')
97
+ File.open('Makefile', 'a') do |f|
98
+ f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
99
+ f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
100
+ end
101
+
102
+ metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
103
+ dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
104
+ FileUtils.cp(metal_path, dest_path)
105
+ end
@@ -4,6 +4,7 @@
4
4
  VALUE rb_mLLaMACpp;
5
5
  VALUE rb_cLLaMAContext;
6
6
  VALUE rb_cLLaMAContextParams;
7
+ VALUE rb_cLLaMAModelQuantizeParams;
7
8
  VALUE rb_cLLaMATokenData;
8
9
  VALUE rb_cLLaMATokenDataArray;
9
10
 
@@ -292,6 +293,13 @@ public:
292
293
  // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
293
294
  rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
294
295
  rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
296
+ rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
297
+ rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
298
+ rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
299
+ rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
300
+ rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
301
+ rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
302
+ rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
295
303
  rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
296
304
  rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
297
305
  rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -329,6 +337,55 @@ private:
329
337
  return INT2NUM(ptr->params.n_ctx);
330
338
  };
331
339
 
340
+ // n_batch
341
+ static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
342
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
343
+ ptr->params.n_batch = NUM2INT(n_batch);
344
+ return INT2NUM(ptr->params.n_batch);
345
+ };
346
+
347
+ static VALUE _llama_context_params_get_n_batch(VALUE self) {
348
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
349
+ return INT2NUM(ptr->params.n_batch);
350
+ };
351
+
352
+ // n_gpu_layers
353
+ static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
354
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
355
+ ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
356
+ return INT2NUM(ptr->params.n_gpu_layers);
357
+ };
358
+
359
+ static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
360
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
361
+ return INT2NUM(ptr->params.n_gpu_layers);
362
+ };
363
+
364
+ // main_gpu
365
+ static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
366
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
367
+ ptr->params.main_gpu = NUM2INT(main_gpu);
368
+ return INT2NUM(ptr->params.main_gpu);
369
+ };
370
+
371
+ static VALUE _llama_context_params_get_main_gpu(VALUE self) {
372
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
373
+ return INT2NUM(ptr->params.main_gpu);
374
+ };
375
+
376
+ // tensor_split
377
+ static VALUE _llama_context_params_get_tensor_split(VALUE self) {
378
+ if (LLAMA_MAX_DEVICES < 1) {
379
+ return rb_ary_new();
380
+ }
381
+ VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
382
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
383
+ for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
384
+ rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
385
+ }
386
+ return ret;
387
+ };
388
+
332
389
  // seed
333
390
  static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
334
391
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -424,6 +481,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
424
481
  RUBY_TYPED_FREE_IMMEDIATELY
425
482
  };
426
483
 
484
+ class LLaMAModelQuantizeParamsWrapper {
485
+ public:
486
+ llama_model_quantize_params params;
487
+
488
+ LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
489
+
490
+ ~LLaMAModelQuantizeParamsWrapper(){};
491
+ };
492
+
493
+ class RbLLaMAModelQuantizeParams {
494
+ public:
495
+ static VALUE llama_model_quantize_params_alloc(VALUE self) {
496
+ LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
497
+ new (ptr) LLaMAModelQuantizeParamsWrapper();
498
+ return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
499
+ };
500
+
501
+ static void llama_model_quantize_params_free(void* ptr) {
502
+ ((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
503
+ ruby_xfree(ptr);
504
+ };
505
+
506
+ static size_t llama_model_quantize_params_size(const void* ptr) {
507
+ return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
508
+ };
509
+
510
+ static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
511
+ LLaMAModelQuantizeParamsWrapper* ptr;
512
+ TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
513
+ return ptr;
514
+ };
515
+
516
+ static void define_class(VALUE outer) {
517
+ rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
518
+ rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
519
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
520
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
521
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
522
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
523
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
524
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
525
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
526
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
527
+ };
528
+
529
+ private:
530
+ static const rb_data_type_t llama_model_quantize_params_type;
531
+
532
+ // n_thread
533
+ static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
534
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
535
+ ptr->params.nthread = NUM2INT(n_thread);
536
+ return INT2NUM(ptr->params.nthread);
537
+ };
538
+
539
+ static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
540
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
541
+ return INT2NUM(ptr->params.nthread);
542
+ };
543
+
544
+ // ftype
545
+ static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
546
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
547
+ ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
548
+ return INT2NUM(ptr->params.ftype);
549
+ };
550
+
551
+ static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
552
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
553
+ return INT2NUM(ptr->params.ftype);
554
+ };
555
+
556
+ // allow_requantize
557
+ static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
558
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
559
+ if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
560
+ ptr->params.allow_requantize = false;
561
+ } else {
562
+ ptr->params.allow_requantize = true;
563
+ }
564
+ return ptr->params.allow_requantize ? Qtrue : Qfalse;
565
+ };
566
+
567
+ static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
568
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
569
+ return ptr->params.allow_requantize ? Qtrue : Qfalse;
570
+ };
571
+
572
+ // quantize_output_tensor
573
+ static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
574
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
575
+ if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
576
+ ptr->params.quantize_output_tensor = false;
577
+ } else {
578
+ ptr->params.quantize_output_tensor = true;
579
+ }
580
+ return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
581
+ };
582
+
583
+ static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
584
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
585
+ return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
586
+ };
587
+ };
588
+
589
+ const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
590
+ "RbLLaMAModelQuantizeParams",
591
+ { NULL,
592
+ RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
593
+ RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
594
+ NULL,
595
+ NULL,
596
+ RUBY_TYPED_FREE_IMMEDIATELY
597
+ };
598
+
427
599
  class LLaMAContextWrapper {
428
600
  public:
429
601
  struct llama_context* ctx;
@@ -465,6 +637,7 @@ public:
465
637
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
466
638
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
467
639
  rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
640
+ rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
468
641
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
469
642
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
470
643
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -517,7 +690,7 @@ private:
517
690
  return Qnil;
518
691
  }
519
692
  if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
520
- rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
693
+ rb_raise(rb_eArgError, "params must be a ContextParams");
521
694
  return Qnil;
522
695
  }
523
696
 
@@ -599,6 +772,24 @@ private:
599
772
  return Qnil;
600
773
  };
601
774
 
775
+ static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
776
+ LLaMAContextWrapper* ptr = get_llama_context(self);
777
+ if (ptr->ctx == NULL) {
778
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
779
+ return Qnil;
780
+ }
781
+ if (!RB_TYPE_P(fname_, T_STRING)) {
782
+ rb_raise(rb_eArgError, "fname must be a string");
783
+ return Qnil;
784
+ }
785
+ const char* fname = StringValueCStr(fname_);
786
+ if (llama_eval_export(ptr->ctx, fname) != 0) {
787
+ return Qfalse;
788
+ }
789
+ RB_GC_GUARD(fname_);
790
+ return Qtrue;
791
+ };
792
+
602
793
  static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
603
794
  VALUE kw_args = Qnil;
604
795
  ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
@@ -1428,10 +1619,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
1428
1619
 
1429
1620
  static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
1430
1621
  VALUE kw_args = Qnil;
1431
- ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
1432
- VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
1622
+ ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
1623
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
1433
1624
  rb_scan_args(argc, argv, ":", &kw_args);
1434
- rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
1625
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
1435
1626
 
1436
1627
  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
1437
1628
  rb_raise(rb_eArgError, "input_path must be a string");
@@ -1441,21 +1632,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
1441
1632
  rb_raise(rb_eArgError, "output_path must be a string");
1442
1633
  return Qnil;
1443
1634
  }
1444
- if (!RB_INTEGER_TYPE_P(kw_values[2])) {
1445
- rb_raise(rb_eArgError, "ftype must be an integer");
1446
- return Qnil;
1447
- }
1448
- if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
1449
- rb_raise(rb_eArgError, "n_threads must be an integer");
1635
+ if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
1636
+ rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
1450
1637
  return Qnil;
1451
1638
  }
1452
1639
 
1453
1640
  const char* input_path = StringValueCStr(kw_values[0]);
1454
1641
  const char* output_path = StringValueCStr(kw_values[1]);
1455
- const int ftype = NUM2INT(kw_values[2]);
1456
- const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
1642
+ LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
1457
1643
 
1458
- if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
1644
+ if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
1459
1645
  rb_raise(rb_eRuntimeError, "Failed to quantize model");
1460
1646
  return Qnil;
1461
1647
  }
@@ -1505,6 +1691,8 @@ extern "C" void Init_llama_cpp(void) {
1505
1691
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
1506
1692
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
1507
1693
 
1694
+ rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
1695
+
1508
1696
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
1509
1697
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
1510
1698
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -1513,6 +1701,15 @@ extern "C" void Init_llama_cpp(void) {
1513
1701
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
1514
1702
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
1515
1703
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
1704
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
1705
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
1706
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
1707
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
1708
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
1709
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
1710
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
1711
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
1712
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
1516
1713
 
1517
1714
  std::stringstream ss_magic;
1518
1715
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;