llama_cpp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f08992d10701b3ac0ab87c32d8a28d7f81101a4896e3300c461d7015f6486814
4
- data.tar.gz: 38fd72fe9cdd596f7878ef902ddf8ec8e36954bbac50388fe8ef8437a93bfe29
3
+ metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
4
+ data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
5
5
  SHA512:
6
- metadata.gz: e3d024db508be6cbe7e644c4a9295da97742b45f921c9c5d64a7f4b4eb6be624e79f4c63d39c226566dbb9676215ae3b986828095a185cb2069547a12cf651a0
7
- data.tar.gz: d884334f2d77a7204f0bc96c037fa86ee6fdf3f2879c5c5bd721be336dc743a0034733fc566c114bc6f22e620e5d79ccd4c67b6ded7d929d1949315b31445701
6
+ metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
7
+ data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,32 @@
1
+ ## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
2
+
3
+ - Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
4
+ - Fix installation files for CUDA.
5
+ - Add metal config option:
6
+ ```
7
+ $ gem install llama_cpp -- --with-metal
8
+ ```
9
+ ```ruby
10
+ require 'llama_cpp'
11
+
12
+ params = LLaMACpp::ContextParams.new
13
+ params.n_gpu_layers = 1
14
+
15
+ context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
16
+ LLaMACpp.generate(context, 'Hello, world.')
17
+ ```
18
+
19
+ **Breaking Changes**
20
+
21
+ - Add ModelQuantizeParams class.
22
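+ - Change the arguments of the `model_quantize` module function in LLaMACpp: the `ftype` and `n_threads` keywords are replaced by a single `params` keyword that takes a ModelQuantizeParams.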
23
+ ```ruby
24
+ require 'llama_cpp'
25
+
26
+ params = LLaMACpp::ModelQuantizeParams.new
27
+ LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
28
+ ```
29
+
1
30
  ## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
2
31
 
3
32
  - Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'mkmf'
4
+ require 'fileutils'
4
5
 
5
6
  abort 'libstdc++ is not found.' unless have_library('stdc++')
6
7
 
@@ -36,17 +37,30 @@ if with_config('accelerate')
36
37
  $CFLAGS << ' -DGGML_USE_ACCELERATE'
37
38
  end
38
39
 
40
+ if with_config('metal')
41
+ $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
42
+ $CXXFLAGS << ' -DGGML_USE_METAL'
43
+ $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
44
+ $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
45
+ end
46
+
39
47
  if with_config('cublas')
40
48
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
49
+ $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
41
50
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
42
51
  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
43
52
  end
44
53
 
45
54
  if with_config('clblast')
46
55
  abort 'libclblast is not found.' unless have_library('clblast')
47
- abort 'libOpenCL is not found.' unless have_library('OpenCL')
48
56
 
49
57
  $CFLAGS << ' -DGGML_USE_CLBLAST'
58
+ $CXXFLAGS << ' -DGGML_USE_CLBLAST'
59
+ if RUBY_PLATFORM.match?(/darwin/)
60
+ $LDFLAGS << ' -framework OpenCL'
61
+ else
62
+ abort 'libOpenCL is not found.' unless have_library('OpenCL')
63
+ end
50
64
  end
51
65
 
52
66
  UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -78,3 +92,14 @@ if with_config('cublas')
78
92
  f.puts "\tnvcc -arch=native -c -o $@ $<"
79
93
  end
80
94
  end
95
+
96
+ if with_config('metal')
97
+ File.open('Makefile', 'a') do |f|
98
+ f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
99
+ f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
100
+ end
101
+
102
+ metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
103
+ dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
104
+ FileUtils.cp(metal_path, dest_path)
105
+ end
@@ -4,6 +4,7 @@
4
4
  VALUE rb_mLLaMACpp;
5
5
  VALUE rb_cLLaMAContext;
6
6
  VALUE rb_cLLaMAContextParams;
7
+ VALUE rb_cLLaMAModelQuantizeParams;
7
8
  VALUE rb_cLLaMATokenData;
8
9
  VALUE rb_cLLaMATokenDataArray;
9
10
 
@@ -292,6 +293,13 @@ public:
292
293
  // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
293
294
  rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
294
295
  rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
296
+ rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
297
+ rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
298
+ rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
299
+ rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
300
+ rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
301
+ rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
302
+ rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
295
303
  rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
296
304
  rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
297
305
  rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -329,6 +337,55 @@ private:
329
337
  return INT2NUM(ptr->params.n_ctx);
330
338
  };
331
339
 
340
+ // n_batch
341
+ static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
342
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
343
+ ptr->params.n_batch = NUM2INT(n_batch);
344
+ return INT2NUM(ptr->params.n_batch);
345
+ };
346
+
347
+ static VALUE _llama_context_params_get_n_batch(VALUE self) {
348
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
349
+ return INT2NUM(ptr->params.n_batch);
350
+ };
351
+
352
+ // n_gpu_layers
353
+ static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
354
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
355
+ ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
356
+ return INT2NUM(ptr->params.n_gpu_layers);
357
+ };
358
+
359
+ static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
360
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
361
+ return INT2NUM(ptr->params.n_gpu_layers);
362
+ };
363
+
364
+ // main_gpu
365
+ static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
366
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
367
+ ptr->params.main_gpu = NUM2INT(main_gpu);
368
+ return INT2NUM(ptr->params.main_gpu);
369
+ };
370
+
371
+ static VALUE _llama_context_params_get_main_gpu(VALUE self) {
372
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
373
+ return INT2NUM(ptr->params.main_gpu);
374
+ };
375
+
376
+ // tensor_split
377
+ static VALUE _llama_context_params_get_tensor_split(VALUE self) {
378
+ if (LLAMA_MAX_DEVICES < 1) {
379
+ return rb_ary_new();
380
+ }
381
+ VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
382
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
383
+ for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
384
+ rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
385
+ }
386
+ return ret;
387
+ };
388
+
332
389
  // seed
333
390
  static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
334
391
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -424,6 +481,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
424
481
  RUBY_TYPED_FREE_IMMEDIATELY
425
482
  };
426
483
 
484
+ class LLaMAModelQuantizeParamsWrapper {
485
+ public:
486
+ llama_model_quantize_params params;
487
+
488
+ LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
489
+
490
+ ~LLaMAModelQuantizeParamsWrapper(){};
491
+ };
492
+
493
+ class RbLLaMAModelQuantizeParams {
494
+ public:
495
+ static VALUE llama_model_quantize_params_alloc(VALUE self) {
496
+ LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
497
+ new (ptr) LLaMAModelQuantizeParamsWrapper();
498
+ return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
499
+ };
500
+
501
+ static void llama_model_quantize_params_free(void* ptr) {
502
+ ((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
503
+ ruby_xfree(ptr);
504
+ };
505
+
506
+ static size_t llama_model_quantize_params_size(const void* ptr) {
507
+ return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
508
+ };
509
+
510
+ static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
511
+ LLaMAModelQuantizeParamsWrapper* ptr;
512
+ TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
513
+ return ptr;
514
+ };
515
+
516
+ static void define_class(VALUE outer) {
517
+ rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
518
+ rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
519
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
520
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
521
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
522
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
523
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
524
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
525
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
526
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
527
+ };
528
+
529
+ private:
530
+ static const rb_data_type_t llama_model_quantize_params_type;
531
+
532
+ // n_thread
533
+ static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
534
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
535
+ ptr->params.nthread = NUM2INT(n_thread);
536
+ return INT2NUM(ptr->params.nthread);
537
+ };
538
+
539
+ static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
540
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
541
+ return INT2NUM(ptr->params.nthread);
542
+ };
543
+
544
+ // ftype
545
+ static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
546
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
547
+ ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
548
+ return INT2NUM(ptr->params.ftype);
549
+ };
550
+
551
+ static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
552
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
553
+ return INT2NUM(ptr->params.ftype);
554
+ };
555
+
556
+ // allow_requantize
557
+ static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
558
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
559
+ if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
560
+ ptr->params.allow_requantize = false;
561
+ } else {
562
+ ptr->params.allow_requantize = true;
563
+ }
564
+ return ptr->params.allow_requantize ? Qtrue : Qfalse;
565
+ };
566
+
567
+ static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
568
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
569
+ return ptr->params.allow_requantize ? Qtrue : Qfalse;
570
+ };
571
+
572
+ // quantize_output_tensor
573
+ static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
574
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
575
+ if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
576
+ ptr->params.quantize_output_tensor = false;
577
+ } else {
578
+ ptr->params.quantize_output_tensor = true;
579
+ }
580
+ return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
581
+ };
582
+
583
+ static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
584
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
585
+ return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
586
+ };
587
+ };
588
+
589
+ const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
590
+ "RbLLaMAModelQuantizeParams",
591
+ { NULL,
592
+ RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
593
+ RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
594
+ NULL,
595
+ NULL,
596
+ RUBY_TYPED_FREE_IMMEDIATELY
597
+ };
598
+
427
599
  class LLaMAContextWrapper {
428
600
  public:
429
601
  struct llama_context* ctx;
@@ -465,6 +637,7 @@ public:
465
637
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
466
638
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
467
639
  rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
640
+ rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
468
641
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
469
642
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
470
643
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -517,7 +690,7 @@ private:
517
690
  return Qnil;
518
691
  }
519
692
  if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
520
- rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
693
+ rb_raise(rb_eArgError, "params must be a ContextParams");
521
694
  return Qnil;
522
695
  }
523
696
 
@@ -599,6 +772,24 @@ private:
599
772
  return Qnil;
600
773
  };
601
774
 
775
+ static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
776
+ LLaMAContextWrapper* ptr = get_llama_context(self);
777
+ if (ptr->ctx == NULL) {
778
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
779
+ return Qnil;
780
+ }
781
+ if (!RB_TYPE_P(fname_, T_STRING)) {
782
+ rb_raise(rb_eArgError, "fname must be a string");
783
+ return Qnil;
784
+ }
785
+ const char* fname = StringValueCStr(fname_);
786
+ if (llama_eval_export(ptr->ctx, fname) != 0) {
787
+ return Qfalse;
788
+ }
789
+ RB_GC_GUARD(fname_);
790
+ return Qtrue;
791
+ };
792
+
602
793
  static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
603
794
  VALUE kw_args = Qnil;
604
795
  ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
@@ -1428,10 +1619,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
1428
1619
 
1429
1620
  static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
1430
1621
  VALUE kw_args = Qnil;
1431
- ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
1432
- VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
1622
+ ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
1623
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
1433
1624
  rb_scan_args(argc, argv, ":", &kw_args);
1434
- rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
1625
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
1435
1626
 
1436
1627
  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
1437
1628
  rb_raise(rb_eArgError, "input_path must be a string");
@@ -1441,21 +1632,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
1441
1632
  rb_raise(rb_eArgError, "output_path must be a string");
1442
1633
  return Qnil;
1443
1634
  }
1444
- if (!RB_INTEGER_TYPE_P(kw_values[2])) {
1445
- rb_raise(rb_eArgError, "ftype must be an integer");
1446
- return Qnil;
1447
- }
1448
- if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
1449
- rb_raise(rb_eArgError, "n_threads must be an integer");
1635
+ if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
1636
+ rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
1450
1637
  return Qnil;
1451
1638
  }
1452
1639
 
1453
1640
  const char* input_path = StringValueCStr(kw_values[0]);
1454
1641
  const char* output_path = StringValueCStr(kw_values[1]);
1455
- const int ftype = NUM2INT(kw_values[2]);
1456
- const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
1642
+ LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
1457
1643
 
1458
- if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
1644
+ if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
1459
1645
  rb_raise(rb_eRuntimeError, "Failed to quantize model");
1460
1646
  return Qnil;
1461
1647
  }
@@ -1505,6 +1691,8 @@ extern "C" void Init_llama_cpp(void) {
1505
1691
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
1506
1692
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
1507
1693
 
1694
+ rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
1695
+
1508
1696
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
1509
1697
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
1510
1698
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -1513,6 +1701,15 @@ extern "C" void Init_llama_cpp(void) {
1513
1701
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
1514
1702
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
1515
1703
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
1704
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
1705
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
1706
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
1707
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
1708
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
1709
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
1710
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
1711
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
1712
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
1516
1713
 
1517
1714
  std::stringstream ss_magic;
1518
1715
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;