llama_cpp 0.1.4 → 0.2.1

This diff compares the contents of two publicly released versions of the package as published to their registry, and is provided for informational purposes only. The hunks below are from the gem's native C++ extension; the notable changes are new GPU-related accessors on ContextParams (n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram), a new ModelQuantizeParams class, new Context#eval_export and Context#vocab methods, a reworked model_quantize signature, and new constants for LLAMA_MAX_DEVICES and the k-quant file types.
@@ -4,6 +4,7 @@
 VALUE rb_mLLaMACpp;
 VALUE rb_cLLaMAContext;
 VALUE rb_cLLaMAContextParams;
+VALUE rb_cLLaMAModelQuantizeParams;
 VALUE rb_cLLaMATokenData;
 VALUE rb_cLLaMATokenDataArray;
 
@@ -292,6 +293,15 @@ public:
     // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
+    rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
+    rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
+    rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
     rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
     rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -329,6 +339,67 @@ private:
     return INT2NUM(ptr->params.n_ctx);
   };
 
+  // n_batch
+  static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_batch = NUM2INT(n_batch);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  static VALUE _llama_context_params_get_n_batch(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  // n_gpu_layers
+  static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  // main_gpu
+  static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.main_gpu = NUM2INT(main_gpu);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  static VALUE _llama_context_params_get_main_gpu(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  // tensor_split
+  static VALUE _llama_context_params_get_tensor_split(VALUE self) {
+    if (LLAMA_MAX_DEVICES < 1) {
+      return rb_ary_new();
+    }
+    VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+      rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
+    }
+    return ret;
+  };
+
+  // low_vram
+  static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.low_vram = low_vram == Qtrue ? true : false;
+    return ptr->params.low_vram ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_context_params_get_low_vram(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.low_vram ? Qtrue : Qfalse;
+  };
+
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
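
Taken together, these accessors expose llama.cpp's GPU offload settings through Ruby. A minimal usage sketch follows; the module name LLaMACpp is inferred from rb_mLLaMACpp, ContextParams.new is assumed to go through the default allocator, and the comments reflect llama.cpp's usual semantics for these fields rather than anything stated in this diff:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.n_batch = 512        # batch size used when evaluating the prompt
    params.n_gpu_layers = 32    # number of layers to offload to the GPU
    params.main_gpu = 0         # device used for scratch buffers and small tensors
    params.low_vram = true      # trade some speed for lower VRAM use
    p params.tensor_split       # Array of Floats, one entry per device slot
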
@@ -424,6 +495,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
   RUBY_TYPED_FREE_IMMEDIATELY
 };
 
+class LLaMAModelQuantizeParamsWrapper {
+public:
+  llama_model_quantize_params params;
+
+  LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
+
+  ~LLaMAModelQuantizeParamsWrapper(){};
+};
+
+class RbLLaMAModelQuantizeParams {
+public:
+  static VALUE llama_model_quantize_params_alloc(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
+    new (ptr) LLaMAModelQuantizeParamsWrapper();
+    return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
+  };
+
+  static void llama_model_quantize_params_free(void* ptr) {
+    ((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
+    ruby_xfree(ptr);
+  };
+
+  static size_t llama_model_quantize_params_size(const void* ptr) {
+    return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
+  };
+
+  static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr;
+    TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
+    return ptr;
+  };
+
+  static void define_class(VALUE outer) {
+    rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
+    rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+  };
+
+private:
+  static const rb_data_type_t llama_model_quantize_params_type;
+
+  // n_thread
+  static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.nthread = NUM2INT(n_thread);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  // ftype
+  static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  // allow_requantize
+  static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
+      ptr->params.allow_requantize = false;
+    } else {
+      ptr->params.allow_requantize = true;
+    }
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  // quantize_output_tensor
+  static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
+      ptr->params.quantize_output_tensor = false;
+    } else {
+      ptr->params.quantize_output_tensor = true;
+    }
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+};
+
+const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
+  "RbLLaMAModelQuantizeParams",
+  { NULL,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
+  NULL,
+  NULL,
+  RUBY_TYPED_FREE_IMMEDIATELY
+};
+
 class LLaMAContextWrapper {
 public:
   struct llama_context* ctx;
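
The new ModelQuantizeParams class wraps llama_model_quantize_params, and a freshly allocated object already carries llama.cpp's defaults via llama_model_quantize_default_params(). A hedged sketch of the accessors registered above (module name LLaMACpp assumed as before; note the Ruby method is n_thread while the underlying C field is nthread):

    require 'llama_cpp'

    params = LLaMACpp::ModelQuantizeParams.new   # starts from llama.cpp's defaults
    params.n_thread = 4
    params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M
    params.allow_requantize = false              # falsy values reject already-quantized tensors
    params.quantize_output_tensor = true
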
@@ -465,9 +651,11 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+    rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
     rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -517,7 +705,7 @@ private:
       return Qnil;
     }
     if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
-      rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
+      rb_raise(rb_eArgError, "params must be a ContextParams");
       return Qnil;
     }
 
@@ -599,6 +787,24 @@ private:
     return Qnil;
   };
 
+  static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(fname_, T_STRING)) {
+      rb_raise(rb_eArgError, "fname must be a string");
+      return Qnil;
+    }
+    const char* fname = StringValueCStr(fname_);
+    if (llama_eval_export(ptr->ctx, fname) != 0) {
+      return Qfalse;
+    }
+    RB_GC_GUARD(fname_);
+    return Qtrue;
+  };
+
   static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
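
Context#eval_export wraps llama_eval_export, which writes a static computation graph for the loaded model to the given path; the binding returns true on success and false on failure. A hedged sketch; the Context constructor keywords are an assumption (only the params: type check is visible in this diff) and the paths are placeholders:

    require 'llama_cpp'

    context = LLaMACpp::Context.new(model_path: 'model.bin', params: LLaMACpp::ContextParams.new)
    ok = context.eval_export('llama.ggml')   # false if llama_eval_export reports an error
    warn 'eval_export failed' unless ok
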
@@ -705,6 +911,43 @@ private:
     return output;
   };
 
+  static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[1] = { rb_intern("capacity") };
+    VALUE kw_values[1] = { Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "capacity must be an integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    const int capacity = NUM2INT(kw_values[0]);
+    std::vector<const char*> strings;
+    std::vector<float> scores;
+    int n_vocab = llama_n_vocab(ptr->ctx);
+    strings.resize(n_vocab, NULL);
+    scores.resize(n_vocab, 0);
+
+    n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+
+    VALUE ret_strings = rb_ary_new();
+    VALUE ret_scores = rb_ary_new();
+    for (int i = 0; i < n_vocab; i++) {
+      rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
+      rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+    }
+
+    return rb_ary_new_from_args(2, ret_strings, ret_scores);
+  };
+
   static VALUE _llama_context_n_vocab(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
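
Context#vocab takes a required capacity: keyword and returns a two-element array: the token strings and their scores. A hedged sketch, with the same assumed constructor keywords as in the eval_export example:

    require 'llama_cpp'

    context = LLaMACpp::Context.new(model_path: 'model.bin', params: LLaMACpp::ContextParams.new)
    strings, scores = context.vocab(capacity: context.n_vocab)
    puts "#{strings.size} tokens; first: #{strings.first.inspect}, score #{scores.first}"
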
@@ -1428,10 +1671,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
 
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
-  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
-  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
+  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
   rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
 
   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
     rb_raise(rb_eArgError, "input_path must be a string");
@@ -1441,21 +1684,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
     rb_raise(rb_eArgError, "output_path must be a string");
     return Qnil;
   }
-  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
-    rb_raise(rb_eArgError, "ftype must be an integer");
-    return Qnil;
-  }
-  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
-    rb_raise(rb_eArgError, "n_threads must be an integer");
+  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
+    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
     return Qnil;
   }
 
   const char* input_path = StringValueCStr(kw_values[0]);
   const char* output_path = StringValueCStr(kw_values[1]);
-  const int ftype = NUM2INT(kw_values[2]);
-  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
 
-  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
     rb_raise(rb_eRuntimeError, "Failed to quantize model");
     return Qnil;
   }
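
This is a breaking change to model_quantize: the ftype: and n_threads: keywords are gone, replaced by a required params: object, and rb_get_kwargs now marks all three keywords as required. A hedged before/after sketch (LLaMACpp.model_quantize assumed as the Ruby-visible name of rb_llama_model_quantize; paths are placeholders):

    require 'llama_cpp'

    # 0.1.4:
    #   LLaMACpp.model_quantize(input_path: 'in.bin', output_path: 'out.bin',
    #                           ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0, n_threads: 4)

    # 0.2.1:
    params = LLaMACpp::ModelQuantizeParams.new
    params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M
    params.n_thread = 4
    LLaMACpp.model_quantize(input_path: 'in.bin', output_path: 'out.bin', params: params)
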
@@ -1505,6 +1743,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
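
The new LLAMA_MAX_DEVICES constant mirrors the length of the array returned by ContextParams#tensor_split (when at least one device slot is compiled in). A hedged sketch:

    require 'llama_cpp'

    p LLaMACpp::LLAMA_MAX_DEVICES                    # e.g. 1 on CPU-only builds
    p LLaMACpp::ContextParams.new.tensor_split.size  # matches when LLAMA_MAX_DEVICES >= 1
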
@@ -1513,6 +1753,15 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
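
The nine new constants expose llama.cpp's k-quant file types (Q2_K through Q6_K, with S/M/L size variants); any of them can be assigned to ModelQuantizeParams#ftype. A hedged sketch that simply enumerates them:

    require 'llama_cpp'

    %w[Q2_K Q3_K_S Q3_K_M Q3_K_L Q4_K_S Q4_K_M Q5_K_S Q5_K_M Q6_K].each do |name|
      const = "LLAMA_FTYPE_MOSTLY_#{name}"
      puts "#{const} = #{LLaMACpp.const_get(const)}"
    end
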