llama_cpp 0.1.4 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,6 +4,7 @@
4
4
  VALUE rb_mLLaMACpp;
5
5
  VALUE rb_cLLaMAContext;
6
6
  VALUE rb_cLLaMAContextParams;
7
+ VALUE rb_cLLaMAModelQuantizeParams;
7
8
  VALUE rb_cLLaMATokenData;
8
9
  VALUE rb_cLLaMATokenDataArray;
9
10
 
@@ -292,6 +293,15 @@ public:
292
293
  // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
293
294
  rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
294
295
  rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
296
+ rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
297
+ rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
298
+ rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
299
+ rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
300
+ rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
301
+ rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
302
+ rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
303
+ rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
304
+ rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
295
305
  rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
296
306
  rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
297
307
  rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -329,6 +339,67 @@ private:
329
339
  return INT2NUM(ptr->params.n_ctx);
330
340
  };
331
341
 
342
+ // n_batch
343
+ static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
344
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
345
+ ptr->params.n_batch = NUM2INT(n_batch);
346
+ return INT2NUM(ptr->params.n_batch);
347
+ };
348
+
349
+ static VALUE _llama_context_params_get_n_batch(VALUE self) {
350
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
351
+ return INT2NUM(ptr->params.n_batch);
352
+ };
353
+
354
+ // n_gpu_layers
355
+ static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
356
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
357
+ ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
358
+ return INT2NUM(ptr->params.n_gpu_layers);
359
+ };
360
+
361
+ static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
362
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
363
+ return INT2NUM(ptr->params.n_gpu_layers);
364
+ };
365
+
366
+ // main_gpu
367
+ static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
368
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
369
+ ptr->params.main_gpu = NUM2INT(main_gpu);
370
+ return INT2NUM(ptr->params.main_gpu);
371
+ };
372
+
373
+ static VALUE _llama_context_params_get_main_gpu(VALUE self) {
374
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
375
+ return INT2NUM(ptr->params.main_gpu);
376
+ };
377
+
378
+ // tensor_split
379
+ static VALUE _llama_context_params_get_tensor_split(VALUE self) {
380
+ if (LLAMA_MAX_DEVICES < 1) {
381
+ return rb_ary_new();
382
+ }
383
+ VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
384
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
385
+ for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
386
+ rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
387
+ }
388
+ return ret;
389
+ };
390
+
391
+ // low_vram
392
+ static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
393
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
394
+ ptr->params.low_vram = low_vram == Qtrue ? true : false;
395
+ return ptr->params.low_vram ? Qtrue : Qfalse;
396
+ };
397
+
398
+ static VALUE _llama_context_params_get_low_vram(VALUE self) {
399
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
400
+ return ptr->params.low_vram ? Qtrue : Qfalse;
401
+ };
402
+
332
403
  // seed
333
404
  static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
334
405
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -424,6 +495,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
424
495
  RUBY_TYPED_FREE_IMMEDIATELY
425
496
  };
426
497
 
498
+ class LLaMAModelQuantizeParamsWrapper {
499
+ public:
500
+ llama_model_quantize_params params;
501
+
502
+ LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
503
+
504
+ ~LLaMAModelQuantizeParamsWrapper(){};
505
+ };
506
+
507
+ class RbLLaMAModelQuantizeParams {
508
+ public:
509
+ static VALUE llama_model_quantize_params_alloc(VALUE self) {
510
+ LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
511
+ new (ptr) LLaMAModelQuantizeParamsWrapper();
512
+ return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
513
+ };
514
+
515
+ static void llama_model_quantize_params_free(void* ptr) {
516
+ ((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
517
+ ruby_xfree(ptr);
518
+ };
519
+
520
+ static size_t llama_model_quantize_params_size(const void* ptr) {
521
+ return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
522
+ };
523
+
524
+ static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
525
+ LLaMAModelQuantizeParamsWrapper* ptr;
526
+ TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
527
+ return ptr;
528
+ };
529
+
530
+ static void define_class(VALUE outer) {
531
+ rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
532
+ rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
533
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
534
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
535
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
536
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
537
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
538
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
539
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
540
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
541
+ };
542
+
543
+ private:
544
+ static const rb_data_type_t llama_model_quantize_params_type;
545
+
546
+ // n_thread
547
+ static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
548
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
549
+ ptr->params.nthread = NUM2INT(n_thread);
550
+ return INT2NUM(ptr->params.nthread);
551
+ };
552
+
553
+ static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
554
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
555
+ return INT2NUM(ptr->params.nthread);
556
+ };
557
+
558
+ // ftype
559
+ static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
560
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
561
+ ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
562
+ return INT2NUM(ptr->params.ftype);
563
+ };
564
+
565
+ static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
566
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
567
+ return INT2NUM(ptr->params.ftype);
568
+ };
569
+
570
+ // allow_requantize
571
+ static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
572
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
573
+ if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
574
+ ptr->params.allow_requantize = false;
575
+ } else {
576
+ ptr->params.allow_requantize = true;
577
+ }
578
+ return ptr->params.allow_requantize ? Qtrue : Qfalse;
579
+ };
580
+
581
+ static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
582
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
583
+ return ptr->params.allow_requantize ? Qtrue : Qfalse;
584
+ };
585
+
586
+ // quantize_output_tensor
587
+ static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
588
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
589
+ if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
590
+ ptr->params.quantize_output_tensor = false;
591
+ } else {
592
+ ptr->params.quantize_output_tensor = true;
593
+ }
594
+ return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
595
+ };
596
+
597
+ static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
598
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
599
+ return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
600
+ };
601
+ };
602
+
603
+ const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
604
+ "RbLLaMAModelQuantizeParams",
605
+ { NULL,
606
+ RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
607
+ RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
608
+ NULL,
609
+ NULL,
610
+ RUBY_TYPED_FREE_IMMEDIATELY
611
+ };
612
+
427
613
  class LLaMAContextWrapper {
428
614
  public:
429
615
  struct llama_context* ctx;
@@ -465,9 +651,11 @@ public:
465
651
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
466
652
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
467
653
  rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
654
+ rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
468
655
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
469
656
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
470
657
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
658
+ rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
471
659
  rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
472
660
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
473
661
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -517,7 +705,7 @@ private:
517
705
  return Qnil;
518
706
  }
519
707
  if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
520
- rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
708
+ rb_raise(rb_eArgError, "params must be a ContextParams");
521
709
  return Qnil;
522
710
  }
523
711
 
@@ -599,6 +787,24 @@ private:
599
787
  return Qnil;
600
788
  };
601
789
 
790
+ static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
791
+ LLaMAContextWrapper* ptr = get_llama_context(self);
792
+ if (ptr->ctx == NULL) {
793
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
794
+ return Qnil;
795
+ }
796
+ if (!RB_TYPE_P(fname_, T_STRING)) {
797
+ rb_raise(rb_eArgError, "fname must be a string");
798
+ return Qnil;
799
+ }
800
+ const char* fname = StringValueCStr(fname_);
801
+ if (llama_eval_export(ptr->ctx, fname) != 0) {
802
+ return Qfalse;
803
+ }
804
+ RB_GC_GUARD(fname_);
805
+ return Qtrue;
806
+ };
807
+
602
808
  static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
603
809
  VALUE kw_args = Qnil;
604
810
  ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
@@ -705,6 +911,43 @@ private:
705
911
  return output;
706
912
  };
707
913
 
914
+ static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
915
+ VALUE kw_args = Qnil;
916
+ ID kw_table[1] = { rb_intern("capacity") };
917
+ VALUE kw_values[1] = { Qundef };
918
+ rb_scan_args(argc, argv, ":", &kw_args);
919
+ rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
920
+
921
+ if (!RB_INTEGER_TYPE_P(kw_values[0])) {
922
+ rb_raise(rb_eArgError, "capacity must be an integer");
923
+ return Qnil;
924
+ }
925
+
926
+ LLaMAContextWrapper* ptr = get_llama_context(self);
927
+ if (ptr->ctx == NULL) {
928
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
929
+ return Qnil;
930
+ }
931
+
932
+ const int capacity = NUM2INT(kw_values[0]);
933
+ std::vector<const char*> strings;
934
+ std::vector<float> scores;
935
+ int n_vocab = llama_n_vocab(ptr->ctx);
936
+ strings.resize(n_vocab, NULL);
937
+ scores.resize(n_vocab, 0);
938
+
939
+ n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
940
+
941
+ VALUE ret_strings = rb_ary_new();
942
+ VALUE ret_scores = rb_ary_new();
943
+ for (int i = 0; i < n_vocab; i++) {
944
+ rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
945
+ rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
946
+ }
947
+
948
+ return rb_ary_new_from_args(2, ret_strings, ret_scores);
949
+ };
950
+
708
951
  static VALUE _llama_context_n_vocab(VALUE self) {
709
952
  LLaMAContextWrapper* ptr = get_llama_context(self);
710
953
  if (ptr->ctx == NULL) {
@@ -1428,10 +1671,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
1428
1671
 
1429
1672
  static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
1430
1673
  VALUE kw_args = Qnil;
1431
- ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
1432
- VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
1674
+ ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
1675
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
1433
1676
  rb_scan_args(argc, argv, ":", &kw_args);
1434
- rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
1677
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
1435
1678
 
1436
1679
  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
1437
1680
  rb_raise(rb_eArgError, "input_path must be a string");
@@ -1441,21 +1684,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
1441
1684
  rb_raise(rb_eArgError, "output_path must be a string");
1442
1685
  return Qnil;
1443
1686
  }
1444
- if (!RB_INTEGER_TYPE_P(kw_values[2])) {
1445
- rb_raise(rb_eArgError, "ftype must be an integer");
1446
- return Qnil;
1447
- }
1448
- if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
1449
- rb_raise(rb_eArgError, "n_threads must be an integer");
1687
+ if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
1688
+ rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
1450
1689
  return Qnil;
1451
1690
  }
1452
1691
 
1453
1692
  const char* input_path = StringValueCStr(kw_values[0]);
1454
1693
  const char* output_path = StringValueCStr(kw_values[1]);
1455
- const int ftype = NUM2INT(kw_values[2]);
1456
- const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
1694
+ LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
1457
1695
 
1458
- if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
1696
+ if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
1459
1697
  rb_raise(rb_eRuntimeError, "Failed to quantize model");
1460
1698
  return Qnil;
1461
1699
  }
@@ -1505,6 +1743,8 @@ extern "C" void Init_llama_cpp(void) {
1505
1743
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
1506
1744
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
1507
1745
 
1746
+ rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
1747
+
1508
1748
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
1509
1749
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
1510
1750
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -1513,6 +1753,15 @@ extern "C" void Init_llama_cpp(void) {
1513
1753
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
1514
1754
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
1515
1755
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
1756
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
1757
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
1758
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
1759
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
1760
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
1761
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
1762
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
1763
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
1764
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
1516
1765
 
1517
1766
  std::stringstream ss_magic;
1518
1767
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;