llama_cpp 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -4,6 +4,7 @@
 VALUE rb_mLLaMACpp;
 VALUE rb_cLLaMAContext;
 VALUE rb_cLLaMAContextParams;
+VALUE rb_cLLaMAModelQuantizeParams;
 VALUE rb_cLLaMATokenData;
 VALUE rb_cLLaMATokenDataArray;
 
@@ -292,6 +293,15 @@ public:
     // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
+    rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
+    rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
+    rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
     rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
     rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -329,6 +339,67 @@ private:
     return INT2NUM(ptr->params.n_ctx);
   };
 
+  // n_batch
+  static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_batch = NUM2INT(n_batch);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  static VALUE _llama_context_params_get_n_batch(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  // n_gpu_layers
+  static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  // main_gpu
+  static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.main_gpu = NUM2INT(main_gpu);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  static VALUE _llama_context_params_get_main_gpu(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  // tensor_split
+  static VALUE _llama_context_params_get_tensor_split(VALUE self) {
+    if (LLAMA_MAX_DEVICES < 1) {
+      return rb_ary_new();
+    }
+    VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+      rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
+    }
+    return ret;
+  };
+
+  // low_vram
+  static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.low_vram = low_vram == Qtrue ? true : false;
+    return ptr->params.low_vram ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_context_params_get_low_vram(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.low_vram ? Qtrue : Qfalse;
+  };
+
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
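For reference, a minimal Ruby sketch of how the `ContextParams` accessors registered and implemented above can be used from the gem. The values and the comments on `low_vram` and `tensor_split` follow upstream llama.cpp semantics and are assumptions, not taken from this diff:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ctx        = 512
params.n_batch      = 512   # new in 0.2.1: batch size used for prompt processing
params.n_gpu_layers = 32    # new: number of layers to offload to the GPU
params.main_gpu     = 0     # new: index of the primary GPU
params.low_vram     = false # new: trade speed for lower VRAM use (upstream semantics, assumed)
p params.tensor_split       # new: read-only Array of per-device split ratios
```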
@@ -424,6 +495,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
   RUBY_TYPED_FREE_IMMEDIATELY
 };
 
+class LLaMAModelQuantizeParamsWrapper {
+public:
+  llama_model_quantize_params params;
+
+  LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
+
+  ~LLaMAModelQuantizeParamsWrapper(){};
+};
+
+class RbLLaMAModelQuantizeParams {
+public:
+  static VALUE llama_model_quantize_params_alloc(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
+    new (ptr) LLaMAModelQuantizeParamsWrapper();
+    return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
+  };
+
+  static void llama_model_quantize_params_free(void* ptr) {
+    ((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
+    ruby_xfree(ptr);
+  };
+
+  static size_t llama_model_quantize_params_size(const void* ptr) {
+    return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
+  };
+
+  static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr;
+    TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
+    return ptr;
+  };
+
+  static void define_class(VALUE outer) {
+    rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
+    rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+  };
+
+private:
+  static const rb_data_type_t llama_model_quantize_params_type;
+
+  // n_thread
+  static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.nthread = NUM2INT(n_thread);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  // ftype
+  static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  // allow_requantize
+  static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
+      ptr->params.allow_requantize = false;
+    } else {
+      ptr->params.allow_requantize = true;
+    }
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  // quantize_output_tensor
+  static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
+      ptr->params.quantize_output_tensor = false;
+    } else {
+      ptr->params.quantize_output_tensor = true;
+    }
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+};
+
+const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
+  "RbLLaMAModelQuantizeParams",
+  { NULL,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
+  NULL,
+  NULL,
+  RUBY_TYPED_FREE_IMMEDIATELY
+};
+
 class LLaMAContextWrapper {
 public:
   struct llama_context* ctx;
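A short sketch of the new `LLaMACpp::ModelQuantizeParams` class defined above; `.new` allocates a wrapper initialized from `llama_model_quantize_default_params()`. The comments describe upstream llama.cpp semantics and are assumptions, not part of this diff:

```ruby
require 'llama_cpp'

qparams = LLaMACpp::ModelQuantizeParams.new
qparams.n_thread = 4                                 # threads to use while quantizing
qparams.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # target file type (new k-quant constants below)
qparams.allow_requantize = false                     # whether already-quantized tensors may be requantized
qparams.quantize_output_tensor = true                # whether to quantize the output tensor as well
```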
@@ -465,9 +651,11 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+    rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
     rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -517,7 +705,7 @@ private:
       return Qnil;
     }
     if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
-      rb_raise(rb_eArgError, "params must be a
+      rb_raise(rb_eArgError, "params must be a ContextParams");
       return Qnil;
     }
 
@@ -599,6 +787,24 @@ private:
     return Qnil;
   };
 
+  static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(fname_, T_STRING)) {
+      rb_raise(rb_eArgError, "fname must be a string");
+      return Qnil;
+    }
+    const char* fname = StringValueCStr(fname_);
+    if (llama_eval_export(ptr->ctx, fname) != 0) {
+      return Qfalse;
+    }
+    RB_GC_GUARD(fname_);
+    return Qtrue;
+  };
+
   static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
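A hedged sketch of calling the new `Context#eval_export` binding; per upstream llama.cpp, `llama_eval_export` writes a static computation graph to the given file. The `model_path:` keyword name and all file paths are placeholders assumed for illustration, not shown in this hunk:

```ruby
require 'llama_cpp'

params  = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: 'path/to/ggml-model.bin', params: params)

# eval_export returns true on success and false if llama_eval_export reports an error.
warn 'graph export failed' unless context.eval_export('llama.ggml')
```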
@@ -705,6 +911,43 @@ private:
     return output;
   };
 
+  static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[1] = { rb_intern("capacity") };
+    VALUE kw_values[1] = { Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "capacity must be an integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    const int capacity = NUM2INT(kw_values[0]);
+    std::vector<const char*> strings;
+    std::vector<float> scores;
+    int n_vocab = llama_n_vocab(ptr->ctx);
+    strings.resize(n_vocab, NULL);
+    scores.resize(n_vocab, 0);
+
+    n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+
+    VALUE ret_strings = rb_ary_new();
+    VALUE ret_scores = rb_ary_new();
+    for (int i = 0; i < n_vocab; i++) {
+      rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
+      rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+    }
+
+    return rb_ary_new_from_args(2, ret_strings, ret_scores);
+  };
+
   static VALUE _llama_context_n_vocab(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
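Likewise, a sketch of the new `Context#vocab` method: it takes a required `capacity:` keyword and returns a pair of arrays, token strings and their scores, mirroring the binding above. `context` is assumed to be built as in the previous sketch:

```ruby
strings, scores = context.vocab(capacity: context.n_vocab)
puts "vocab entries returned: #{strings.size}"
puts strings.first(5).inspect  # first few token strings
puts scores.first(5).inspect   # and their scores
```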
@@ -1428,10 +1671,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
 
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
-  ID kw_table[
-  VALUE kw_values[
+  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
+  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
   rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 3,
+  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
 
   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
     rb_raise(rb_eArgError, "input_path must be a string");
@@ -1441,21 +1684,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
     rb_raise(rb_eArgError, "output_path must be a string");
     return Qnil;
   }
-  if (!
-    rb_raise(rb_eArgError, "
-    return Qnil;
-  }
-  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
-    rb_raise(rb_eArgError, "n_threads must be an integer");
+  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
+    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
     return Qnil;
   }
 
   const char* input_path = StringValueCStr(kw_values[0]);
   const char* output_path = StringValueCStr(kw_values[1]);
-
-  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
 
-  if (llama_model_quantize(input_path, output_path, (
+  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
     rb_raise(rb_eRuntimeError, "Failed to quantize model");
     return Qnil;
   }
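Putting the two quantization changes together, a sketch of the revised module-level call: the old `n_threads:` keyword is gone, and a `ModelQuantizeParams` object is now passed via `params:` (the keyword names come straight from the kw_table above; the file paths are placeholders):

```ruby
require 'llama_cpp'

qparams = LLaMACpp::ModelQuantizeParams.new
qparams.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M

LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                        output_path: 'ggml-model-q4_k_m.bin',
                        params: qparams)
```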
@@ -1505,6 +1743,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -1513,6 +1753,15 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
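The constants registered above become visible to Ruby callers; a small sketch (actual values depend on how the extension was built, e.g. LLAMA_MAX_DEVICES being greater than 1 only for multi-GPU builds is an assumption based on upstream llama.cpp):

```ruby
require 'llama_cpp'

puts LLaMACpp::LLAMA_MAX_DEVICES          # upper bound on entries in ContextParams#tensor_split
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_Q5_K_M  # one of the new k-quant file-type constants
```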