llama_cpp 0.1.4 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
VALUE rb_mLLaMACpp;
|
5
5
|
VALUE rb_cLLaMAContext;
|
6
6
|
VALUE rb_cLLaMAContextParams;
|
7
|
+
VALUE rb_cLLaMAModelQuantizeParams;
|
7
8
|
VALUE rb_cLLaMATokenData;
|
8
9
|
VALUE rb_cLLaMATokenDataArray;
|
9
10
|
|
@@ -292,6 +293,15 @@ public:
|
|
292
293
|
// rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
|
293
294
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
|
294
295
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
|
296
|
+
rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
|
297
|
+
rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
|
298
|
+
rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
|
299
|
+
rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
|
300
|
+
rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
|
301
|
+
rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
|
302
|
+
rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
|
303
|
+
rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
|
304
|
+
rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
|
295
305
|
rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
|
296
306
|
rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
|
297
307
|
rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
|
@@ -329,6 +339,67 @@ private:
|
|
329
339
|
return INT2NUM(ptr->params.n_ctx);
|
330
340
|
};
|
331
341
|
|
342
|
+
// n_batch
|
343
|
+
static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
  // Backs the Ruby `n_batch=` writer: converts the argument with NUM2INT,
  // stores it in the wrapped params and returns the stored value.
  LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  const int requested = NUM2INT(n_batch);
  wrapper->params.n_batch = requested;
  return INT2NUM(wrapper->params.n_batch);
}
|
348
|
+
|
349
|
+
static VALUE _llama_context_params_get_n_batch(VALUE self) {
  // Backs the Ruby `n_batch` reader: returns params.n_batch as a Ruby Integer.
  const LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  return INT2NUM(wrapper->params.n_batch);
}
|
353
|
+
|
354
|
+
// n_gpu_layers
|
355
|
+
static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
  // Backs the Ruby `n_gpu_layers=` writer: converts the argument with NUM2INT,
  // stores it in the wrapped params and returns the stored value.
  LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  const int requested = NUM2INT(n_gpu_layers);
  wrapper->params.n_gpu_layers = requested;
  return INT2NUM(wrapper->params.n_gpu_layers);
}
|
360
|
+
|
361
|
+
static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
  // Backs the Ruby `n_gpu_layers` reader: returns params.n_gpu_layers as a Ruby Integer.
  const LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  return INT2NUM(wrapper->params.n_gpu_layers);
}
|
365
|
+
|
366
|
+
// main_gpu
|
367
|
+
static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
  // Backs the Ruby `main_gpu=` writer: converts the argument with NUM2INT,
  // stores it in the wrapped params and returns the stored value.
  LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  const int requested = NUM2INT(main_gpu);
  wrapper->params.main_gpu = requested;
  return INT2NUM(wrapper->params.main_gpu);
}
|
372
|
+
|
373
|
+
static VALUE _llama_context_params_get_main_gpu(VALUE self) {
  // Backs the Ruby `main_gpu` reader: returns params.main_gpu as a Ruby Integer.
  const LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  return INT2NUM(wrapper->params.main_gpu);
}
|
377
|
+
|
378
|
+
// tensor_split
|
379
|
+
static VALUE _llama_context_params_get_tensor_split(VALUE self) {
  // Backs the Ruby `tensor_split` reader: copies the first LLAMA_MAX_DEVICES
  // entries of params.tensor_split into a new Ruby Array of Floats.
  if (LLAMA_MAX_DEVICES < 1) {
    // No device slots to report: hand back an empty array.
    return rb_ary_new();
  }
  VALUE splits = rb_ary_new2(LLAMA_MAX_DEVICES);
  const LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  for (long dev = 0; dev < LLAMA_MAX_DEVICES; ++dev) {
    rb_ary_store(splits, dev, DBL2NUM(wrapper->params.tensor_split[dev]));
  }
  return splits;
}
|
390
|
+
|
391
|
+
// low_vram
|
392
|
+
static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
  // Backs the Ruby `low_vram=` writer and returns the stored flag as true/false.
  //
  // The flag follows Ruby truthiness (everything except nil and false enables
  // it), the same rule the ModelQuantizeParams boolean setters in this file
  // apply. Comparing against Qtrue only would silently store false for any
  // other truthy argument.
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  ptr->params.low_vram = RTEST(low_vram) ? true : false;
  return ptr->params.low_vram ? Qtrue : Qfalse;
}
|
397
|
+
|
398
|
+
static VALUE _llama_context_params_get_low_vram(VALUE self) {
  // Backs the Ruby `low_vram` reader: maps the stored flag to Qtrue/Qfalse.
  const LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
  if (wrapper->params.low_vram) {
    return Qtrue;
  }
  return Qfalse;
}
|
402
|
+
|
332
403
|
// seed
|
333
404
|
static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
|
334
405
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
@@ -424,6 +495,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
|
|
424
495
|
RUBY_TYPED_FREE_IMMEDIATELY
|
425
496
|
};
|
426
497
|
|
498
|
+
class LLaMAModelQuantizeParamsWrapper {
|
499
|
+
public:
|
500
|
+
llama_model_quantize_params params;
|
501
|
+
|
502
|
+
LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
|
503
|
+
|
504
|
+
~LLaMAModelQuantizeParamsWrapper(){};
|
505
|
+
};
|
506
|
+
|
507
|
+
class RbLLaMAModelQuantizeParams {
|
508
|
+
public:
|
509
|
+
static VALUE llama_model_quantize_params_alloc(VALUE self) {
|
510
|
+
LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
|
511
|
+
new (ptr) LLaMAModelQuantizeParamsWrapper();
|
512
|
+
return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
|
513
|
+
};
|
514
|
+
|
515
|
+
static void llama_model_quantize_params_free(void* ptr) {
|
516
|
+
((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
|
517
|
+
ruby_xfree(ptr);
|
518
|
+
};
|
519
|
+
|
520
|
+
static size_t llama_model_quantize_params_size(const void* ptr) {
|
521
|
+
return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
|
522
|
+
};
|
523
|
+
|
524
|
+
static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
|
525
|
+
LLaMAModelQuantizeParamsWrapper* ptr;
|
526
|
+
TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
|
527
|
+
return ptr;
|
528
|
+
};
|
529
|
+
|
530
|
+
static void define_class(VALUE outer) {
|
531
|
+
rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
|
532
|
+
rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
|
533
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
|
534
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
|
535
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
|
536
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
|
537
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
|
538
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
|
539
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
|
540
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
|
541
|
+
};
|
542
|
+
|
543
|
+
private:
|
544
|
+
static const rb_data_type_t llama_model_quantize_params_type;
|
545
|
+
|
546
|
+
// n_thread
|
547
|
+
static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
|
548
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
549
|
+
ptr->params.nthread = NUM2INT(n_thread);
|
550
|
+
return INT2NUM(ptr->params.nthread);
|
551
|
+
};
|
552
|
+
|
553
|
+
static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
|
554
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
555
|
+
return INT2NUM(ptr->params.nthread);
|
556
|
+
};
|
557
|
+
|
558
|
+
// ftype
|
559
|
+
static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
|
560
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
561
|
+
ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
|
562
|
+
return INT2NUM(ptr->params.ftype);
|
563
|
+
};
|
564
|
+
|
565
|
+
static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
|
566
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
567
|
+
return INT2NUM(ptr->params.ftype);
|
568
|
+
};
|
569
|
+
|
570
|
+
// allow_requantize
|
571
|
+
static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
|
572
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
573
|
+
if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
|
574
|
+
ptr->params.allow_requantize = false;
|
575
|
+
} else {
|
576
|
+
ptr->params.allow_requantize = true;
|
577
|
+
}
|
578
|
+
return ptr->params.allow_requantize ? Qtrue : Qfalse;
|
579
|
+
};
|
580
|
+
|
581
|
+
static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
|
582
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
583
|
+
return ptr->params.allow_requantize ? Qtrue : Qfalse;
|
584
|
+
};
|
585
|
+
|
586
|
+
// quantize_output_tensor
|
587
|
+
static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
|
588
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
589
|
+
if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
|
590
|
+
ptr->params.quantize_output_tensor = false;
|
591
|
+
} else {
|
592
|
+
ptr->params.quantize_output_tensor = true;
|
593
|
+
}
|
594
|
+
return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
|
595
|
+
};
|
596
|
+
|
597
|
+
static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
|
598
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
599
|
+
return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
|
600
|
+
};
|
601
|
+
};
|
602
|
+
|
603
|
+
// Typed-data descriptor used when wrapping/unwrapping ModelQuantizeParams
// objects. Field roles below follow Ruby's rb_data_type_t layout.
const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
  "RbLLaMAModelQuantizeParams",  // type name
  {
    NULL,                                                          // no mark callback
    RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,  // free callback
    RbLLaMAModelQuantizeParams::llama_model_quantize_params_size   // size callback
  },
  NULL,                        // parent
  NULL,                        // data
  RUBY_TYPED_FREE_IMMEDIATELY  // flags
};
|
612
|
+
|
427
613
|
class LLaMAContextWrapper {
|
428
614
|
public:
|
429
615
|
struct llama_context* ctx;
|
@@ -465,9 +651,11 @@ public:
|
|
465
651
|
rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
|
466
652
|
rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
|
467
653
|
rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
|
654
|
+
rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
|
468
655
|
rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
|
469
656
|
rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
|
470
657
|
rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
|
658
|
+
rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
|
471
659
|
rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
|
472
660
|
rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
|
473
661
|
rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
|
@@ -517,7 +705,7 @@ private:
|
|
517
705
|
return Qnil;
|
518
706
|
}
|
519
707
|
if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
|
520
|
-
rb_raise(rb_eArgError, "params must be a
|
708
|
+
rb_raise(rb_eArgError, "params must be a ContextParams");
|
521
709
|
return Qnil;
|
522
710
|
}
|
523
711
|
|
@@ -599,6 +787,24 @@ private:
|
|
599
787
|
return Qnil;
|
600
788
|
};
|
601
789
|
|
790
|
+
static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
  // Backs the Ruby `eval_export` method: passes the context and file name to
  // llama_eval_export and returns true when it reports 0, false otherwise.
  // Raises RuntimeError when the context is not initialized and ArgumentError
  // when fname_ is not a String. rb_raise does not return, so nothing follows it.
  LLaMAContextWrapper* wrapper = get_llama_context(self);
  if (wrapper->ctx == NULL) {
    rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  }
  if (!RB_TYPE_P(fname_, T_STRING)) {
    rb_raise(rb_eArgError, "fname must be a string");
  }

  const char* path = StringValueCStr(fname_);
  const bool exported = llama_eval_export(wrapper->ctx, path) == 0;
  // Keep the Ruby string alive for as long as `path` points into it.
  RB_GC_GUARD(fname_);
  return exported ? Qtrue : Qfalse;
}
|
807
|
+
|
602
808
|
static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
|
603
809
|
VALUE kw_args = Qnil;
|
604
810
|
ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
|
@@ -705,6 +911,43 @@ private:
|
|
705
911
|
return output;
|
706
912
|
};
|
707
913
|
|
914
|
+
static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
  // Backs the Ruby `vocab(capacity:)` method.
  //
  // Returns a two-element Array: [token strings, token scores], each holding
  // as many entries as llama_get_vocab reports.
  // Raises ArgumentError when `capacity` is not an Integer and RuntimeError
  // when the context is not initialized.
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("capacity") };
  VALUE kw_values[1] = { Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);

  if (!RB_INTEGER_TYPE_P(kw_values[0])) {
    rb_raise(rb_eArgError, "capacity must be an integer");
    return Qnil;
  }

  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
    rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
    return Qnil;
  }

  int capacity = NUM2INT(kw_values[0]);
  const int n_vocab = llama_n_vocab(ptr->ctx);

  // The output buffers hold exactly n_vocab entries, so never advertise more
  // room than that (or a negative amount) to llama_get_vocab.
  if (capacity > n_vocab) {
    capacity = n_vocab;
  }
  if (capacity < 0) {
    capacity = 0;
  }

  std::vector<const char*> strings;
  std::vector<float> scores;
  strings.resize(n_vocab, NULL);
  scores.resize(n_vocab, 0);

  // NOTE(review): the return value is treated as the number of entries
  // written, as the original loop did — confirm against llama.h.
  const int n_filled = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);

  VALUE ret_strings = rb_ary_new();
  VALUE ret_scores = rb_ary_new();
  for (int i = 0; i < n_filled; i++) {
    rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
    rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
  }

  return rb_ary_new_from_args(2, ret_strings, ret_scores);
}
|
950
|
+
|
708
951
|
static VALUE _llama_context_n_vocab(VALUE self) {
|
709
952
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
710
953
|
if (ptr->ctx == NULL) {
|
@@ -1428,10 +1671,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
|
|
1428
1671
|
|
1429
1672
|
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
|
1430
1673
|
VALUE kw_args = Qnil;
|
1431
|
-
ID kw_table[
|
1432
|
-
VALUE kw_values[
|
1674
|
+
ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
|
1675
|
+
VALUE kw_values[3] = { Qundef, Qundef, Qundef };
|
1433
1676
|
rb_scan_args(argc, argv, ":", &kw_args);
|
1434
|
-
rb_get_kwargs(kw_args, kw_table, 3,
|
1677
|
+
rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
|
1435
1678
|
|
1436
1679
|
if (!RB_TYPE_P(kw_values[0], T_STRING)) {
|
1437
1680
|
rb_raise(rb_eArgError, "input_path must be a string");
|
@@ -1441,21 +1684,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
|
|
1441
1684
|
rb_raise(rb_eArgError, "output_path must be a string");
|
1442
1685
|
return Qnil;
|
1443
1686
|
}
|
1444
|
-
if (!
|
1445
|
-
rb_raise(rb_eArgError, "
|
1446
|
-
return Qnil;
|
1447
|
-
}
|
1448
|
-
if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
|
1449
|
-
rb_raise(rb_eArgError, "n_threads must be an integer");
|
1687
|
+
if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
|
1688
|
+
rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
|
1450
1689
|
return Qnil;
|
1451
1690
|
}
|
1452
1691
|
|
1453
1692
|
const char* input_path = StringValueCStr(kw_values[0]);
|
1454
1693
|
const char* output_path = StringValueCStr(kw_values[1]);
|
1455
|
-
|
1456
|
-
const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
|
1694
|
+
LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
|
1457
1695
|
|
1458
|
-
if (llama_model_quantize(input_path, output_path, (
|
1696
|
+
if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
|
1459
1697
|
rb_raise(rb_eRuntimeError, "Failed to quantize model");
|
1460
1698
|
return Qnil;
|
1461
1699
|
}
|
@@ -1505,6 +1743,8 @@ extern "C" void Init_llama_cpp(void) {
|
|
1505
1743
|
rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
|
1506
1744
|
rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
|
1507
1745
|
|
1746
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
|
1747
|
+
|
1508
1748
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
|
1509
1749
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
|
1510
1750
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
|
@@ -1513,6 +1753,15 @@ extern "C" void Init_llama_cpp(void) {
|
|
1513
1753
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
|
1514
1754
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
|
1515
1755
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
|
1756
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
|
1757
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
|
1758
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
|
1759
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
|
1760
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
|
1761
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
|
1762
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
|
1763
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
|
1764
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
|
1516
1765
|
|
1517
1766
|
std::stringstream ss_magic;
|
1518
1767
|
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
|