llama_cpp 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
|
4
|
+
data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
|
7
|
+
data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,32 @@
|
|
1
|
+
## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
|
2
|
+
|
3
|
+
- Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
|
4
|
+
- Fix installation files for CUDA.
|
5
|
+
- Add metal config option:
|
6
|
+
```
|
7
|
+
$ gem install llama_cpp -- --with-metal
|
8
|
+
```
|
9
|
+
```ruby
|
10
|
+
require 'llama_cpp'
|
11
|
+
|
12
|
+
params = LLaMACpp::ContextParams.new
|
13
|
+
params.n_gpu_layers = 1
|
14
|
+
|
15
|
+
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
|
16
|
+
LLaMACpp.generate(context, 'Hello, world.')
|
17
|
+
```
|
18
|
+
|
19
|
+
**Breaking Changes**
|
20
|
+
|
21
|
+
- Add ModelQuantizeParams class.
|
22
|
+
- Change the argument of the `model_quantize` module function in LLaMACpp.
|
23
|
+
```ruby
|
24
|
+
require 'llama_cpp'
|
25
|
+
|
26
|
+
params = LLaMACpp::ModelQuantizeParams.new
|
27
|
+
LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
|
28
|
+
```
|
29
|
+
|
1
30
|
## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
|
2
31
|
|
3
32
|
- Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'mkmf'
|
4
|
+
require 'fileutils'
|
4
5
|
|
5
6
|
abort 'libstdc++ is not found.' unless have_library('stdc++')
|
6
7
|
|
@@ -36,17 +37,30 @@ if with_config('accelerate')
|
|
36
37
|
$CFLAGS << ' -DGGML_USE_ACCELERATE'
|
37
38
|
end
|
38
39
|
|
40
|
+
if with_config('metal')
|
41
|
+
$CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
|
42
|
+
$CXXFLAGS << ' -DGGML_USE_METAL'
|
43
|
+
$LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
|
44
|
+
$objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
|
45
|
+
end
|
46
|
+
|
39
47
|
if with_config('cublas')
|
40
48
|
$CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
|
49
|
+
$CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
|
41
50
|
$LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
|
42
51
|
$objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
|
43
52
|
end
|
44
53
|
|
45
54
|
if with_config('clblast')
|
46
55
|
abort 'libclblast is not found.' unless have_library('clblast')
|
47
|
-
abort 'libOpenCL is not found.' unless have_library('OpenCL')
|
48
56
|
|
49
57
|
$CFLAGS << ' -DGGML_USE_CLBLAST'
|
58
|
+
$CXXFLAGS << ' -DGGML_USE_CLBLAST'
|
59
|
+
if RUBY_PLATFORM.match?(/darwin/)
|
60
|
+
$LDFLAGS << ' -framework OpenCL'
|
61
|
+
else
|
62
|
+
abort 'libOpenCL is not found.' unless have_library('OpenCL')
|
63
|
+
end
|
50
64
|
end
|
51
65
|
|
52
66
|
UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
|
@@ -78,3 +92,14 @@ if with_config('cublas')
|
|
78
92
|
f.puts "\tnvcc -arch=native -c -o $@ $<"
|
79
93
|
end
|
80
94
|
end
|
95
|
+
|
96
|
+
if with_config('metal')
|
97
|
+
File.open('Makefile', 'a') do |f|
|
98
|
+
f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
|
99
|
+
f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
|
100
|
+
end
|
101
|
+
|
102
|
+
metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
|
103
|
+
dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
|
104
|
+
FileUtils.cp(metal_path, dest_path)
|
105
|
+
end
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
VALUE rb_mLLaMACpp;
|
5
5
|
VALUE rb_cLLaMAContext;
|
6
6
|
VALUE rb_cLLaMAContextParams;
|
7
|
+
VALUE rb_cLLaMAModelQuantizeParams;
|
7
8
|
VALUE rb_cLLaMATokenData;
|
8
9
|
VALUE rb_cLLaMATokenDataArray;
|
9
10
|
|
@@ -292,6 +293,13 @@ public:
|
|
292
293
|
// rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
|
293
294
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
|
294
295
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
|
296
|
+
rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
|
297
|
+
rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
|
298
|
+
rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
|
299
|
+
rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
|
300
|
+
rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
|
301
|
+
rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
|
302
|
+
rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
|
295
303
|
rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
|
296
304
|
rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
|
297
305
|
rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
|
@@ -329,6 +337,55 @@ private:
|
|
329
337
|
return INT2NUM(ptr->params.n_ctx);
|
330
338
|
};
|
331
339
|
|
340
|
+
// n_batch
|
341
|
+
static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
|
342
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
343
|
+
ptr->params.n_batch = NUM2INT(n_batch);
|
344
|
+
return INT2NUM(ptr->params.n_batch);
|
345
|
+
};
|
346
|
+
|
347
|
+
static VALUE _llama_context_params_get_n_batch(VALUE self) {
|
348
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
349
|
+
return INT2NUM(ptr->params.n_batch);
|
350
|
+
};
|
351
|
+
|
352
|
+
// n_gpu_layers
|
353
|
+
static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
|
354
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
355
|
+
ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
|
356
|
+
return INT2NUM(ptr->params.n_gpu_layers);
|
357
|
+
};
|
358
|
+
|
359
|
+
static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
|
360
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
361
|
+
return INT2NUM(ptr->params.n_gpu_layers);
|
362
|
+
};
|
363
|
+
|
364
|
+
// main_gpu
|
365
|
+
static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
|
366
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
367
|
+
ptr->params.main_gpu = NUM2INT(main_gpu);
|
368
|
+
return INT2NUM(ptr->params.main_gpu);
|
369
|
+
};
|
370
|
+
|
371
|
+
static VALUE _llama_context_params_get_main_gpu(VALUE self) {
|
372
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
373
|
+
return INT2NUM(ptr->params.main_gpu);
|
374
|
+
};
|
375
|
+
|
376
|
+
// tensor_split
|
377
|
+
static VALUE _llama_context_params_get_tensor_split(VALUE self) {
|
378
|
+
if (LLAMA_MAX_DEVICES < 1) {
|
379
|
+
return rb_ary_new();
|
380
|
+
}
|
381
|
+
VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
|
382
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
383
|
+
for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
|
384
|
+
rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
|
385
|
+
}
|
386
|
+
return ret;
|
387
|
+
};
|
388
|
+
|
332
389
|
// seed
|
333
390
|
static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
|
334
391
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
@@ -424,6 +481,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
|
|
424
481
|
RUBY_TYPED_FREE_IMMEDIATELY
|
425
482
|
};
|
426
483
|
|
484
|
+
class LLaMAModelQuantizeParamsWrapper {
|
485
|
+
public:
|
486
|
+
llama_model_quantize_params params;
|
487
|
+
|
488
|
+
LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
|
489
|
+
|
490
|
+
~LLaMAModelQuantizeParamsWrapper(){};
|
491
|
+
};
|
492
|
+
|
493
|
+
class RbLLaMAModelQuantizeParams {
|
494
|
+
public:
|
495
|
+
static VALUE llama_model_quantize_params_alloc(VALUE self) {
|
496
|
+
LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
|
497
|
+
new (ptr) LLaMAModelQuantizeParamsWrapper();
|
498
|
+
return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
|
499
|
+
};
|
500
|
+
|
501
|
+
static void llama_model_quantize_params_free(void* ptr) {
|
502
|
+
((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
|
503
|
+
ruby_xfree(ptr);
|
504
|
+
};
|
505
|
+
|
506
|
+
static size_t llama_model_quantize_params_size(const void* ptr) {
|
507
|
+
return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
|
508
|
+
};
|
509
|
+
|
510
|
+
static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
|
511
|
+
LLaMAModelQuantizeParamsWrapper* ptr;
|
512
|
+
TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
|
513
|
+
return ptr;
|
514
|
+
};
|
515
|
+
|
516
|
+
static void define_class(VALUE outer) {
|
517
|
+
rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
|
518
|
+
rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
|
519
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
|
520
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
|
521
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
|
522
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
|
523
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
|
524
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
|
525
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
|
526
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
|
527
|
+
};
|
528
|
+
|
529
|
+
private:
|
530
|
+
static const rb_data_type_t llama_model_quantize_params_type;
|
531
|
+
|
532
|
+
// n_thread
|
533
|
+
static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
|
534
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
535
|
+
ptr->params.nthread = NUM2INT(n_thread);
|
536
|
+
return INT2NUM(ptr->params.nthread);
|
537
|
+
};
|
538
|
+
|
539
|
+
static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
|
540
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
541
|
+
return INT2NUM(ptr->params.nthread);
|
542
|
+
};
|
543
|
+
|
544
|
+
// ftype
|
545
|
+
static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
|
546
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
547
|
+
ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
|
548
|
+
return INT2NUM(ptr->params.ftype);
|
549
|
+
};
|
550
|
+
|
551
|
+
static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
|
552
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
553
|
+
return INT2NUM(ptr->params.ftype);
|
554
|
+
};
|
555
|
+
|
556
|
+
// allow_requantize
|
557
|
+
static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
|
558
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
559
|
+
if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
|
560
|
+
ptr->params.allow_requantize = false;
|
561
|
+
} else {
|
562
|
+
ptr->params.allow_requantize = true;
|
563
|
+
}
|
564
|
+
return ptr->params.allow_requantize ? Qtrue : Qfalse;
|
565
|
+
};
|
566
|
+
|
567
|
+
static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
|
568
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
569
|
+
return ptr->params.allow_requantize ? Qtrue : Qfalse;
|
570
|
+
};
|
571
|
+
|
572
|
+
// quantize_output_tensor
|
573
|
+
static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
|
574
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
575
|
+
if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
|
576
|
+
ptr->params.quantize_output_tensor = false;
|
577
|
+
} else {
|
578
|
+
ptr->params.quantize_output_tensor = true;
|
579
|
+
}
|
580
|
+
return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
|
581
|
+
};
|
582
|
+
|
583
|
+
static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
|
584
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
585
|
+
return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
|
586
|
+
};
|
587
|
+
};
|
588
|
+
|
589
|
+
const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
|
590
|
+
"RbLLaMAModelQuantizeParams",
|
591
|
+
{ NULL,
|
592
|
+
RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
|
593
|
+
RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
|
594
|
+
NULL,
|
595
|
+
NULL,
|
596
|
+
RUBY_TYPED_FREE_IMMEDIATELY
|
597
|
+
};
|
598
|
+
|
427
599
|
class LLaMAContextWrapper {
|
428
600
|
public:
|
429
601
|
struct llama_context* ctx;
|
@@ -465,6 +637,7 @@ public:
|
|
465
637
|
rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
|
466
638
|
rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
|
467
639
|
rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
|
640
|
+
rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
|
468
641
|
rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
|
469
642
|
rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
|
470
643
|
rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
|
@@ -517,7 +690,7 @@ private:
|
|
517
690
|
return Qnil;
|
518
691
|
}
|
519
692
|
if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
|
520
|
-
rb_raise(rb_eArgError, "params must be a
|
693
|
+
rb_raise(rb_eArgError, "params must be a ContextParams");
|
521
694
|
return Qnil;
|
522
695
|
}
|
523
696
|
|
@@ -599,6 +772,24 @@ private:
|
|
599
772
|
return Qnil;
|
600
773
|
};
|
601
774
|
|
775
|
+
static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
|
776
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
777
|
+
if (ptr->ctx == NULL) {
|
778
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
779
|
+
return Qnil;
|
780
|
+
}
|
781
|
+
if (!RB_TYPE_P(fname_, T_STRING)) {
|
782
|
+
rb_raise(rb_eArgError, "fname must be a string");
|
783
|
+
return Qnil;
|
784
|
+
}
|
785
|
+
const char* fname = StringValueCStr(fname_);
|
786
|
+
if (llama_eval_export(ptr->ctx, fname) != 0) {
|
787
|
+
return Qfalse;
|
788
|
+
}
|
789
|
+
RB_GC_GUARD(fname_);
|
790
|
+
return Qtrue;
|
791
|
+
};
|
792
|
+
|
602
793
|
static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
|
603
794
|
VALUE kw_args = Qnil;
|
604
795
|
ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
|
@@ -1428,10 +1619,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
|
|
1428
1619
|
|
1429
1620
|
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
|
1430
1621
|
VALUE kw_args = Qnil;
|
1431
|
-
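ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };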
|
1432
|
-
VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
|
1622
|
+
ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
|
1623
|
+
VALUE kw_values[3] = { Qundef, Qundef, Qundef };
|
1433
1624
|
rb_scan_args(argc, argv, ":", &kw_args);
|
1434
|
-
rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
|
1625
|
+
rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
|
1435
1626
|
|
1436
1627
|
if (!RB_TYPE_P(kw_values[0], T_STRING)) {
|
1437
1628
|
rb_raise(rb_eArgError, "input_path must be a string");
|
@@ -1441,21 +1632,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
|
|
1441
1632
|
rb_raise(rb_eArgError, "output_path must be a string");
|
1442
1633
|
return Qnil;
|
1443
1634
|
}
|
1444
|
-
if (!
|
1445
|
-
rb_raise(rb_eArgError, "
|
1446
|
-
return Qnil;
|
1447
|
-
}
|
1448
|
-
if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
|
1449
|
-
rb_raise(rb_eArgError, "n_threads must be an integer");
|
1635
|
+
if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
|
1636
|
+
rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
|
1450
1637
|
return Qnil;
|
1451
1638
|
}
|
1452
1639
|
|
1453
1640
|
const char* input_path = StringValueCStr(kw_values[0]);
|
1454
1641
|
const char* output_path = StringValueCStr(kw_values[1]);
|
1455
|
-
|
1456
|
-
const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
|
1642
|
+
LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
|
1457
1643
|
|
1458
|
-
if (llama_model_quantize(input_path, output_path, (
|
1644
|
+
if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
|
1459
1645
|
rb_raise(rb_eRuntimeError, "Failed to quantize model");
|
1460
1646
|
return Qnil;
|
1461
1647
|
}
|
@@ -1505,6 +1691,8 @@ extern "C" void Init_llama_cpp(void) {
|
|
1505
1691
|
rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
|
1506
1692
|
rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
|
1507
1693
|
|
1694
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
|
1695
|
+
|
1508
1696
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
|
1509
1697
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
|
1510
1698
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
|
@@ -1513,6 +1701,15 @@ extern "C" void Init_llama_cpp(void) {
|
|
1513
1701
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
|
1514
1702
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
|
1515
1703
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
|
1704
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
|
1705
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
|
1706
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
|
1707
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
|
1708
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
|
1709
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
|
1710
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
|
1711
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
|
1712
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
|
1516
1713
|
|
1517
1714
|
std::stringstream ss_magic;
|
1518
1715
|
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
|