llama_cpp 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
|
4
|
+
data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
|
7
|
+
data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,32 @@
|
|
1
|
+
## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
|
2
|
+
|
3
|
+
- Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
|
4
|
+
- Fix installation files for CUDA.
|
5
|
+
- Add metal config option:
|
6
|
+
```
|
7
|
+
$ gem install llama_cpp -- --with-metal
|
8
|
+
```
|
9
|
+
```ruby
|
10
|
+
require 'llama_cpp'
|
11
|
+
|
12
|
+
params = LLaMACpp::ContextParams.new
|
13
|
+
params.n_gpu_layers = 1
|
14
|
+
|
15
|
+
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
|
16
|
+
LLaMACpp.generate(context, 'Hello, world.')
|
17
|
+
```
|
18
|
+
|
19
|
+
**Breaking Changes**
|
20
|
+
|
21
|
+
- Add ModelQuantizeParams class.
|
22
|
+
- Change the argument of the `model_quantize` module function in LLaMACpp.
|
23
|
+
```ruby
|
24
|
+
require 'llama_cpp'
|
25
|
+
|
26
|
+
params = LLaMACpp::ModelQuantizeParams.new
|
27
|
+
LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
|
28
|
+
```
|
29
|
+
|
1
30
|
## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
|
2
31
|
|
3
32
|
- Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'mkmf'
|
4
|
+
require 'fileutils'
|
4
5
|
|
5
6
|
abort 'libstdc++ is not found.' unless have_library('stdc++')
|
6
7
|
|
@@ -36,17 +37,30 @@ if with_config('accelerate')
|
|
36
37
|
$CFLAGS << ' -DGGML_USE_ACCELERATE'
|
37
38
|
end
|
38
39
|
|
40
|
+
if with_config('metal')
|
41
|
+
$CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
|
42
|
+
$CXXFLAGS << ' -DGGML_USE_METAL'
|
43
|
+
$LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
|
44
|
+
$objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
|
45
|
+
end
|
46
|
+
|
39
47
|
if with_config('cublas')
|
40
48
|
$CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
|
49
|
+
$CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
|
41
50
|
$LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
|
42
51
|
$objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
|
43
52
|
end
|
44
53
|
|
45
54
|
if with_config('clblast')
|
46
55
|
abort 'libclblast is not found.' unless have_library('clblast')
|
47
|
-
abort 'libOpenCL is not found.' unless have_library('OpenCL')
|
48
56
|
|
49
57
|
$CFLAGS << ' -DGGML_USE_CLBLAST'
|
58
|
+
$CXXFLAGS << ' -DGGML_USE_CLBLAST'
|
59
|
+
if RUBY_PLATFORM.match?(/darwin/)
|
60
|
+
$LDFLAGS << ' -framework OpenCL'
|
61
|
+
else
|
62
|
+
abort 'libOpenCL is not found.' unless have_library('OpenCL')
|
63
|
+
end
|
50
64
|
end
|
51
65
|
|
52
66
|
UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
|
@@ -78,3 +92,14 @@ if with_config('cublas')
|
|
78
92
|
f.puts "\tnvcc -arch=native -c -o $@ $<"
|
79
93
|
end
|
80
94
|
end
|
95
|
+
|
96
|
+
if with_config('metal')
|
97
|
+
File.open('Makefile', 'a') do |f|
|
98
|
+
f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
|
99
|
+
f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
|
100
|
+
end
|
101
|
+
|
102
|
+
metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
|
103
|
+
dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
|
104
|
+
FileUtils.cp(metal_path, dest_path)
|
105
|
+
end
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
VALUE rb_mLLaMACpp;
|
5
5
|
VALUE rb_cLLaMAContext;
|
6
6
|
VALUE rb_cLLaMAContextParams;
|
7
|
+
VALUE rb_cLLaMAModelQuantizeParams;
|
7
8
|
VALUE rb_cLLaMATokenData;
|
8
9
|
VALUE rb_cLLaMATokenDataArray;
|
9
10
|
|
@@ -292,6 +293,13 @@ public:
|
|
292
293
|
// rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
|
293
294
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
|
294
295
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
|
296
|
+
rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
|
297
|
+
rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
|
298
|
+
rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
|
299
|
+
rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
|
300
|
+
rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
|
301
|
+
rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
|
302
|
+
rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
|
295
303
|
rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
|
296
304
|
rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
|
297
305
|
rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
|
@@ -329,6 +337,55 @@ private:
|
|
329
337
|
return INT2NUM(ptr->params.n_ctx);
|
330
338
|
};
|
331
339
|
|
340
|
+
// n_batch
|
341
|
+
static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
|
342
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
343
|
+
ptr->params.n_batch = NUM2INT(n_batch);
|
344
|
+
return INT2NUM(ptr->params.n_batch);
|
345
|
+
};
|
346
|
+
|
347
|
+
static VALUE _llama_context_params_get_n_batch(VALUE self) {
|
348
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
349
|
+
return INT2NUM(ptr->params.n_batch);
|
350
|
+
};
|
351
|
+
|
352
|
+
// n_gpu_layers
|
353
|
+
static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
|
354
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
355
|
+
ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
|
356
|
+
return INT2NUM(ptr->params.n_gpu_layers);
|
357
|
+
};
|
358
|
+
|
359
|
+
static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
|
360
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
361
|
+
return INT2NUM(ptr->params.n_gpu_layers);
|
362
|
+
};
|
363
|
+
|
364
|
+
// main_gpu
|
365
|
+
static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
|
366
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
367
|
+
ptr->params.main_gpu = NUM2INT(main_gpu);
|
368
|
+
return INT2NUM(ptr->params.main_gpu);
|
369
|
+
};
|
370
|
+
|
371
|
+
static VALUE _llama_context_params_get_main_gpu(VALUE self) {
|
372
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
373
|
+
return INT2NUM(ptr->params.main_gpu);
|
374
|
+
};
|
375
|
+
|
376
|
+
// tensor_split
|
377
|
+
static VALUE _llama_context_params_get_tensor_split(VALUE self) {
|
378
|
+
if (LLAMA_MAX_DEVICES < 1) {
|
379
|
+
return rb_ary_new();
|
380
|
+
}
|
381
|
+
VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
|
382
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
383
|
+
for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
|
384
|
+
rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
|
385
|
+
}
|
386
|
+
return ret;
|
387
|
+
};
|
388
|
+
|
332
389
|
// seed
|
333
390
|
static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
|
334
391
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
@@ -424,6 +481,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
|
|
424
481
|
RUBY_TYPED_FREE_IMMEDIATELY
|
425
482
|
};
|
426
483
|
|
484
|
+
class LLaMAModelQuantizeParamsWrapper {
|
485
|
+
public:
|
486
|
+
llama_model_quantize_params params;
|
487
|
+
|
488
|
+
LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
|
489
|
+
|
490
|
+
~LLaMAModelQuantizeParamsWrapper(){};
|
491
|
+
};
|
492
|
+
|
493
|
+
class RbLLaMAModelQuantizeParams {
|
494
|
+
public:
|
495
|
+
static VALUE llama_model_quantize_params_alloc(VALUE self) {
|
496
|
+
LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
|
497
|
+
new (ptr) LLaMAModelQuantizeParamsWrapper();
|
498
|
+
return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
|
499
|
+
};
|
500
|
+
|
501
|
+
static void llama_model_quantize_params_free(void* ptr) {
|
502
|
+
((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
|
503
|
+
ruby_xfree(ptr);
|
504
|
+
};
|
505
|
+
|
506
|
+
static size_t llama_model_quantize_params_size(const void* ptr) {
|
507
|
+
return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
|
508
|
+
};
|
509
|
+
|
510
|
+
static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
|
511
|
+
LLaMAModelQuantizeParamsWrapper* ptr;
|
512
|
+
TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
|
513
|
+
return ptr;
|
514
|
+
};
|
515
|
+
|
516
|
+
static void define_class(VALUE outer) {
|
517
|
+
rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
|
518
|
+
rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
|
519
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
|
520
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
|
521
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
|
522
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
|
523
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
|
524
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
|
525
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
|
526
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
|
527
|
+
};
|
528
|
+
|
529
|
+
private:
|
530
|
+
static const rb_data_type_t llama_model_quantize_params_type;
|
531
|
+
|
532
|
+
// n_thread
|
533
|
+
static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
|
534
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
535
|
+
ptr->params.nthread = NUM2INT(n_thread);
|
536
|
+
return INT2NUM(ptr->params.nthread);
|
537
|
+
};
|
538
|
+
|
539
|
+
static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
|
540
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
541
|
+
return INT2NUM(ptr->params.nthread);
|
542
|
+
};
|
543
|
+
|
544
|
+
// ftype
|
545
|
+
static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
|
546
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
547
|
+
ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
|
548
|
+
return INT2NUM(ptr->params.ftype);
|
549
|
+
};
|
550
|
+
|
551
|
+
static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
|
552
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
553
|
+
return INT2NUM(ptr->params.ftype);
|
554
|
+
};
|
555
|
+
|
556
|
+
// allow_requantize
|
557
|
+
static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
|
558
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
559
|
+
if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
|
560
|
+
ptr->params.allow_requantize = false;
|
561
|
+
} else {
|
562
|
+
ptr->params.allow_requantize = true;
|
563
|
+
}
|
564
|
+
return ptr->params.allow_requantize ? Qtrue : Qfalse;
|
565
|
+
};
|
566
|
+
|
567
|
+
static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
|
568
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
569
|
+
return ptr->params.allow_requantize ? Qtrue : Qfalse;
|
570
|
+
};
|
571
|
+
|
572
|
+
// quantize_output_tensor
|
573
|
+
static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
|
574
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
575
|
+
if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
|
576
|
+
ptr->params.quantize_output_tensor = false;
|
577
|
+
} else {
|
578
|
+
ptr->params.quantize_output_tensor = true;
|
579
|
+
}
|
580
|
+
return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
|
581
|
+
};
|
582
|
+
|
583
|
+
static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
|
584
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
585
|
+
return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
|
586
|
+
};
|
587
|
+
};
|
588
|
+
|
589
|
+
const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
|
590
|
+
"RbLLaMAModelQuantizeParams",
|
591
|
+
{ NULL,
|
592
|
+
RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
|
593
|
+
RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
|
594
|
+
NULL,
|
595
|
+
NULL,
|
596
|
+
RUBY_TYPED_FREE_IMMEDIATELY
|
597
|
+
};
|
598
|
+
|
427
599
|
class LLaMAContextWrapper {
|
428
600
|
public:
|
429
601
|
struct llama_context* ctx;
|
@@ -465,6 +637,7 @@ public:
|
|
465
637
|
rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
|
466
638
|
rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
|
467
639
|
rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
|
640
|
+
rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
|
468
641
|
rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
|
469
642
|
rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
|
470
643
|
rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
|
@@ -517,7 +690,7 @@ private:
|
|
517
690
|
return Qnil;
|
518
691
|
}
|
519
692
|
if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
|
520
|
-
rb_raise(rb_eArgError, "params must be a
|
693
|
+
rb_raise(rb_eArgError, "params must be a ContextParams");
|
521
694
|
return Qnil;
|
522
695
|
}
|
523
696
|
|
@@ -599,6 +772,24 @@ private:
|
|
599
772
|
return Qnil;
|
600
773
|
};
|
601
774
|
|
775
|
+
static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
|
776
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
777
|
+
if (ptr->ctx == NULL) {
|
778
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
779
|
+
return Qnil;
|
780
|
+
}
|
781
|
+
if (!RB_TYPE_P(fname_, T_STRING)) {
|
782
|
+
rb_raise(rb_eArgError, "fname must be a string");
|
783
|
+
return Qnil;
|
784
|
+
}
|
785
|
+
const char* fname = StringValueCStr(fname_);
|
786
|
+
if (llama_eval_export(ptr->ctx, fname) != 0) {
|
787
|
+
return Qfalse;
|
788
|
+
}
|
789
|
+
RB_GC_GUARD(fname_);
|
790
|
+
return Qtrue;
|
791
|
+
};
|
792
|
+
|
602
793
|
static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
|
603
794
|
VALUE kw_args = Qnil;
|
604
795
|
ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
|
@@ -1428,10 +1619,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
|
|
1428
1619
|
|
1429
1620
|
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
|
1430
1621
|
VALUE kw_args = Qnil;
|
1431
|
-
ID kw_table[
|
1432
|
-
VALUE kw_values[
|
1622
|
+
ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
|
1623
|
+
VALUE kw_values[3] = { Qundef, Qundef, Qundef };
|
1433
1624
|
rb_scan_args(argc, argv, ":", &kw_args);
|
1434
|
-
rb_get_kwargs(kw_args, kw_table, 3,
|
1625
|
+
rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
|
1435
1626
|
|
1436
1627
|
if (!RB_TYPE_P(kw_values[0], T_STRING)) {
|
1437
1628
|
rb_raise(rb_eArgError, "input_path must be a string");
|
@@ -1441,21 +1632,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
|
|
1441
1632
|
rb_raise(rb_eArgError, "output_path must be a string");
|
1442
1633
|
return Qnil;
|
1443
1634
|
}
|
1444
|
-
if (!
|
1445
|
-
rb_raise(rb_eArgError, "
|
1446
|
-
return Qnil;
|
1447
|
-
}
|
1448
|
-
if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
|
1449
|
-
rb_raise(rb_eArgError, "n_threads must be an integer");
|
1635
|
+
if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
|
1636
|
+
rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
|
1450
1637
|
return Qnil;
|
1451
1638
|
}
|
1452
1639
|
|
1453
1640
|
const char* input_path = StringValueCStr(kw_values[0]);
|
1454
1641
|
const char* output_path = StringValueCStr(kw_values[1]);
|
1455
|
-
|
1456
|
-
const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
|
1642
|
+
LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
|
1457
1643
|
|
1458
|
-
if (llama_model_quantize(input_path, output_path, (
|
1644
|
+
if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
|
1459
1645
|
rb_raise(rb_eRuntimeError, "Failed to quantize model");
|
1460
1646
|
return Qnil;
|
1461
1647
|
}
|
@@ -1505,6 +1691,8 @@ extern "C" void Init_llama_cpp(void) {
|
|
1505
1691
|
rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
|
1506
1692
|
rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
|
1507
1693
|
|
1694
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
|
1695
|
+
|
1508
1696
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
|
1509
1697
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
|
1510
1698
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
|
@@ -1513,6 +1701,15 @@ extern "C" void Init_llama_cpp(void) {
|
|
1513
1701
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
|
1514
1702
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
|
1515
1703
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
|
1704
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
|
1705
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
|
1706
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
|
1707
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
|
1708
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
|
1709
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
|
1710
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
|
1711
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
|
1712
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
|
1516
1713
|
|
1517
1714
|
std::stringstream ss_magic;
|
1518
1715
|
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
|