llama_cpp 0.0.5 → 0.0.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +15 -1
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +1343 -800
- data/ext/llama_cpp/src/ggml.h +12 -2
- data/ext/llama_cpp/src/llama.cpp +60 -16
- data/ext/llama_cpp/src/llama.h +5 -1
- data/ext/llama_cpp/src/llama_util.h +0 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -204,7 +204,9 @@ enum ggml_type {
     GGML_TYPE_F16 = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
-
+    GGML_TYPE_Q4_2 = 4,
+    GGML_TYPE_Q4_3 = 5,
+    GGML_TYPE_Q8_0 = 6,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@@ -359,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
 
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
+bool ggml_is_quantized(enum ggml_type type);
+
 struct ggml_context * ggml_init(struct ggml_init_params params);
 void ggml_free(struct ggml_context * ctx);
 
@@ -626,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(
 
 // rotary position embedding
 // in-place, returns view(a)
-// if mode == 1, skip n_past elements
+// if mode & 1 == 1, skip n_past elements
+// if mode & 2 == 1, GPT-NeoX style
 // TODO: avoid creating a new tensor every time
 struct ggml_tensor * ggml_rope(
         struct ggml_context * ctx,
@@ -806,6 +811,10 @@ enum ggml_opt_result ggml_opt(
 
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
 //
 // system info
@@ -823,6 +832,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_cublas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
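The ggml.h hunks above expose the new Q4_2/Q4_3 tensor types, a ggml_is_quantized() predicate, and a type-generic ggml_quantize_chunk() entry point alongside the per-type quantizers. A minimal sketch of driving the chunk API directly (not code from the gem; the helper name and buffer sizing are assumptions, and the element count is assumed to be a multiple of the quantization block size):

    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    // Quantize a float buffer to Q4_2 in a single chunk and return the byte size written.
    size_t quantize_buffer_q4_2(const std::vector<float> & src, std::vector<uint8_t> & dst) {
        if (!ggml_is_quantized(GGML_TYPE_Q4_2)) {
            return 0;                               // defensive check using the new predicate
        }
        std::vector<int64_t> hist(1 << 4, 0);       // 16-bucket histogram, sized as llama.cpp allocates it
        dst.resize(src.size());                     // assumption: 1 byte per element is a safe upper bound for 4-bit formats
        // start = 0 and n = src.size() quantize the whole buffer in one call;
        // the multi-threaded path in llama.cpp instead hands out [start, start + n) ranges per worker.
        return ggml_quantize_chunk(GGML_TYPE_Q4_2, src.data(), dst.data(), 0, (int) src.size(), hist.data());
    }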
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -478,6 +481,8 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +555,8 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -838,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
             return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         default: return "unknown, may not work";
     }
 }
@@ -1066,7 +1075,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1566,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+       case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+       case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
        default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1582,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1600,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
+        // GG: uncomment this to keep the output layer in FP16
+        //if (tensor.name.rfind("output")) {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -1635,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-
-
-
-
-
-
-
-
-
-
-
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1767,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1955,7 +1999,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 base_t = dest_t;
             }
 
-            if (base_t->type
+            if (ggml_is_quantized(base_t->type)) {
                 if (!warned) {
                     fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                                     "use a f16 or f32 base model with --lora-base\n", __func__);
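The quantization loop above now splits each tensor into chunks of 32 * 512 elements: a mutex-guarded counter hands out the next chunk, every worker accumulates a thread-local histogram and byte count, and the partial results are folded back under the lock once the counter runs past the end of the tensor. A self-contained sketch of that work-sharing pattern with a toy workload (names and the workload are assumptions, not code from llama.cpp):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const size_t nelements  = 1 << 20;
        const size_t chunk_size = 32 * 512;         // same chunk size as the diff uses
        std::vector<float> data(nelements, 0.5f);

        std::mutex mutex;
        size_t counter = 0;                         // next element index to hand out
        double total   = 0.0;                       // merged result (stands in for new_size / hist_cur)

        auto compute = [&]() {
            double local = 0.0;                     // thread-local partial result
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                size_t first = counter; counter += chunk_size;
                if (first >= nelements) {           // no work left: merge under the lock and exit
                    total += local;
                    break;
                }
                lock.unlock();                      // compute outside the critical section
                size_t last = std::min(nelements, first + chunk_size);
                for (size_t i = first; i < last; ++i) {
                    local += data[i] * data[i];
                }
            }
        };

        int nthread = (int) std::thread::hardware_concurrency();
        if (nthread <= 0) nthread = 1;              // hardware_concurrency() may report 0; fall back to one thread
        std::vector<std::thread> workers;
        for (int it = 0; it < nthread - 1; ++it) workers.emplace_back(compute);
        compute();                                  // the calling thread participates as well
        for (auto & w : workers) w.join();

        std::printf("sum of squares = %f\n", total);
        return 0;
    }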
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -72,6 +72,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,12 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
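With the header change above, quantization callers now pass a thread count; a value of zero or less makes the library fall back to std::thread::hardware_concurrency(). A hedged usage sketch of the updated C API (the model paths are placeholders):

    #include <cstdio>
    #include "llama.h"

    int main() {
        // LLAMA_FTYPE_MOSTLY_Q4_2 selects the new Q4_2 output format;
        // nthread = 0 lets llama.cpp pick the hardware concurrency.
        const int rc = llama_model_quantize(
            "models/7B/ggml-model-f16.bin",         // placeholder input path
            "models/7B/ggml-model-q4_2.bin",        // placeholder output path
            LLAMA_FTYPE_MOSTLY_Q4_2,
            0);
        if (rc != 0) {
            std::fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }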
data/ext/llama_cpp/src/llama_util.h
CHANGED
@@ -202,7 +202,6 @@ struct llama_mmap {
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);
 
         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.5'
+  VERSION = '0.0.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-12b5900'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -5,6 +5,15 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -18,7 +27,7 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) ->
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp