llama_cpp 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +15 -1
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +1343 -800
- data/ext/llama_cpp/src/ggml.h +12 -2
- data/ext/llama_cpp/src/llama.cpp +60 -16
- data/ext/llama_cpp/src/llama.h +5 -1
- data/ext/llama_cpp/src/llama_util.h +0 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
```diff
@@ -204,7 +204,9 @@ enum ggml_type {
     GGML_TYPE_F16 = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
-    GGML_TYPE_Q8_0 = 4,
+    GGML_TYPE_Q4_2 = 4,
+    GGML_TYPE_Q4_3 = 5,
+    GGML_TYPE_Q8_0 = 6,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
```
```diff
@@ -359,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
 
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
+bool ggml_is_quantized(enum ggml_type type);
+
 struct ggml_context * ggml_init(struct ggml_init_params params);
 void ggml_free(struct ggml_context * ctx);
 
@@ -626,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(
 
 // rotary position embedding
 // in-place, returns view(a)
-// if mode == 1, skip n_past elements
+// if mode & 1 == 1, skip n_past elements
+// if mode & 2 == 1, GPT-NeoX style
 // TODO: avoid creating a new tensor every time
 struct ggml_tensor * ggml_rope(
         struct ggml_context * ctx,
@@ -806,6 +811,10 @@ enum ggml_opt_result ggml_opt(
 
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
 //
 // system info
@@ -823,6 +832,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_cublas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
```
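The quantization entry points added above are plain C functions, so they can be exercised outside the gem. Below is a minimal C++ sketch (not taken from this package) that quantizes a dummy FP32 buffer through the new `ggml_quantize_chunk` dispatcher and probes a type with `ggml_is_quantized`; the buffer size, contents, and the throwaway `ggml_init` step are assumptions modelled on how llama.cpp's own quantize tool drives ggml.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>
#include "ggml.h"

int main() {
    // Assumption: initialize ggml's F16 tables with a throwaway context,
    // mirroring what llama.cpp's quantize tool does before quantizing.
    {
        struct ggml_init_params params = { 0, nullptr, false };
        ggml_free(ggml_init(params));
    }

    const int n = 32 * 512;                // dummy element count (multiple of the block size)
    std::vector<float> src(n, 0.5f);       // placeholder FP32 data
    std::vector<uint8_t> dst(n);           // quantized output; 4-bit formats need well under 1 byte/value
    std::vector<int64_t> hist(1 << 4, 0);  // 16-bucket histogram, as llama.cpp passes

    // New in this release: one dispatcher that quantizes the sub-range
    // [start, start + n) for any of the quantized ggml types.
    size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q4_2, src.data(), dst.data(), /*start*/ 0, n, hist.data());
    std::printf("quantized %d floats into %zu bytes\n", n, bytes);

    // Also new: a predicate for "is this tensor type a quantized format?".
    std::printf("GGML_TYPE_Q4_2 quantized: %d\n", ggml_is_quantized(GGML_TYPE_Q4_2) ? 1 : 0);
    return 0;
}
```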
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -478,6 +481,8 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +555,8 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -838,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         default: return "unknown, may not work";
     }
 }
@@ -1066,7 +1075,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
```
```diff
@@ -1566,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1582,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1600,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
+        // GG: uncomment this to keep the output layer in FP16
+        //if (tensor.name.rfind("output")) {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
```
```diff
@@ -1635,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            [11 removed lines; their contents are not shown in the source diff view]
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
```
```diff
@@ -1767,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
```
```diff
@@ -1955,7 +1999,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 base_t = dest_t;
             }
 
-            if (base_t->type
+            if (ggml_is_quantized(base_t->type)) {
                 if (!warned) {
                     fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                                     "use a f16 or f32 base model with --lora-base\n", __func__);
```
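For readers skimming the large hunk above: the new multi-threaded quantization path hands out fixed-size chunks to workers from a shared counter guarded by a mutex, lets each thread accumulate into thread-local buffers, and merges the partial results back under the lock once the counter is exhausted. The sketch below isolates that pattern with a made-up workload (summing squares) in place of `ggml_quantize_chunk`; it illustrates the technique and is not code from the package.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    const size_t nelements  = 1000000;
    const size_t chunk_size = 32 * 512;
    std::vector<float> data(nelements, 0.5f);   // stand-in for the tensor's FP32 data

    std::mutex mutex;
    size_t counter = 0;   // next unclaimed element index, guarded by mutex
    double total   = 0.0; // merged result, guarded by mutex

    auto compute = [&]() {
        double local = 0.0;                      // thread-local partial result
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            size_t first = counter;
            counter += chunk_size;
            if (first >= nelements) {
                total += local;                  // fold the partial result in under the lock
                break;                           // lock released by the unique_lock destructor
            }
            lock.unlock();                       // heavy work happens outside the critical section
            size_t last = std::min(nelements, first + chunk_size);
            for (size_t i = first; i < last; ++i) {
                local += double(data[i]) * double(data[i]);
            }
        }
    };

    const unsigned nthread = std::max(2u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;
    for (unsigned t = 0; t + 1 < nthread; ++t) workers.emplace_back(compute);
    compute();                                   // the calling thread also participates
    for (auto & w : workers) w.join();

    std::printf("sum of squares = %.1f\n", total);
    return 0;
}
```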
data/ext/llama_cpp/src/llama.h
CHANGED
```diff
@@ -72,6 +72,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,12 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
```
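With the header change above, callers pass the thread count directly to the C API. A hedged usage sketch (the file paths are placeholders, not taken from this package):

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    const char * fname_inp = "models/7B/ggml-model-f16.bin";   // placeholder input path
    const char * fname_out = "models/7B/ggml-model-q4_2.bin";  // placeholder output path

    // New 4th argument: number of quantization threads; <= 0 means
    // "use std::thread::hardware_concurrency()" per the new comment above.
    int ret = llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_2, /*nthread*/ 0);
    if (ret != 0) {
        std::fprintf(stderr, "quantization failed (%d)\n", ret);
        return 1;
    }
    std::printf("wrote %s\n", fname_out);
    return 0;
}
```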
data/ext/llama_cpp/src/llama_util.h
CHANGED
```diff
@@ -202,7 +202,6 @@ struct llama_mmap {
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);
 
         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
```
data/lib/llama_cpp/version.rb
CHANGED
```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.5'
+  VERSION = '0.0.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-12b5900'
 end
```
data/sig/llama_cpp.rbs
CHANGED
```diff
@@ -5,6 +5,15 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -18,7 +27,7 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) ->
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
```
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp
```