llama_cpp 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +106 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +2038 -895
- data/ext/llama_cpp/src/ggml.h +21 -1
- data/ext/llama_cpp/src/llama.cpp +376 -62
- data/ext/llama_cpp/src/llama.h +17 -1
- data/ext/llama_cpp/src/llama_util.h +22 -16
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +13 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
```diff
@@ -204,6 +204,9 @@ enum ggml_type {
     GGML_TYPE_F16  = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q4_2 = 4,
+    GGML_TYPE_Q4_3 = 5,
+    GGML_TYPE_Q8_0 = 6,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
```
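The new enum values register three additional quantization formats (Q4_2, Q4_3, Q8_0) with ggml. A minimal sketch, assuming only the declarations in this header, that prints the name ggml reports for each of them via the existing `ggml_type_name()`:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    // The three type ids added in this release; the names come from ggml itself.
    const enum ggml_type new_types[] = { GGML_TYPE_Q4_2, GGML_TYPE_Q4_3, GGML_TYPE_Q8_0 };
    for (enum ggml_type t : new_types) {
        printf("type %d -> %s\n", (int) t, ggml_type_name(t));
    }
    return 0;
}
```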
```diff
@@ -358,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
 
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
+bool ggml_is_quantized(enum ggml_type type);
+
 struct ggml_context * ggml_init(struct ggml_init_params params);
 void ggml_free(struct ggml_context * ctx);
 
```
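`ggml_is_quantized()` gives callers a single predicate for "is this one of the block-quantized types" instead of switching over the enum. A minimal sketch of how downstream code can use it (the helper name is illustrative):

```cpp
#include "ggml.h"

// Illustrative helper: true for the Q4_x/Q8_0 block formats,
// false for F16/F32 and the plain integer types.
static bool needs_dequantize(const struct ggml_tensor * t) {
    return ggml_is_quantized(t->type);
}
```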
```diff
@@ -429,6 +434,12 @@ struct ggml_tensor * ggml_add(
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
+
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
```
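`ggml_add_inplace()` accumulates `b` into `a` and returns a view of `a`, avoiding the extra result tensor that `ggml_add()` allocates; the LoRA code added to llama.cpp below uses it to patch weights in place. A minimal sketch of the distinction (the wrapper name is illustrative):

```cpp
#include "ggml.h"

// Illustrative: accumulate `delta` into `acc` without allocating a new tensor.
// ggml_add(ctx, a, b) would instead produce a fresh result tensor.
struct ggml_tensor * accumulate(struct ggml_context * ctx,
                                struct ggml_tensor * acc,
                                struct ggml_tensor * delta) {
    return ggml_add_inplace(ctx, acc, delta);
}
```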
```diff
@@ -619,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(
 
 // rotary position embedding
 // in-place, returns view(a)
-// if mode == 1, skip n_past elements
+// if mode & 1 == 1, skip n_past elements
+// if mode & 2 == 1, GPT-NeoX style
 // TODO: avoid creating a new tensor every time
 struct ggml_tensor * ggml_rope(
         struct ggml_context * ctx,
```
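The `mode` argument of `ggml_rope()` is now documented as a bit field rather than a plain value. A small sketch of how the bits compose, with the constants named here only for illustration (ggml.h does not define them):

```cpp
// Values taken from the comments above; the names are illustrative only.
enum {
    ROPE_MODE_SKIP_N_PAST = 1 << 0,  // bit 0: skip n_past elements
    ROPE_MODE_GPT_NEOX    = 1 << 1,  // bit 1: GPT-NeoX style rotation
};

// e.g. a GPT-NeoX style model would pass mode = ROPE_MODE_GPT_NEOX,
// optionally OR-ed with ROPE_MODE_SKIP_N_PAST.
```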
```diff
@@ -799,6 +811,10 @@ enum ggml_opt_result ggml_opt(
 
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
 //
 // system info
```
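`ggml_quantize_chunk()` is the new type-generic entry point: it quantizes `n` elements of `src` starting at element `start` into `dst`, updates the 16-bucket histogram, and returns the number of bytes written. A minimal sketch, assuming `src` and `dst` are caller-provided buffers of adequate size:

```cpp
#include <algorithm>
#include <vector>
#include "ggml.h"

// Quantize a float buffer in fixed-size chunks through the generic entry point.
size_t quantize_all(enum ggml_type type, const float * src, void * dst, int n) {
    std::vector<int64_t> hist(1 << 4, 0);  // 16-bucket histogram, as used in llama.cpp
    size_t bytes = 0;
    const int chunk = 32 * 512;
    for (int start = 0; start < n; start += chunk) {
        const int count = std::min(chunk, n - start);
        bytes += ggml_quantize_chunk(type, src, dst, start, count, hist.data());
    }
    return bytes;
}
```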
```diff
@@ -807,6 +823,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_avx512_vbmi(void);
+int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
```
```diff
@@ -814,6 +832,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_cublas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
```
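Together with the two AVX-512 sub-feature probes above, `ggml_cpu_has_cublas()` lets callers tell a cuBLAS-enabled build apart from a CPU BLAS one (llama.cpp uses it below when picking a thread count). A minimal logging sketch, assuming only these declarations:

```cpp
#include <cstdio>
#include "ggml.h"

void log_backend_features(void) {
    printf("AVX512_VBMI = %d, AVX512_VNNI = %d, BLAS = %d, cuBLAS = %d\n",
           ggml_cpu_has_avx512_vbmi(),
           ggml_cpu_has_avx512_vnni(),
           ggml_cpu_has_blas(),
           ggml_cpu_has_cublas());
}
```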
```diff
@@ -836,6 +855,7 @@ typedef struct {
     dequantize_row_q_t dequantize_row_q;
     quantize_row_q_t   quantize_row_q;
     quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
     vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;
 
```
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif
 
 #include "llama_util.h"
```
```diff
@@ -9,6 +11,7 @@
 #include "ggml.h"
 
 #include <array>
+#include <ctime>
 #include <cinttypes>
 #include <fstream>
 #include <random>
```
```diff
@@ -21,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
```
```diff
@@ -41,35 +47,51 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 // needs modifications in ggml
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_7B,    512ull * MB },
-    { MODEL_13B,   512ull * MB },
-    { MODEL_30B,   512ull * MB },
-    { MODEL_65B,   512ull * MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_7B,    512ull * MB },
-    { MODEL_13B,   512ull * MB },
-    { MODEL_30B,   512ull * MB },
-    { MODEL_65B,   512ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
 };
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_7B,   1026ull * MB },
-    { MODEL_13B,  1608ull * MB },
-    { MODEL_30B,  3124ull * MB },
-    { MODEL_65B,  5120ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B,   1026ull * MB },
+        { MODEL_13B,  1608ull * MB },
+        { MODEL_30B,  3124ull * MB },
+        { MODEL_65B,  5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
 };
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,   768ull * MB },
-    { MODEL_13B, 1024ull * MB },
-    { MODEL_30B, 1280ull * MB },
-    { MODEL_65B, 1536ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B,   768ull * MB },
+        { MODEL_13B, 1024ull * MB },
+        { MODEL_30B, 1280ull * MB },
+        { MODEL_65B, 1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
 };
 
 // default hparams (LLaMA 7B)
```
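Turning the `MEM_REQ_*` globals into functions that return a reference to a function-local static is the construct-on-first-use idiom: the maps are built on the first call instead of during static initialization, which sidesteps static-initialization-order problems and global constructors running at library load time. The same idiom in isolation (names and sizes here are illustrative):

```cpp
#include <cstddef>
#include <map>

enum example_model { MODEL_SMALL, MODEL_LARGE };  // stand-in for the real e_model

static const std::map<example_model, size_t> & scratch_sizes() {
    // Built on first use, not at program startup.
    static std::map<example_model, size_t> sizes = {
        { MODEL_SMALL, 512u * 1024u * 1024u },
        { MODEL_LARGE, 512u * 1024u * 1024u },
    };
    return sizes;
}

// usage: const size_t need = scratch_sizes().at(MODEL_SMALL);
```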
```diff
@@ -261,12 +283,12 @@ static size_t checked_div(size_t a, size_t b) {
 }
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
-    std::string ret = std::to_string(ne.at(0));
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
-        ret += " x " + std::to_string(ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
     }
-
-    return ret;
+    return buf;
 }
 
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
```
```diff
@@ -459,6 +481,8 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
```
```diff
@@ -531,6 +555,8 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: LLAMA_ASSERT(false);
         }
```
```diff
@@ -616,6 +642,7 @@ struct llama_model_loader {
             throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
+
         return get_tensor_for(lt);
     }
 
```
```diff
@@ -818,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         default: return "unknown, may not work";
     }
 }
```
```diff
@@ -898,13 +927,13 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size +
-            MEM_REQ_SCRATCH0.at(model.type) +
-            MEM_REQ_SCRATCH1.at(model.type) +
-            MEM_REQ_EVAL.at(model.type);
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF.at(model.type);
+            scale*MEM_REQ_KV_SELF().at(model.type);
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
```
```diff
@@ -941,8 +970,8 @@ static void llama_model_load_internal(
     ml->ggml_ctx = ctx;
 
     model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-    model.norm           = ml->get_tensor("norm.weight",   {n_embd});
-    model.output         = ml->get_tensor("output.weight", {n_embd, n_vocab});
+    model.norm   = ml->get_tensor("norm.weight",   {n_embd});
+    model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
 
     model.layers.resize(n_layer);
     for (uint32_t i = 0; i < n_layer; ++i) {
```
```diff
@@ -1046,7 +1075,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
```
```diff
@@ -1546,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
```
```diff
@@ -1562,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
```
```diff
@@ -1569,7 +1607,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);
 
-        printf("[%4zu/%4zu] %36s - %s, type = %6s, ",
+        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
```
```diff
@@ -1580,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
+        // GG: uncomment this to keep the output layer in FP16
+        //if (tensor.name.rfind("output")) {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
```
```diff
@@ -1615,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default: LLAMA_ASSERT(false);
-            }
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
```
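The new code above replaces the per-type switch with `ggml_quantize_chunk()` and spreads the work over `nthread_use` threads: a mutex-guarded counter hands out 32·512-element chunks, each worker accumulates a private histogram and byte count, and the worker merges its results under the lock once the counter runs past the tensor. A stripped-down sketch of that pattern, with `quantize_range` standing in for `ggml_quantize_chunk` and all names illustrative:

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

// quantize_range(first, count) processes `count` elements starting at `first`
// and returns the number of output bytes.
size_t parallel_chunks(size_t nelements, size_t chunk_size, int nthread,
                       const std::function<size_t(size_t, size_t)> & quantize_range) {
    std::mutex mutex;
    size_t counter = 0;  // next element index to hand out
    size_t total   = 0;  // merged output size

    auto worker = [&]() {
        size_t local = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            const size_t first = counter;
            counter += chunk_size;
            if (first >= nelements) {
                total += local;  // merge this worker's result while still holding the lock
                break;
            }
            lock.unlock();
            local += quantize_range(first, std::min(chunk_size, nelements - first));
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(worker);
    worker();  // the calling thread participates as well
    for (auto & t : workers) t.join();
    return total;
}
```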
```diff
@@ -1731,10 +1794,10 @@ struct llama_context * llama_init_from_file(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
     return ctx;
```
```diff
@@ -1747,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
```
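`llama_model_quantize()` now takes an `nthread` argument; passing a value of 0 or less makes the library fall back to `std::thread::hardware_concurrency()`, as shown earlier. A minimal caller sketch, assuming the llama.h shipped in this version; the file paths are placeholders:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    const int rc = llama_model_quantize(
        "models/7B/ggml-model-f16.bin",   // placeholder input path
        "models/7B/ggml-model-q4_2.bin",  // placeholder output path
        LLAMA_FTYPE_MOSTLY_Q4_2,
        /*nthread*/ 0);                   // 0 -> use all hardware threads
    if (rc != 0) {
        fprintf(stderr, "quantization failed\n");
    }
    return rc;
}
```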
```diff
@@ -1757,6 +1821,254 @@ int llama_model_quantize(
     }
 }
 
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    auto & model = ctx->model;
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    auto fin = std::ifstream(path_lora, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+        return 1;
+    }
+
+    // verify magic and version
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != 'ggla') {
+            fprintf(stderr, "%s: bad file magic\n", __func__);
+            return 1;
+        }
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r;
+    int32_t lora_alpha;
+    fin.read((char *) &lora_r, sizeof(lora_r));
+    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    float scaling = (float)lora_alpha / (float)lora_r;
+
+    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+    // create a temporary ggml context to store the lora tensors
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    struct ggml_init_params params;
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
+    params.no_alloc   = false;
+
+    ggml_context * lora_ctx = ggml_init(params);
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+    // create a name -> tensor map of the model to accelerate lookups
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (auto & kv: model.tensors_by_name) {
+        model_tensors.insert(kv);
+    }
+
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should in llama_model_loader
+        if (model_loader->use_mmap) {
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        }
+    }
+
+    // read tensors and apply
+    bool warned = false;
+    int n_tensors = 0;
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ftype;
+
+        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+        if (fin.eof()) {
+            break;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+
+        std::string name(length, 0);
+        fin.read(&name[0], length);
+
+        // check for lora suffix and get the type of tensor
+        const std::string lora_suffix = ".lora";
+        size_t pos = name.rfind(lora_suffix);
+        if (pos == std::string::npos) {
+            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        std::string lora_type = name.substr(pos + lora_suffix.length());
+        std::string base_name = name;
+        base_name.erase(pos);
+        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+            return 1;
+        }
+
+        // create ggml tensor
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32; break;
+            case 1: wtype = GGML_TYPE_F16; break;
+            default:
+                    {
+                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+                                __func__, ftype);
+                        return false;
+                    }
+        }
+        ggml_tensor* lora_tensor;
+        if (n_dims == 2) {
+            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+        }
+        else {
+            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        // load tensor data
+        size_t offset = fin.tellg();
+        size_t tensor_data_size = ggml_nbytes(lora_tensor);
+        offset = (offset + 31) & -32;
+        fin.seekg(offset);
+        fin.read((char*)lora_tensor->data, tensor_data_size);
+
+        lora_tensors[name] = lora_tensor;
+
+        // check if we have both A and B tensors and apply
+        if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
+            if (ggml_is_quantized(base_t->type)) {
+                if (!warned) {
+                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                            "use a f16 or f32 base model with --lora-base\n", __func__);
+                    warned = true;
+                }
+            }
+
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                        " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+                return 1;
+            }
+
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+            if (scaling != 1.0f) {
+                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+            }
+
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
+
+            struct ggml_cgraph gf = ggml_build_forward(r);
+            gf.n_threads = n_threads;
+            ggml_graph_compute(lora_ctx, &gf);
+
+            // we won't need these tensors again, reset the context to save memory
+            ggml_free(lora_ctx);
+            lora_ctx = ggml_init(params);
+            lora_tensors.clear();
+
+            n_tensors++;
+            if (n_tensors % 4 == 0)
+                fprintf(stderr, ".");
+        }
+    }
+
+    // TODO: this should be in a destructor, it will leak on failure
+    ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
```
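The new routine merges a low-rank adapter into the model weights as w = w + (alpha/r)·(B·A), optionally reading the original weights from a separate f16/f32 base model so the merge is not performed against already-quantized tensors. A minimal caller sketch of the public entry point, assuming the llama.h from this version; the paths are placeholders and the thread count is arbitrary:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    struct llama_context_params params = llama_context_default_params();
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        return 1;
    }

    // Supplying a non-quantized base model avoids merging against quantized weights.
    const int rc = llama_apply_lora_from_file(ctx, "lora/adapter.bin",
                                              "models/7B/ggml-model-f16.bin", 4);
    if (rc != 0) {
        fprintf(stderr, "failed to apply LoRA adapter\n");
    }
    llama_free(ctx);
    return rc;
}
```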
```diff
@@ -1914,18 +2226,20 @@ const char * llama_print_system_info(void) {
     static std::string s;
 
     s = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
+    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
+    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
+    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
+    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
+    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
+    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
+    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
+    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
 
     return s.c_str();
 }
```
|