llama_cpp 0.0.4 → 0.0.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +106 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +2038 -895
- data/ext/llama_cpp/src/ggml.h +21 -1
- data/ext/llama_cpp/src/llama.cpp +376 -62
- data/ext/llama_cpp/src/llama.h +17 -1
- data/ext/llama_cpp/src/llama_util.h +22 -16
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +13 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -204,6 +204,9 @@ enum ggml_type {
     GGML_TYPE_F16  = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q4_2 = 4,
+    GGML_TYPE_Q4_3 = 5,
+    GGML_TYPE_Q8_0 = 6,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@@ -358,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
 
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
+bool   ggml_is_quantized(enum ggml_type type);
+
 struct ggml_context * ggml_init(struct ggml_init_params params);
 void ggml_free(struct ggml_context * ctx);
 
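The new `ggml_is_quantized` predicate lets callers test for any quantized element type instead of enumerating every `GGML_TYPE_Q4_x` value (the LoRA code in llama.cpp further down uses it exactly this way). A minimal sketch of how calling code might use it; `needs_dequantize` is a hypothetical helper, not part of the diff:

    #include "ggml.h"

    // Sketch: branch on the new predicate rather than listing
    // GGML_TYPE_Q4_0 / Q4_1 / Q4_2 / Q4_3 / Q8_0 case by case.
    static bool needs_dequantize(const struct ggml_tensor * t) {
        return ggml_is_quantized(t->type);
    }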
@@ -429,6 +434,12 @@ struct ggml_tensor * ggml_add(
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
+
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
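`ggml_add_inplace` mirrors `ggml_add` but accumulates into `a`'s buffer instead of allocating a fresh result tensor; the LoRA loader below relies on this when the patch target and the base weights are the same tensor. A hedged sketch of that choice, assuming the usual ggml in-place semantics (`apply_delta` is hypothetical):

    #include "ggml.h"

    // Sketch: pick the in-place variant when the result may overwrite w,
    // and the allocating variant when w must stay untouched.
    static struct ggml_tensor * apply_delta(struct ggml_context * ctx,
                                            struct ggml_tensor * w,
                                            struct ggml_tensor * delta,
                                            bool in_place) {
        return in_place ? ggml_add_inplace(ctx, w, delta)
                        : ggml_add(ctx, w, delta);
    }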
@@ -619,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(
 
 // rotary position embedding
 // in-place, returns view(a)
-// if mode == 1, skip n_past elements
+// if mode & 1 == 1, skip n_past elements
+// if mode & 2 == 1, GPT-NeoX style
 // TODO: avoid creating a new tensor every time
 struct ggml_tensor * ggml_rope(
         struct ggml_context * ctx,
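The `mode` argument of `ggml_rope` is now documented as a bit field rather than a single value: bit 0 controls skipping the first `n_past` positions, and bit 1 selects the GPT-NeoX rotation layout. A small sketch of decoding it (`describe_rope_mode` is hypothetical):

    #include <cstdio>

    // Sketch: decode the ggml_rope mode bits described above.
    static void describe_rope_mode(int mode) {
        printf("skip n_past: %s, GPT-NeoX style: %s\n",
               (mode & 1) ? "yes" : "no",
               (mode & 2) ? "yes" : "no");
    }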
@@ -799,6 +811,10 @@ enum ggml_opt_result ggml_opt(
 
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
 //
 // system info
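`ggml_quantize_chunk` dispatches on the target type and quantizes an arbitrary `[start, start + n)` window of the source buffer, which is what makes the multi-threaded quantization in llama.cpp below possible. A hedged single-call sketch (`quantize_all` is hypothetical; the `1 << 4` histogram size matches what llama.cpp passes):

    #include <vector>
    #include "ggml.h"

    // Sketch: quantize a whole f32 buffer in one call instead of
    // selecting ggml_quantize_q4_x by hand.
    static size_t quantize_all(enum ggml_type type, const float * src,
                               void * dst, int nelements) {
        std::vector<int64_t> hist(1 << 4, 0);  // per-bucket histogram, as in llama.cpp
        return ggml_quantize_chunk(type, src, dst, /*start=*/0, nelements, hist.data());
    }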
@@ -807,6 +823,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_avx512_vbmi(void);
+int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
@@ -814,6 +832,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_cublas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
@@ -836,6 +855,7 @@ typedef struct {
     dequantize_row_q_t dequantize_row_q;
     quantize_row_q_t   quantize_row_q;
     quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
     vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;
 
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif
 
 #include "llama_util.h"
@@ -9,6 +11,7 @@
 #include "ggml.h"
 
 #include <array>
+#include <ctime>
 #include <cinttypes>
 #include <fstream>
 #include <random>
@@ -21,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -41,35 +47,51 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_7B,    512ull * MB },
-    { MODEL_13B,   512ull * MB },
-    { MODEL_30B,   512ull * MB },
-    { MODEL_65B,   512ull * MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_7B,    512ull * MB },
-    { MODEL_13B,   512ull * MB },
-    { MODEL_30B,   512ull * MB },
-    { MODEL_65B,   512ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
 };
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_7B,   1026ull * MB },
-    { MODEL_13B,  1608ull * MB },
-    { MODEL_30B,  3124ull * MB },
-    { MODEL_65B,  5120ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B,   1026ull * MB },
+        { MODEL_13B,  1608ull * MB },
+        { MODEL_30B,  3124ull * MB },
+        { MODEL_65B,  5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
 };
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,    768ull * MB },
-    { MODEL_13B,  1024ull * MB },
-    { MODEL_30B,  1280ull * MB },
-    { MODEL_65B,  1536ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B,    768ull * MB },
+        { MODEL_13B,  1024ull * MB },
+        { MODEL_30B,  1280ull * MB },
+        { MODEL_65B,  1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
 };
 
 // default hparams (LLaMA 7B)
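This change replaces namespace-scope `std::map` constants with construct-on-first-use accessors: a function-local `static` is guaranteed to be initialized the first time control passes through it, which sidesteps the C++ static initialization order problem the plain globals could hit. A generic sketch of the idiom (names hypothetical):

    #include <map>

    // Sketch: the table is built on the first call, never before,
    // regardless of translation-unit initialization order.
    static const std::map<int, int> & lookup_table() {
        static std::map<int, int> table = { { 7, 512 }, { 13, 512 } };
        return table;
    }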
@@ -261,12 +283,12 @@ static size_t checked_div(size_t a, size_t b) {
 }
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
-    std::string ret = std::to_string(ne.at(0));
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
-        ret += " x " + std::to_string(ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
     }
-
-    return ret;
+    return buf;
 }
 
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -459,6 +481,8 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -531,6 +555,8 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -616,6 +642,7 @@ struct llama_model_loader {
             throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
+
        return get_tensor_for(lt);
     }
 
@@ -818,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         default:                      return "unknown, may not work";
     }
 }
@@ -898,13 +927,13 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size +
-            MEM_REQ_SCRATCH0.at(model.type) +
-            MEM_REQ_SCRATCH1.at(model.type) +
-            MEM_REQ_EVAL.at(model.type);
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF.at(model.type);
+            scale*MEM_REQ_KV_SELF().at(model.type);
 
         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +970,8 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
-        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});
+        model.norm   = ml->get_tensor("norm.weight",   {n_embd});
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1046,7 +1075,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1546,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
        default: throw format("invalid output file type %d\n", ftype);
    };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1562,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1569,7 +1607,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);
 
-        printf("[%
+        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
@@ -1580,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
+        // GG: uncomment this to keep the output layer in FP16
+        //if (tensor.name.rfind("output")) {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -1615,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
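The new quantization loop farms fixed-size chunks out to worker threads through a mutex-guarded counter: each worker grabs the next chunk index under the lock, quantizes it unlocked into thread-local accumulators, and folds its histogram and size totals back in once the counter runs past the end. A self-contained sketch of the same pattern with the quantization call stubbed out (all names hypothetical):

    #include <algorithm>
    #include <mutex>
    #include <thread>
    #include <vector>

    // Sketch: chunked work distribution via a shared counter, as above.
    static size_t process_parallel(size_t nelements, size_t chunk_size, int nthread) {
        std::mutex m;
        size_t counter = 0, total = 0;
        auto worker = [&]() {
            size_t local = 0;
            for (;;) {
                std::unique_lock<std::mutex> lock(m);
                size_t first = counter; counter += chunk_size;
                if (first >= nelements) { total += local; break; }  // fold in under the lock
                lock.unlock();
                local += std::min(nelements, first + chunk_size) - first;  // stand-in for real work
            }
        };
        std::vector<std::thread> workers;
        for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(worker);
        worker();                            // main thread participates, as in llama.cpp
        for (auto & t : workers) t.join();
        return total;                        // == nelements
    }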
@@ -1731,10 +1794,10 @@ struct llama_context * llama_init_from_file(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
     return ctx;
@@ -1747,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
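With the extra parameter, callers choose the worker count at the public API; as the internal code above shows, `nthread <= 0` falls back to `std::thread::hardware_concurrency()`. A hedged usage sketch (file names are placeholders):

    #include "llama.h"

    // Sketch: quantize an f16 model to Q4_2 using all hardware threads.
    static int quantize_example(void) {
        return llama_model_quantize("ggml-model-f16.bin",   // placeholder input path
                                    "ggml-model-q4_2.bin",  // placeholder output path
                                    LLAMA_FTYPE_MOSTLY_Q4_2,
                                    /*nthread=*/0);         // <= 0 => hardware_concurrency()
    }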
@@ -1757,6 +1821,254 @@ int llama_model_quantize(
     }
 }
 
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    auto & model = ctx->model;
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    auto fin = std::ifstream(path_lora, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+        return 1;
+    }
+
+    // verify magic and version
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != 'ggla') {
+            fprintf(stderr, "%s: bad file magic\n", __func__);
+            return 1;
+        }
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r;
+    int32_t lora_alpha;
+    fin.read((char *) &lora_r, sizeof(lora_r));
+    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    float scaling = (float)lora_alpha / (float)lora_r;
+
+    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+    // create a temporary ggml context to store the lora tensors
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    struct ggml_init_params params;
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
+    params.no_alloc   = false;
+
+    ggml_context * lora_ctx = ggml_init(params);
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+    // create a name -> tensor map of the model to accelerate lookups
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (auto & kv: model.tensors_by_name) {
+        model_tensors.insert(kv);
+    }
+
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should in llama_model_loader
+        if (model_loader->use_mmap) {
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        }
+    }
+
+    // read tensors and apply
+    bool warned = false;
+    int n_tensors = 0;
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ftype;
+
+        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        if (fin.eof()) {
+            break;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+
+        std::string name(length, 0);
+        fin.read(&name[0], length);
+
+        // check for lora suffix and get the type of tensor
+        const std::string lora_suffix = ".lora";
+        size_t pos = name.rfind(lora_suffix);
+        if (pos == std::string::npos) {
+            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        std::string lora_type = name.substr(pos + lora_suffix.length());
+        std::string base_name = name;
+        base_name.erase(pos);
+        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+            return 1;
+        }
+
+        // create ggml tensor
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32;  break;
+            case 1: wtype = GGML_TYPE_F16;  break;
+            default:
+                    {
+                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+                                __func__, ftype);
+                        return false;
+                    }
+        }
+        ggml_tensor* lora_tensor;
+        if (n_dims == 2) {
+            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+        }
+        else {
+            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        // load tensor data
+        size_t offset = fin.tellg();
+        size_t tensor_data_size = ggml_nbytes(lora_tensor);
+        offset = (offset + 31) & -32;
+        fin.seekg(offset);
+        fin.read((char*)lora_tensor->data, tensor_data_size);
+
+        lora_tensors[name] = lora_tensor;
+
+        // check if we have both A and B tensors and apply
+        if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
+            if (ggml_is_quantized(base_t->type)) {
+                if (!warned) {
+                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                            "use a f16 or f32 base model with --lora-base\n", __func__);
+                    warned = true;
+                }
+            }
+
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                        " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+                return 1;
+            }
+
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+            if (scaling != 1.0f) {
+                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+            }
+
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
+
+            struct ggml_cgraph gf = ggml_build_forward(r);
+            gf.n_threads = n_threads;
+            ggml_graph_compute(lora_ctx, &gf);
+
+            // we won't need these tensors again, reset the context to save memory
+            ggml_free(lora_ctx);
+            lora_ctx = ggml_init(params);
+            lora_tensors.clear();
+
+            n_tensors++;
+            if (n_tensors % 4 == 0)
+                fprintf(stderr, ".");
+        }
+    }
+
+    // TODO: this should be in a destructor, it will leak on failure
+    ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
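The adapter math is `w = w + (alpha / r) * B·A`, built with `ggml_mul_mat`, `ggml_scale`, and `ggml_add`/`ggml_add_inplace` as shown above; passing a separate f16/f32 base model avoids patching already-quantized weights in place. A hedged usage sketch of the new public entry point (paths are placeholders):

    #include "llama.h"

    // Sketch: apply a LoRA adapter to a loaded context. Pass NULL as
    // path_base_model to patch the context's own weights directly.
    static int apply_lora_example(struct llama_context * ctx) {
        return llama_apply_lora_from_file(ctx,
                                          "lora-adapter.bin",   // placeholder .loraA/.loraB file
                                          "ggml-model-f16.bin", // placeholder base model (or NULL)
                                          /*n_threads=*/4);
    }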
@@ -1914,18 +2226,20 @@ const char * llama_print_system_info(void) {
     static std::string s;
 
     s = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
+    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
+    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
+    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
+    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
+    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
+    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
+    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
+    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
 
     return s.c_str();
 }