llama_cpp 0.2.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +305 -133
- data/ext/llama_cpp/src/ggml-cuda.cu +367 -69
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +262 -291
- data/ext/llama_cpp/src/llama.h +49 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +14 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
@@ -7,7 +7,13 @@
|
|
7
7
|
#include <stddef.h>
|
8
8
|
|
9
9
|
// Super-block size
|
10
|
+
#ifdef GGML_QKK_64
|
11
|
+
#define QK_K 64
|
12
|
+
#define K_SCALE_SIZE 4
|
13
|
+
#else
|
10
14
|
#define QK_K 256
|
15
|
+
#define K_SCALE_SIZE 12
|
16
|
+
#endif
|
11
17
|
|
12
18
|
//
|
13
19
|
// Super-block quantization structures
|
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
|
|
29
35
|
// weight is represented as x = a * q
|
30
36
|
// 16 blocks of 16 elements each
|
31
37
|
// Effectively 3.4375 bits per weight
|
38
|
+
#ifdef GGML_QKK_64
|
32
39
|
typedef struct {
|
33
40
|
uint8_t hmask[QK_K/8]; // quants - high bit
|
34
41
|
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
35
|
-
uint8_t scales[
|
42
|
+
uint8_t scales[2];
|
36
43
|
ggml_fp16_t d; // super-block scale
|
37
44
|
} block_q3_K;
|
38
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 +
|
45
|
+
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
46
|
+
#else
|
47
|
+
typedef struct {
|
48
|
+
uint8_t hmask[QK_K/8]; // quants - high bit
|
49
|
+
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
50
|
+
uint8_t scales[12]; // scales, quantized with 6 bits
|
51
|
+
ggml_fp16_t d; // super-block scale
|
52
|
+
} block_q3_K;
|
53
|
+
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
54
|
+
#endif
|
39
55
|
|
40
56
|
// 4-bit quantization
|
41
57
|
// 16 blocks of 32 elements each
|
42
58
|
// weight is represented as x = a * q + b
|
43
59
|
// Effectively 4.5 bits per weight
|
60
|
+
#ifdef GGML_QKK_64
|
61
|
+
typedef struct {
|
62
|
+
ggml_fp16_t d[2]; // super-block scales/mins
|
63
|
+
uint8_t scales[2]; // 4-bit block scales/mins
|
64
|
+
uint8_t qs[QK_K/2]; // 4-bit quants
|
65
|
+
} block_q4_K;
|
66
|
+
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
67
|
+
#else
|
44
68
|
typedef struct {
|
45
69
|
ggml_fp16_t d; // super-block scale for quantized scales
|
46
70
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
47
|
-
uint8_t scales[
|
71
|
+
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
48
72
|
uint8_t qs[QK_K/2]; // 4-bit quants
|
49
73
|
} block_q4_K;
|
50
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) +
|
74
|
+
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
75
|
+
#endif
|
51
76
|
|
52
77
|
// 5-bit quantization
|
53
78
|
// 16 blocks of 32 elements each
|
54
79
|
// weight is represented as x = a * q + b
|
55
80
|
// Effectively 5.5 bits per weight
|
81
|
+
#ifdef GGML_QKK_64
|
82
|
+
typedef struct {
|
83
|
+
ggml_fp16_t d; // super-block scale
|
84
|
+
int8_t scales[QK_K/16]; // 8-bit block scales
|
85
|
+
uint8_t qh[QK_K/8]; // quants, high bit
|
86
|
+
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
87
|
+
} block_q5_K;
|
88
|
+
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
89
|
+
#else
|
56
90
|
typedef struct {
|
57
91
|
ggml_fp16_t d; // super-block scale for quantized scales
|
58
92
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
59
|
-
uint8_t scales[
|
93
|
+
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
60
94
|
uint8_t qh[QK_K/8]; // quants, high bit
|
61
95
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
62
96
|
} block_q5_K;
|
63
|
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) +
|
97
|
+
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
98
|
+
#endif
|
64
99
|
|
65
100
|
// 6-bit quantization
|
66
101
|
// weight is represented as x = a * q
|
@@ -172,12 +172,14 @@ struct llama_mmap {
|
|
172
172
|
#ifdef _POSIX_MAPPED_FILES
|
173
173
|
static constexpr bool SUPPORTED = true;
|
174
174
|
|
175
|
-
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
|
175
|
+
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
176
176
|
size = file->size;
|
177
177
|
int fd = fileno(file->fp);
|
178
178
|
int flags = MAP_SHARED;
|
179
|
+
// prefetch/readahead impairs performance on NUMA systems
|
180
|
+
if (numa) { prefetch = 0; }
|
179
181
|
#ifdef __linux__
|
180
|
-
flags |= MAP_POPULATE;
|
182
|
+
if (prefetch) { flags |= MAP_POPULATE; }
|
181
183
|
#endif
|
182
184
|
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
183
185
|
if (addr == MAP_FAILED) {
|
@@ -191,6 +193,14 @@ struct llama_mmap {
|
|
191
193
|
strerror(errno));
|
192
194
|
}
|
193
195
|
}
|
196
|
+
if (numa) {
|
197
|
+
// advise the kernel not to use readahead
|
198
|
+
// (because the next page might not belong on the same node)
|
199
|
+
if (madvise(addr, file->size, MADV_RANDOM)) {
|
200
|
+
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
|
201
|
+
strerror(errno));
|
202
|
+
}
|
203
|
+
}
|
194
204
|
}
|
195
205
|
|
196
206
|
~llama_mmap() {
|
@@ -199,7 +209,9 @@ struct llama_mmap {
|
|
199
209
|
#elif defined(_WIN32)
|
200
210
|
static constexpr bool SUPPORTED = true;
|
201
211
|
|
202
|
-
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
212
|
+
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
|
213
|
+
(void) numa;
|
214
|
+
|
203
215
|
size = file->size;
|
204
216
|
|
205
217
|
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
@@ -244,8 +256,10 @@ struct llama_mmap {
|
|
244
256
|
#else
|
245
257
|
static constexpr bool SUPPORTED = false;
|
246
258
|
|
247
|
-
llama_mmap(struct llama_file *, bool prefetch = true) {
|
248
|
-
(void)prefetch;
|
259
|
+
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
|
260
|
+
(void) prefetch;
|
261
|
+
(void) numa;
|
262
|
+
|
249
263
|
throw std::runtime_error(std::string("mmap not supported"));
|
250
264
|
}
|
251
265
|
#endif
|