llama_cpp 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +305 -133
- data/ext/llama_cpp/src/ggml-cuda.cu +367 -69
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +262 -291
- data/ext/llama_cpp/src/llama.h +49 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +14 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
@@ -7,7 +7,13 @@
|
|
7
7
|
#include <stddef.h>
|
8
8
|
|
9
9
|
// Super-block size
|
10
|
+
#ifdef GGML_QKK_64
|
11
|
+
#define QK_K 64
|
12
|
+
#define K_SCALE_SIZE 4
|
13
|
+
#else
|
10
14
|
#define QK_K 256
|
15
|
+
#define K_SCALE_SIZE 12
|
16
|
+
#endif
|
11
17
|
|
12
18
|
//
|
13
19
|
// Super-block quantization structures
|
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
|
|
29
35
|
// weight is represented as x = a * q
|
30
36
|
// 16 blocks of 16 elements each
|
31
37
|
// Effectively 3.4375 bits per weight
|
38
|
+
#ifdef GGML_QKK_64
|
32
39
|
typedef struct {
|
33
40
|
uint8_t hmask[QK_K/8]; // quants - high bit
|
34
41
|
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
35
|
-
uint8_t scales[
|
42
|
+
uint8_t scales[2];
|
36
43
|
ggml_fp16_t d; // super-block scale
|
37
44
|
} block_q3_K;
|
38
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 +
|
45
|
+
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
46
|
+
#else
|
47
|
+
typedef struct {
|
48
|
+
uint8_t hmask[QK_K/8]; // quants - high bit
|
49
|
+
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
50
|
+
uint8_t scales[12]; // scales, quantized with 6 bits
|
51
|
+
ggml_fp16_t d; // super-block scale
|
52
|
+
} block_q3_K;
|
53
|
+
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
54
|
+
#endif
|
39
55
|
|
40
56
|
// 4-bit quantization
|
41
57
|
// 16 blocks of 32 elements each
|
42
58
|
// weight is represented as x = a * q + b
|
43
59
|
// Effectively 4.5 bits per weight
|
60
|
+
#ifdef GGML_QKK_64
|
61
|
+
typedef struct {
|
62
|
+
ggml_fp16_t d[2]; // super-block scales/mins
|
63
|
+
uint8_t scales[2]; // 4-bit block scales/mins
|
64
|
+
uint8_t qs[QK_K/2]; // 4-bit quants
|
65
|
+
} block_q4_K;
|
66
|
+
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
67
|
+
#else
|
44
68
|
typedef struct {
|
45
69
|
ggml_fp16_t d; // super-block scale for quantized scales
|
46
70
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
47
|
-
uint8_t scales[
|
71
|
+
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
48
72
|
uint8_t qs[QK_K/2]; // 4-bit quants
|
49
73
|
} block_q4_K;
|
50
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) +
|
74
|
+
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
75
|
+
#endif
|
51
76
|
|
52
77
|
// 5-bit quantization
|
53
78
|
// 16 blocks of 32 elements each
|
54
79
|
// weight is represented as x = a * q + b
|
55
80
|
// Effectively 5.5 bits per weight
|
81
|
+
#ifdef GGML_QKK_64
|
82
|
+
typedef struct {
|
83
|
+
ggml_fp16_t d; // super-block scale
|
84
|
+
int8_t scales[QK_K/16]; // 8-bit block scales
|
85
|
+
uint8_t qh[QK_K/8]; // quants, high bit
|
86
|
+
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
87
|
+
} block_q5_K;
|
88
|
+
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
89
|
+
#else
|
56
90
|
typedef struct {
|
57
91
|
ggml_fp16_t d; // super-block scale for quantized scales
|
58
92
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
59
|
-
uint8_t scales[
|
93
|
+
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
60
94
|
uint8_t qh[QK_K/8]; // quants, high bit
|
61
95
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
62
96
|
} block_q5_K;
|
63
|
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) +
|
97
|
+
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
98
|
+
#endif
|
64
99
|
|
65
100
|
// 6-bit quantization
|
66
101
|
// weight is represented as x = a * q
|
@@ -172,12 +172,14 @@ struct llama_mmap {
|
|
172
172
|
#ifdef _POSIX_MAPPED_FILES
|
173
173
|
static constexpr bool SUPPORTED = true;
|
174
174
|
|
175
|
-
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
|
175
|
+
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
176
176
|
size = file->size;
|
177
177
|
int fd = fileno(file->fp);
|
178
178
|
int flags = MAP_SHARED;
|
179
|
+
// prefetch/readahead impairs performance on NUMA systems
|
180
|
+
if (numa) { prefetch = 0; }
|
179
181
|
#ifdef __linux__
|
180
|
-
flags |= MAP_POPULATE;
|
182
|
+
if (prefetch) { flags |= MAP_POPULATE; }
|
181
183
|
#endif
|
182
184
|
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
183
185
|
if (addr == MAP_FAILED) {
|
@@ -191,6 +193,14 @@ struct llama_mmap {
|
|
191
193
|
strerror(errno));
|
192
194
|
}
|
193
195
|
}
|
196
|
+
if (numa) {
|
197
|
+
// advise the kernel not to use readahead
|
198
|
+
// (because the next page might not belong on the same node)
|
199
|
+
if (madvise(addr, file->size, MADV_RANDOM)) {
|
200
|
+
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
|
201
|
+
strerror(errno));
|
202
|
+
}
|
203
|
+
}
|
194
204
|
}
|
195
205
|
|
196
206
|
~llama_mmap() {
|
@@ -199,7 +209,9 @@ struct llama_mmap {
|
|
199
209
|
#elif defined(_WIN32)
|
200
210
|
static constexpr bool SUPPORTED = true;
|
201
211
|
|
202
|
-
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
212
|
+
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
|
213
|
+
(void) numa;
|
214
|
+
|
203
215
|
size = file->size;
|
204
216
|
|
205
217
|
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
@@ -244,8 +256,10 @@ struct llama_mmap {
|
|
244
256
|
#else
|
245
257
|
static constexpr bool SUPPORTED = false;
|
246
258
|
|
247
|
-
llama_mmap(struct llama_file *, bool prefetch = true) {
|
248
|
-
(void)prefetch;
|
259
|
+
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
|
260
|
+
(void) prefetch;
|
261
|
+
(void) numa;
|
262
|
+
|
249
263
|
throw std::runtime_error(std::string("mmap not supported"));
|
250
264
|
}
|
251
265
|
#endif
|