llama_cpp 0.2.1 → 0.3.0

@@ -7,7 +7,13 @@
 #include <stddef.h>
 
 // Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
 #define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
 
 //
 // Super-block quantization structures
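
The new GGML_QKK_64 build option shrinks the super-block from 256 to 64 weights, with correspondingly smaller scale storage. Since every K-quant type packs QK_K weights per super-block, a row of n weights needs n / QK_K blocks. A minimal sketch of that arithmetic, using a hypothetical helper name that is not part of the diffed header:

    // Illustration only: how many K-quant super-blocks a row of n weights occupies.
    // With the default QK_K = 256, a 4096-wide row packs into 16 super-blocks;
    // building with -DGGML_QKK_64 (QK_K = 64) uses 64 smaller super-blocks instead.
    #include <stddef.h>

    #ifdef GGML_QKK_64
    #define QK_K 64
    #else
    #define QK_K 256
    #endif

    // hypothetical helper, not from the header above
    static size_t k_quant_blocks_per_row(size_t n) {
        return n / QK_K;   // row lengths are assumed to be a multiple of QK_K
    }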
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 // weight is represented as x = a * q
 // 16 blocks of 16 elemenets each
 // Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
 typedef struct {
     uint8_t hmask[QK_K/8];     // quants - high bit
     uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    uint8_t scales[2];
     ggml_fp16_t d;             // super-block scale
 } block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8]; // quants - high bit
+    uint8_t qs[QK_K/4];    // quants - low 2 bits
+    uint8_t scales[12];    // scales, quantized with 6 bits
+    ggml_fp16_t d;         // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif
 
 // 4-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d[2];    // super-block scales/mins
+    uint8_t scales[2];   // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];  // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d;                // super-block scale for quantized scales
     ggml_fp16_t dmin;             // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];    // scales and mins, quantized with 6 bits
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif
 
 // 5-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d;            // super-block scale
+    int8_t  scales[QK_K/16];  // 8-bit block scales
+    uint8_t qh[QK_K/8];       // quants, high bit
+    uint8_t qs[QK_K/2];       // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d;                // super-block scale for quantized scales
     ggml_fp16_t dmin;             // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];    // scales and mins, quantized with 6 bits
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];           // quants, high bit
     uint8_t qs[QK_K/2];           // quants, low 4 bits
 } block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
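
The "Effectively N bits per weight" comments follow directly from the struct layouts in the default (non-GGML_QKK_64) branch, where QK_K = 256, K_SCALE_SIZE = 12 and ggml_fp16_t is 2 bytes: block_q3_K is 32 + 64 + 12 + 2 = 110 bytes for 256 weights, i.e. 110 * 8 / 256 = 3.4375 bits. The stand-alone sketch below repeats that check for q3_K, q4_K and q5_K; the struct layouts are copied from the diff, while the fp16 typedef and main() are only illustrative:

    // Per-weight storage cost of the default (QK_K = 256) K-quant super-blocks.
    #include <stdint.h>
    #include <stdio.h>

    typedef uint16_t ggml_fp16_t;     // 2-byte stand-in for ggml's half type
    #define QK_K         256
    #define K_SCALE_SIZE 12

    typedef struct {
        uint8_t hmask[QK_K/8];        //  32 bytes: high bits
        uint8_t qs[QK_K/4];           //  64 bytes: low 2 bits
        uint8_t scales[12];           //  12 bytes: 6-bit scales
        ggml_fp16_t d;                //   2 bytes: super-block scale
    } block_q3_K;                     // 110 bytes -> 110*8/256 = 3.4375 bits/weight

    typedef struct {
        ggml_fp16_t d, dmin;          //   4 bytes: super-block scale/min
        uint8_t scales[K_SCALE_SIZE]; //  12 bytes: 6-bit scales and mins
        uint8_t qs[QK_K/2];           // 128 bytes: 4-bit quants
    } block_q4_K;                     // 144 bytes -> 144*8/256 = 4.5 bits/weight

    typedef struct {
        ggml_fp16_t d, dmin;          //   4 bytes: super-block scale/min
        uint8_t scales[K_SCALE_SIZE]; //  12 bytes: 6-bit scales and mins
        uint8_t qh[QK_K/8];           //  32 bytes: high bits
        uint8_t qs[QK_K/2];           // 128 bytes: low 4 bits
    } block_q5_K;                     // 176 bytes -> 176*8/256 = 5.5 bits/weight

    int main(void) {
        printf("q3_K: %.4f bits/weight\n", 8.0 * sizeof(block_q3_K) / QK_K);
        printf("q4_K: %.4f bits/weight\n", 8.0 * sizeof(block_q4_K) / QK_K);
        printf("q5_K: %.4f bits/weight\n", 8.0 * sizeof(block_q5_K) / QK_K);
        return 0;
    }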
@@ -172,12 +172,14 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
+        // prefetch/readahead impairs performance on NUMA systems
+        if (numa) { prefetch = 0; }
 #ifdef __linux__
-        flags |= MAP_POPULATE;
+        if (prefetch) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
                         strerror(errno));
             }
         }
+        if (numa) {
+            // advise the kernel not to use readahead
+            // (because the next page might not belong on the same node)
+            if (madvise(addr, file->size, MADV_RANDOM)) {
+                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                        strerror(errno));
+            }
+        }
     }
 
     ~llama_mmap() {
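
Together, the two POSIX hunks implement one policy: when NUMA is in play, skip MAP_POPULATE and ask the kernel not to read ahead, so each page is faulted in by the thread that first touches it and ends up on that thread's node. A stripped-down sketch of the same pattern outside the llama_mmap class (the function name is illustrative and error handling is reduced):

    // Map a file read-only; on NUMA systems avoid prefaulting and readahead.
    #include <errno.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    static void * map_readonly(int fd, size_t size, bool numa) {
        int flags = MAP_SHARED;
    #ifdef __linux__
        if (!numa) { flags |= MAP_POPULATE; }   // prefault pages only when NUMA handling is off
    #endif
        void * addr = mmap(NULL, size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) { return NULL; }
        if (numa) {
            // don't read ahead: the next page may belong to a different NUMA node
            if (madvise(addr, size, MADV_RANDOM)) {
                fprintf(stderr, "warning: madvise(MADV_RANDOM) failed: %s\n", strerror(errno));
            }
        }
        return addr;
    }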
@@ -199,7 +209,9 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+        (void) numa;
+
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *, bool prefetch = true) {
-        (void)prefetch;
+    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+        (void) prefetch;
+        (void) numa;
+
         throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
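
Because numa defaults to false in all three variants, existing call sites keep compiling and behave exactly as before; only callers that pass true opt into the new behaviour. A hypothetical POSIX-build call site (the llama_file usage and file name are illustrative, not taken from this diff):

    // Hypothetical caller: the defaulted parameters preserve the old behaviour.
    llama_file file("model.bin", "rb");               // assumed llama_file(fname, mode) helper
    llama_mmap plain(&file);                          // as before: full prefetch, no NUMA tweaks
    llama_mmap numa_aware(&file, (size_t) -1, true);  // NUMA: prefetch dropped, MADV_RANDOM applied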