llama_cpp 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,13 @@
7
7
  #include <stddef.h>
8
8
 
9
9
  // Super-block size
10
+ #ifdef GGML_QKK_64
11
+ #define QK_K 64
12
+ #define K_SCALE_SIZE 4
13
+ #else
10
14
  #define QK_K 256
15
+ #define K_SCALE_SIZE 12
16
+ #endif
11
17
 
12
18
  //
13
19
  // Super-block quantization structures
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
29
35
  // weight is represented as x = a * q
30
36
  // 16 blocks of 16 elements each
31
37
  // Effectively 3.4375 bits per weight
38
+ #ifdef GGML_QKK_64
32
39
  typedef struct {
33
40
  uint8_t hmask[QK_K/8]; // quants - high bit
34
41
  uint8_t qs[QK_K/4]; // quants - low 2 bits
35
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
42
+ uint8_t scales[2];
36
43
  ggml_fp16_t d; // super-block scale
37
44
  } block_q3_K;
38
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
45
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
46
+ #else
47
+ typedef struct {
48
+ uint8_t hmask[QK_K/8]; // quants - high bit
49
+ uint8_t qs[QK_K/4]; // quants - low 2 bits
50
+ uint8_t scales[12]; // scales, quantized with 6 bits
51
+ ggml_fp16_t d; // super-block scale
52
+ } block_q3_K;
53
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
54
+ #endif
39
55
 
40
56
  // 4-bit quantization
41
57
  // 16 blocks of 32 elements each
42
58
  // weight is represented as x = a * q + b
43
59
  // Effectively 4.5 bits per weight
60
+ #ifdef GGML_QKK_64
61
+ typedef struct {
62
+ ggml_fp16_t d[2]; // super-block scales/mins
63
+ uint8_t scales[2]; // 4-bit block scales/mins
64
+ uint8_t qs[QK_K/2]; // 4-bit quants
65
+ } block_q4_K;
66
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
67
+ #else
44
68
  typedef struct {
45
69
  ggml_fp16_t d; // super-block scale for quantized scales
46
70
  ggml_fp16_t dmin; // super-block scale for quantized mins
47
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
71
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
48
72
  uint8_t qs[QK_K/2]; // 4-bit quants
49
73
  } block_q4_K;
50
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
74
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
75
+ #endif
51
76
 
52
77
  // 5-bit quantization
53
78
  // 16 blocks of 32 elements each
54
79
  // weight is represented as x = a * q + b
55
80
  // Effectively 5.5 bits per weight
81
+ #ifdef GGML_QKK_64
82
+ typedef struct {
83
+ ggml_fp16_t d; // super-block scale
84
+ int8_t scales[QK_K/16]; // 8-bit block scales
85
+ uint8_t qh[QK_K/8]; // quants, high bit
86
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
87
+ } block_q5_K;
88
+ static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
89
+ #else
56
90
  typedef struct {
57
91
  ggml_fp16_t d; // super-block scale for quantized scales
58
92
  ggml_fp16_t dmin; // super-block scale for quantized mins
59
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
93
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
60
94
  uint8_t qh[QK_K/8]; // quants, high bit
61
95
  uint8_t qs[QK_K/2]; // quants, low 4 bits
62
96
  } block_q5_K;
63
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
97
+ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
98
+ #endif
64
99
 
65
100
  // 6-bit quantization
66
101
  // weight is represented as x = a * q
@@ -172,12 +172,14 @@ struct llama_mmap {
172
172
  #ifdef _POSIX_MAPPED_FILES
173
173
  static constexpr bool SUPPORTED = true;
174
174
 
175
- llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
175
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
176
176
  size = file->size;
177
177
  int fd = fileno(file->fp);
178
178
  int flags = MAP_SHARED;
179
+ // prefetch/readahead impairs performance on NUMA systems
180
+ if (numa) { prefetch = 0; }
179
181
  #ifdef __linux__
180
- flags |= MAP_POPULATE;
182
+ if (prefetch) { flags |= MAP_POPULATE; }
181
183
  #endif
182
184
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
183
185
  if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
191
193
  strerror(errno));
192
194
  }
193
195
  }
196
+ if (numa) {
197
+ // advise the kernel not to use readahead
198
+ // (because the next page might not belong on the same node)
199
+ if (madvise(addr, file->size, MADV_RANDOM)) {
200
+ fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
201
+ strerror(errno));
202
+ }
203
+ }
194
204
  }
195
205
 
196
206
  ~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
199
209
  #elif defined(_WIN32)
200
210
  static constexpr bool SUPPORTED = true;
201
211
 
202
- llama_mmap(struct llama_file * file, bool prefetch = true) {
212
+ llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
213
+ (void) numa;
214
+
203
215
  size = file->size;
204
216
 
205
217
  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
244
256
  #else
245
257
  static constexpr bool SUPPORTED = false;
246
258
 
247
- llama_mmap(struct llama_file *, bool prefetch = true) {
248
- (void)prefetch;
259
+ llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
260
+ (void) prefetch;
261
+ (void) numa;
262
+
249
263
  throw std::runtime_error(std::string("mmap not supported"));
250
264
  }
251
265
  #endif