llama_cpp 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,247 +1,11 @@
1
1
  #pragma once
2
2
 
3
- #include "ggml-impl.h"
3
+ #define GGML_COMMON_DECL_C
4
+ #include "ggml-common.h"
4
5
 
5
- // GGML internal header
6
-
7
- #include <stdint.h>
8
- #include <stddef.h>
9
-
10
- #define QK4_0 32
11
- typedef struct {
12
- ggml_fp16_t d; // delta
13
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
14
- } block_q4_0;
15
- static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
16
-
17
- #define QK4_1 32
18
- typedef struct {
19
- ggml_fp16_t d; // delta
20
- ggml_fp16_t m; // min
21
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
22
- } block_q4_1;
23
- static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
24
-
25
- #define QK5_0 32
26
- typedef struct {
27
- ggml_fp16_t d; // delta
28
- uint8_t qh[4]; // 5-th bit of quants
29
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
30
- } block_q5_0;
31
- static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
32
-
33
- #define QK5_1 32
34
- typedef struct {
35
- ggml_fp16_t d; // delta
36
- ggml_fp16_t m; // min
37
- uint8_t qh[4]; // 5-th bit of quants
38
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
39
- } block_q5_1;
40
- static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
41
-
42
- #define QK8_0 32
43
- typedef struct {
44
- ggml_fp16_t d; // delta
45
- int8_t qs[QK8_0]; // quants
46
- } block_q8_0;
47
- static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
48
-
49
- #define QK8_1 32
50
- typedef struct {
51
- float d; // delta
52
- float s; // d * sum(qs[i])
53
- int8_t qs[QK8_1]; // quants
54
- } block_q8_1;
55
- static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
56
-
57
- //
58
- // Super-block quantization structures
59
- //
60
-
61
- // Super-block size
62
- #ifdef GGML_QKK_64
63
- #define QK_K 64
64
- #define K_SCALE_SIZE 4
65
- #else
66
- #define QK_K 256
67
- #define K_SCALE_SIZE 12
68
- #endif
69
-
70
- // 2-bit quantization
71
- // weight is represented as x = a * q + b
72
- // 16 blocks of 16 elements each
73
- // Effectively 2.625 bits per weight
74
- typedef struct {
75
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
76
- uint8_t qs[QK_K/4]; // quants
77
- ggml_fp16_t d; // super-block scale for quantized scales
78
- ggml_fp16_t dmin; // super-block scale for quantized mins
79
- } block_q2_K;
80
- static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
81
-
82
- // 3-bit quantization
83
- // weight is represented as x = a * q
84
- // 16 blocks of 16 elements each
85
- // Effectively 3.4375 bits per weight
86
- #ifdef GGML_QKK_64
87
- typedef struct {
88
- uint8_t hmask[QK_K/8]; // quants - high bit
89
- uint8_t qs[QK_K/4]; // quants - low 2 bits
90
- uint8_t scales[2];
91
- ggml_fp16_t d; // super-block scale
92
- } block_q3_K;
93
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
94
- #else
95
- typedef struct {
96
- uint8_t hmask[QK_K/8]; // quants - high bit
97
- uint8_t qs[QK_K/4]; // quants - low 2 bits
98
- uint8_t scales[12]; // scales, quantized with 6 bits
99
- ggml_fp16_t d; // super-block scale
100
- } block_q3_K;
101
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
102
- #endif
103
-
104
- // 4-bit quantization
105
- // 8 blocks of 32 elements each
106
- // weight is represented as x = a * q + b
107
- // Effectively 4.5 bits per weight
108
- #ifdef GGML_QKK_64
109
- typedef struct {
110
- ggml_fp16_t d[2]; // super-block scales/mins
111
- uint8_t scales[2]; // 4-bit block scales/mins
112
- uint8_t qs[QK_K/2]; // 4--bit quants
113
- } block_q4_K;
114
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
115
- #else
116
- typedef struct {
117
- ggml_fp16_t d; // super-block scale for quantized scales
118
- ggml_fp16_t dmin; // super-block scale for quantized mins
119
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
120
- uint8_t qs[QK_K/2]; // 4--bit quants
121
- } block_q4_K;
122
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
123
- #endif
124
-
125
- // 5-bit quantization
126
- // 8 blocks of 32 elements each
127
- // weight is represented as x = a * q + b
128
- // Effectively 5.5 bits per weight
129
- #ifdef GGML_QKK_64
130
- typedef struct {
131
- ggml_fp16_t d; // super-block scale
132
- int8_t scales[QK_K/16]; // 8-bit block scales
133
- uint8_t qh[QK_K/8]; // quants, high bit
134
- uint8_t qs[QK_K/2]; // quants, low 4 bits
135
- } block_q5_K;
136
- static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
137
- #else
138
- typedef struct {
139
- ggml_fp16_t d; // super-block scale for quantized scales
140
- ggml_fp16_t dmin; // super-block scale for quantized mins
141
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
142
- uint8_t qh[QK_K/8]; // quants, high bit
143
- uint8_t qs[QK_K/2]; // quants, low 4 bits
144
- } block_q5_K;
145
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
146
- #endif
147
-
148
- // 6-bit quantization
149
- // weight is represented as x = a * q
150
- // 16 blocks of 16 elements each
151
- // Effectively 6.5625 bits per weight
152
- typedef struct {
153
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
154
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
155
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
156
- ggml_fp16_t d; // super-block scale
157
- } block_q6_K;
158
- static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
159
-
160
- // This is only used for intermediate quantization and dot products
161
- typedef struct {
162
- float d; // delta
163
- int8_t qs[QK_K]; // quants
164
- int16_t bsums[QK_K/16]; // sum of quants in groups of 16
165
- } block_q8_K;
166
- static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
167
-
168
- // (Almost) "true" 2-bit quantization.
169
- // Due to the need to use blocks as per ggml design, it ends up using
170
- // 2.0625 bpw because of the 16-bit scale for each block of 256.
171
- typedef struct {
172
- ggml_fp16_t d;
173
- uint16_t qs[QK_K/8];
174
- } block_iq2_xxs;
175
- static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
176
-
177
- // 2.3125 bpw quants
178
- typedef struct {
179
- ggml_fp16_t d;
180
- uint16_t qs[QK_K/8];
181
- uint8_t scales[QK_K/32];
182
- } block_iq2_xs;
183
- static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
184
-
185
- // 2.5625 bpw quants
186
- typedef struct {
187
- ggml_fp16_t d;
188
- uint8_t qs[QK_K/4];
189
- uint8_t qh[QK_K/32];
190
- uint8_t scales[QK_K/32];
191
- } block_iq2_s;
192
- static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
193
-
194
- // (Almost) "true" 3-bit quantization.
195
- // Due to the need to use blocks as per ggml design, it ends up using
196
- // 3.0625 bpw because of the 16-bit scale for each block of 256.
197
- typedef struct {
198
- ggml_fp16_t d;
199
- uint8_t qs[3*QK_K/8];
200
- } block_iq3_xxs;
201
- static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
202
-
203
- // 3.4375 bpw
204
- #if QK_K == 64
205
- #define IQ3S_N_SCALE 2
206
- #else
207
- #define IQ3S_N_SCALE QK_K/64
208
- #endif
209
- typedef struct {
210
- ggml_fp16_t d;
211
- uint8_t qs[QK_K/4];
212
- uint8_t qh[QK_K/32];
213
- uint8_t signs[QK_K/8];
214
- uint8_t scales[IQ3S_N_SCALE];
215
- } block_iq3_s;
216
- static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
6
+ #include "ggml.h"
217
7
 
218
- typedef struct {
219
- ggml_fp16_t d;
220
- uint8_t qs[QK_K/8];
221
- uint8_t scales[QK_K/16];
222
- } block_iq1_s;
223
- static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
224
-
225
- // Non-linear quants
226
- #define QK4_NL 32
227
- typedef struct {
228
- ggml_fp16_t d;
229
- uint8_t qs[QK4_NL/2];
230
- } block_iq4_nl;
231
- static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
232
-
233
- #if QK_K == 64
234
- #define block_iq4_xs block_iq4_nl
235
- //typedef struct block_iq4_nl block_iq4_xs;
236
- #else
237
- typedef struct {
238
- ggml_fp16_t d;
239
- uint16_t scales_h;
240
- uint8_t scales_l[QK_K/64];
241
- uint8_t qs[QK_K/2];
242
- } block_iq4_xs;
243
- static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
244
- #endif
8
+ // GGML internal header
245
9
 
246
10
  #ifdef __cplusplus
247
11
  extern "C" {
@@ -261,6 +25,7 @@ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGM
261
25
  void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
262
26
  void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
263
27
  void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
28
+
264
29
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
265
30
  void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
266
31
  void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
@@ -280,6 +45,7 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
280
45
  void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
281
46
  void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
282
47
  void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
48
+
283
49
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
284
50
  void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
285
51
  void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@@ -300,6 +66,7 @@ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRI
300
66
  void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
301
67
  void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
302
68
  void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
69
+
303
70
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
304
71
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
305
72
  void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -321,6 +88,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
321
88
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
322
89
  void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
323
90
  void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
91
+
324
92
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
325
93
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
326
94
  void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -330,26 +98,26 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
330
98
  void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
331
99
  void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
332
100
 
333
- //
334
101
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
335
- //
336
- size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
337
- size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
338
- size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
339
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
340
- size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
341
- size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
342
- size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
343
- size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
344
- size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
345
- size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
346
- size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
347
- size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
348
- size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
349
- size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
350
- size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
351
- size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
352
- size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
102
+ size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
103
+ size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
104
+ size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
105
+ size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
106
+ size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
107
+ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
108
+ size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
109
+ size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
110
+
111
+ size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
112
+ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
113
+ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
114
+ size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
115
+ size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
116
+ size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
117
+ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
118
+ size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
119
+ size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
120
+ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
353
121
 
354
122
  void iq2xs_init_impl(enum ggml_type type);
355
123
  void iq2xs_free_impl(enum ggml_type type);