llama_cpp 0.13.0 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,247 +1,11 @@
1
1
  #pragma once
2
2
 
3
- #include "ggml-impl.h"
3
+ #define GGML_COMMON_DECL_C
4
+ #include "ggml-common.h"
4
5
 
5
- // GGML internal header
6
-
7
- #include <stdint.h>
8
- #include <stddef.h>
9
-
10
- #define QK4_0 32
11
- typedef struct {
12
- ggml_fp16_t d; // delta
13
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
14
- } block_q4_0;
15
- static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
16
-
17
- #define QK4_1 32
18
- typedef struct {
19
- ggml_fp16_t d; // delta
20
- ggml_fp16_t m; // min
21
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
22
- } block_q4_1;
23
- static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
24
-
25
- #define QK5_0 32
26
- typedef struct {
27
- ggml_fp16_t d; // delta
28
- uint8_t qh[4]; // 5-th bit of quants
29
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
30
- } block_q5_0;
31
- static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
32
-
33
- #define QK5_1 32
34
- typedef struct {
35
- ggml_fp16_t d; // delta
36
- ggml_fp16_t m; // min
37
- uint8_t qh[4]; // 5-th bit of quants
38
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
39
- } block_q5_1;
40
- static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
41
-
42
- #define QK8_0 32
43
- typedef struct {
44
- ggml_fp16_t d; // delta
45
- int8_t qs[QK8_0]; // quants
46
- } block_q8_0;
47
- static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
48
-
49
- #define QK8_1 32
50
- typedef struct {
51
- float d; // delta
52
- float s; // d * sum(qs[i])
53
- int8_t qs[QK8_1]; // quants
54
- } block_q8_1;
55
- static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
56
-
57
- //
58
- // Super-block quantization structures
59
- //
60
-
61
- // Super-block size
62
- #ifdef GGML_QKK_64
63
- #define QK_K 64
64
- #define K_SCALE_SIZE 4
65
- #else
66
- #define QK_K 256
67
- #define K_SCALE_SIZE 12
68
- #endif
69
-
70
- // 2-bit quantization
71
- // weight is represented as x = a * q + b
72
- // 16 blocks of 16 elements each
73
- // Effectively 2.625 bits per weight
74
- typedef struct {
75
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
76
- uint8_t qs[QK_K/4]; // quants
77
- ggml_fp16_t d; // super-block scale for quantized scales
78
- ggml_fp16_t dmin; // super-block scale for quantized mins
79
- } block_q2_K;
80
- static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
81
-
82
- // 3-bit quantization
83
- // weight is represented as x = a * q
84
- // 16 blocks of 16 elements each
85
- // Effectively 3.4375 bits per weight
86
- #ifdef GGML_QKK_64
87
- typedef struct {
88
- uint8_t hmask[QK_K/8]; // quants - high bit
89
- uint8_t qs[QK_K/4]; // quants - low 2 bits
90
- uint8_t scales[2];
91
- ggml_fp16_t d; // super-block scale
92
- } block_q3_K;
93
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
94
- #else
95
- typedef struct {
96
- uint8_t hmask[QK_K/8]; // quants - high bit
97
- uint8_t qs[QK_K/4]; // quants - low 2 bits
98
- uint8_t scales[12]; // scales, quantized with 6 bits
99
- ggml_fp16_t d; // super-block scale
100
- } block_q3_K;
101
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
102
- #endif
103
-
104
- // 4-bit quantization
105
- // 8 blocks of 32 elements each
106
- // weight is represented as x = a * q + b
107
- // Effectively 4.5 bits per weight
108
- #ifdef GGML_QKK_64
109
- typedef struct {
110
- ggml_fp16_t d[2]; // super-block scales/mins
111
- uint8_t scales[2]; // 4-bit block scales/mins
112
- uint8_t qs[QK_K/2]; // 4--bit quants
113
- } block_q4_K;
114
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
115
- #else
116
- typedef struct {
117
- ggml_fp16_t d; // super-block scale for quantized scales
118
- ggml_fp16_t dmin; // super-block scale for quantized mins
119
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
120
- uint8_t qs[QK_K/2]; // 4--bit quants
121
- } block_q4_K;
122
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
123
- #endif
124
-
125
- // 5-bit quantization
126
- // 8 blocks of 32 elements each
127
- // weight is represented as x = a * q + b
128
- // Effectively 5.5 bits per weight
129
- #ifdef GGML_QKK_64
130
- typedef struct {
131
- ggml_fp16_t d; // super-block scale
132
- int8_t scales[QK_K/16]; // 8-bit block scales
133
- uint8_t qh[QK_K/8]; // quants, high bit
134
- uint8_t qs[QK_K/2]; // quants, low 4 bits
135
- } block_q5_K;
136
- static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
137
- #else
138
- typedef struct {
139
- ggml_fp16_t d; // super-block scale for quantized scales
140
- ggml_fp16_t dmin; // super-block scale for quantized mins
141
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
142
- uint8_t qh[QK_K/8]; // quants, high bit
143
- uint8_t qs[QK_K/2]; // quants, low 4 bits
144
- } block_q5_K;
145
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
146
- #endif
147
-
148
- // 6-bit quantization
149
- // weight is represented as x = a * q
150
- // 16 blocks of 16 elements each
151
- // Effectively 6.5625 bits per weight
152
- typedef struct {
153
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
154
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
155
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
156
- ggml_fp16_t d; // super-block scale
157
- } block_q6_K;
158
- static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
159
-
160
- // This is only used for intermediate quantization and dot products
161
- typedef struct {
162
- float d; // delta
163
- int8_t qs[QK_K]; // quants
164
- int16_t bsums[QK_K/16]; // sum of quants in groups of 16
165
- } block_q8_K;
166
- static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
167
-
168
- // (Almost) "true" 2-bit quantization.
169
- // Due to the need to use blocks as per ggml design, it ends up using
170
- // 2.0625 bpw because of the 16-bit scale for each block of 256.
171
- typedef struct {
172
- ggml_fp16_t d;
173
- uint16_t qs[QK_K/8];
174
- } block_iq2_xxs;
175
- static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
176
-
177
- // 2.3125 bpw quants
178
- typedef struct {
179
- ggml_fp16_t d;
180
- uint16_t qs[QK_K/8];
181
- uint8_t scales[QK_K/32];
182
- } block_iq2_xs;
183
- static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
184
-
185
- // 2.5625 bpw quants
186
- typedef struct {
187
- ggml_fp16_t d;
188
- uint8_t qs[QK_K/4];
189
- uint8_t qh[QK_K/32];
190
- uint8_t scales[QK_K/32];
191
- } block_iq2_s;
192
- static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
193
-
194
- // (Almost) "true" 3-bit quantization.
195
- // Due to the need to use blocks as per ggml design, it ends up using
196
- // 3.0625 bpw because of the 16-bit scale for each block of 256.
197
- typedef struct {
198
- ggml_fp16_t d;
199
- uint8_t qs[3*QK_K/8];
200
- } block_iq3_xxs;
201
- static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
202
-
203
- // 3.4375 bpw
204
- #if QK_K == 64
205
- #define IQ3S_N_SCALE 2
206
- #else
207
- #define IQ3S_N_SCALE QK_K/64
208
- #endif
209
- typedef struct {
210
- ggml_fp16_t d;
211
- uint8_t qs[QK_K/4];
212
- uint8_t qh[QK_K/32];
213
- uint8_t signs[QK_K/8];
214
- uint8_t scales[IQ3S_N_SCALE];
215
- } block_iq3_s;
216
- static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
6
+ #include "ggml.h"
217
7
 
218
- typedef struct {
219
- ggml_fp16_t d;
220
- uint8_t qs[QK_K/8];
221
- uint8_t scales[QK_K/16];
222
- } block_iq1_s;
223
- static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
224
-
225
- // Non-linear quants
226
- #define QK4_NL 32
227
- typedef struct {
228
- ggml_fp16_t d;
229
- uint8_t qs[QK4_NL/2];
230
- } block_iq4_nl;
231
- static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
232
-
233
- #if QK_K == 64
234
- #define block_iq4_xs block_iq4_nl
235
- //typedef struct block_iq4_nl block_iq4_xs;
236
- #else
237
- typedef struct {
238
- ggml_fp16_t d;
239
- uint16_t scales_h;
240
- uint8_t scales_l[QK_K/64];
241
- uint8_t qs[QK_K/2];
242
- } block_iq4_xs;
243
- static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
244
- #endif
8
+ // GGML internal header
245
9
 
246
10
  #ifdef __cplusplus
247
11
  extern "C" {
@@ -261,6 +25,7 @@ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGM
261
25
  void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
262
26
  void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
263
27
  void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
28
+
264
29
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
265
30
  void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
266
31
  void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
@@ -280,6 +45,7 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
280
45
  void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
281
46
  void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
282
47
  void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
48
+
283
49
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
284
50
  void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
285
51
  void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@@ -300,6 +66,7 @@ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRI
300
66
  void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
301
67
  void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
302
68
  void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
69
+
303
70
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
304
71
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
305
72
  void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -321,6 +88,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
321
88
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
322
89
  void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
323
90
  void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
91
+
324
92
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
325
93
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
326
94
  void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -330,26 +98,26 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
330
98
  void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
331
99
  void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
332
100
 
333
- //
334
101
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
335
- //
336
- size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
337
- size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
338
- size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
339
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
340
- size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
341
- size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
342
- size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
343
- size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
344
- size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
345
- size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
346
- size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
347
- size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
348
- size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
349
- size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
350
- size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
351
- size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
352
- size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
102
+ size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
103
+ size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
104
+ size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
105
+ size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
106
+ size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
107
+ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
108
+ size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
109
+ size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
110
+
111
+ size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
112
+ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
113
+ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
114
+ size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
115
+ size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
116
+ size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
117
+ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
118
+ size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
119
+ size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
120
+ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
353
121
 
354
122
  void iq2xs_init_impl(enum ggml_type type);
355
123
  void iq2xs_free_impl(enum ggml_type type);