llama_cpp 0.8.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,11 +1,63 @@
1
1
  #pragma once
2
2
 
3
- #include "ggml.h"
3
+ #include "ggml-impl.h"
4
+
5
+ // GGML internal header
4
6
 
5
7
  #include <stdint.h>
6
- #include <assert.h>
7
8
  #include <stddef.h>
8
9
 
10
+ #define QK4_0 32
11
+ typedef struct {
12
+ ggml_fp16_t d; // delta
13
+ uint8_t qs[QK4_0 / 2]; // nibbles / quants
14
+ } block_q4_0;
15
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
16
+
17
+ #define QK4_1 32
18
+ typedef struct {
19
+ ggml_fp16_t d; // delta
20
+ ggml_fp16_t m; // min
21
+ uint8_t qs[QK4_1 / 2]; // nibbles / quants
22
+ } block_q4_1;
23
+ static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
24
+
25
+ #define QK5_0 32
26
+ typedef struct {
27
+ ggml_fp16_t d; // delta
28
+ uint8_t qh[4]; // 5-th bit of quants
29
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
30
+ } block_q5_0;
31
+ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
32
+
33
+ #define QK5_1 32
34
+ typedef struct {
35
+ ggml_fp16_t d; // delta
36
+ ggml_fp16_t m; // min
37
+ uint8_t qh[4]; // 5-th bit of quants
38
+ uint8_t qs[QK5_1 / 2]; // nibbles / quants
39
+ } block_q5_1;
40
+ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
41
+
42
+ #define QK8_0 32
43
+ typedef struct {
44
+ ggml_fp16_t d; // delta
45
+ int8_t qs[QK8_0]; // quants
46
+ } block_q8_0;
47
+ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
48
+
49
+ #define QK8_1 32
50
+ typedef struct {
51
+ float d; // delta
52
+ float s; // d * sum(qs[i])
53
+ int8_t qs[QK8_1]; // quants
54
+ } block_q8_1;
55
+ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
56
+
57
+ //
58
+ // Super-block quantization structures
59
+ //
60
+
9
61
  // Super-block size
10
62
  #ifdef GGML_QKK_64
11
63
  #define QK_K 64
@@ -15,18 +67,6 @@
15
67
  #define K_SCALE_SIZE 12
16
68
  #endif
17
69
 
18
- #ifndef static_assert
19
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
20
- #define static_assert(cond, msg) _Static_assert(cond, msg)
21
- #else
22
- #define static_assert(cond, msg) struct global_scope_noop_trick
23
- #endif
24
- #endif
25
-
26
- //
27
- // Super-block quantization structures
28
- //
29
-
30
70
  // 2-bit quantization
31
71
  // weight is represented as x = a * q + b
32
72
  // 16 blocks of 16 elements each
@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
127
167
 
128
168
 
129
169
  // Quantization
170
+ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
171
+ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
172
+ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
173
+ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
174
+ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
175
+ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
176
+
130
177
  void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
131
178
  void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
132
179
  void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
134
181
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
135
182
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
136
183
 
184
+ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
185
+ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
186
+ void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
187
+ void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
188
+ void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
189
+ void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
190
+
137
191
  void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
138
192
  void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
139
193
  void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
142
196
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
143
197
 
144
198
  // Dequantization
199
+ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
200
+ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
201
+ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
202
+ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
203
+ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
204
+ //void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
205
+
145
206
  void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
146
207
  void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
147
208
  void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
150
211
  void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
151
212
 
152
213
  // Dot product
214
+ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
215
+ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
216
+ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
217
+ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
218
+ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
219
+
153
220
  void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
154
221
  void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
155
222
  void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
156
223
  void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
157
224
  void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
158
-
159
- // Quantization with histogram collection
160
- size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
161
- size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
162
- size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
163
- size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
164
- size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
165
-