llama_cpp 0.9.0 → 0.9.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -1,11 +1,63 @@
1
1
  #pragma once
2
2
 
3
- #include "ggml.h"
3
+ #include "ggml-impl.h"
4
+
5
+ // GGML internal header
4
6
 
5
7
  #include <stdint.h>
6
- #include <assert.h>
7
8
  #include <stddef.h>
8
9
 
10
+ #define QK4_0 32
11
+ typedef struct {
12
+ ggml_fp16_t d; // delta
13
+ uint8_t qs[QK4_0 / 2]; // nibbles / quants
14
+ } block_q4_0;
15
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
16
+
17
+ #define QK4_1 32
18
+ typedef struct {
19
+ ggml_fp16_t d; // delta
20
+ ggml_fp16_t m; // min
21
+ uint8_t qs[QK4_1 / 2]; // nibbles / quants
22
+ } block_q4_1;
23
+ static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
24
+
25
+ #define QK5_0 32
26
+ typedef struct {
27
+ ggml_fp16_t d; // delta
28
+ uint8_t qh[4]; // 5-th bit of quants
29
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
30
+ } block_q5_0;
31
+ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
32
+
33
+ #define QK5_1 32
34
+ typedef struct {
35
+ ggml_fp16_t d; // delta
36
+ ggml_fp16_t m; // min
37
+ uint8_t qh[4]; // 5-th bit of quants
38
+ uint8_t qs[QK5_1 / 2]; // nibbles / quants
39
+ } block_q5_1;
40
+ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
41
+
42
+ #define QK8_0 32
43
+ typedef struct {
44
+ ggml_fp16_t d; // delta
45
+ int8_t qs[QK8_0]; // quants
46
+ } block_q8_0;
47
+ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
48
+
49
+ #define QK8_1 32
50
+ typedef struct {
51
+ float d; // delta
52
+ float s; // d * sum(qs[i])
53
+ int8_t qs[QK8_1]; // quants
54
+ } block_q8_1;
55
+ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
56
+
57
+ //
58
+ // Super-block quantization structures
59
+ //
60
+
9
61
  // Super-block size
10
62
  #ifdef GGML_QKK_64
11
63
  #define QK_K 64
@@ -15,18 +67,6 @@
15
67
  #define K_SCALE_SIZE 12
16
68
  #endif
17
69
 
18
- #ifndef static_assert
19
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
20
- #define static_assert(cond, msg) _Static_assert(cond, msg)
21
- #else
22
- #define static_assert(cond, msg) struct global_scope_noop_trick
23
- #endif
24
- #endif
25
-
26
- //
27
- // Super-block quantization structures
28
- //
29
-
30
70
  // 2-bit quantization
31
71
  // weight is represented as x = a * q + b
32
72
  // 16 blocks of 16 elements each
@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
127
167
 
128
168
 
129
169
  // Quantization
170
+ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
171
+ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
172
+ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
173
+ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
174
+ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
175
+ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
176
+
130
177
  void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
131
178
  void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
132
179
  void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
134
181
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
135
182
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
136
183
 
184
+ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
185
+ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
186
+ void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
187
+ void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
188
+ void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
189
+ void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
190
+
137
191
  void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
138
192
  void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
139
193
  void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
142
196
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
143
197
 
144
198
  // Dequantization
199
+ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
200
+ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
201
+ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
202
+ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
203
+ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
204
+ //void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
205
+
145
206
  void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
146
207
  void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
147
208
  void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
150
211
  void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
151
212
 
152
213
  // Dot product
214
+ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
215
+ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
216
+ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
217
+ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
218
+ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
219
+
153
220
  void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
154
221
  void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
155
222
  void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
156
223
  void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
157
224
  void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
158
-
159
- // Quantization with histogram collection
160
- size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
161
- size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
162
- size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
163
- size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
164
- size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
165
-