llama_cpp 0.12.3 → 0.12.5

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -166,7 +166,7 @@ typedef struct {
166
166
  static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
167
167
 
168
168
  // (Almost) "true" 2-bit quantization.
169
- // Due to the need to use blocks as per ggml dsign, it ends up using
169
+ // Due to the need to use blocks as per ggml design, it ends up using
170
170
  // 2.0625 bpw because of the 16-bit scale for each block of 256.
171
171
  typedef struct {
172
172
  ggml_fp16_t d;
@@ -182,72 +182,90 @@ typedef struct {
182
182
  } block_iq2_xs;
183
183
  static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
184
184
 
185
+ // (Almost) "true" 3-bit quantization.
186
+ // Due to the need to use blocks as per ggml design, it ends up using
187
+ // 3.0625 bpw because of the 16-bit scale for each block of 256.
188
+ typedef struct {
189
+ ggml_fp16_t d;
190
+ uint8_t qs[3*QK_K/8];
191
+ } block_iq3_xxs;
192
+ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
193
+
194
+ #ifdef __cplusplus
195
+ extern "C" {
196
+ #endif
197
+
185
198
  // Quantization
186
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
187
- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
188
- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
189
- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
190
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
191
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
192
-
193
- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
194
- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
195
- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
196
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
197
- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
198
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
199
-
200
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
201
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
202
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
203
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
204
- void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
205
- void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
206
-
207
- void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
208
- void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
209
- void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
210
- void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
211
- void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
212
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
199
+ void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
200
+ void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
201
+ void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
202
+ void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
203
+ void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
204
+ void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
205
+
206
+ void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
207
+ void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
208
+ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
209
+ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
210
+ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
211
+ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
212
+ void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
213
+
214
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
215
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
216
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
217
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
218
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
219
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
220
+
221
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
222
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
223
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
224
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
225
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
226
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
227
+ void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
213
228
 
214
229
  // Dequantization
215
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
216
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
217
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
218
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
219
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
220
- //void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
221
-
222
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
223
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
224
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
225
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
226
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
227
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
228
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
229
- void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
230
+ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
231
+ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
232
+ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
233
+ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
234
+ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
235
+ //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
236
+
237
+ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
238
+ void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
239
+ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
240
+ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
241
+ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
242
+ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
243
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
244
+ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
245
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
230
246
 
231
247
  // Dot product
232
- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
233
- void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
234
- void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
235
- void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
236
- void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
237
-
238
- void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
239
- void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
240
- void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
241
- void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
242
- void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
243
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
244
- void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
248
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
249
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
250
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
251
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
252
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
253
+
254
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
255
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
256
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
257
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
258
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
259
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
260
+ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
261
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
245
262
 
246
263
  //
247
264
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
248
265
  //
249
266
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
250
267
  size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
268
+ size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
251
269
  size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
252
270
  size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
253
271
  size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@@ -260,3 +278,10 @@ size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row,
260
278
 
261
279
  void iq2xs_init_impl(int grid_size);
262
280
  void iq2xs_free_impl(int grid_size);
281
+ void iq3xs_init_impl(int grid_size);
282
+ void iq3xs_free_impl(int grid_size);
283
+
284
+ #ifdef __cplusplus
285
+ }
286
+ #endif
287
+