llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -1,247 +1,11 @@
|
|
1
1
|
#pragma once
|
2
2
|
|
3
|
-
#
|
3
|
+
#define GGML_COMMON_DECL_C
|
4
|
+
#include "ggml-common.h"
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
#include <stdint.h>
|
8
|
-
#include <stddef.h>
|
9
|
-
|
10
|
-
#define QK4_0 32
|
11
|
-
typedef struct {
|
12
|
-
ggml_fp16_t d; // delta
|
13
|
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
14
|
-
} block_q4_0;
|
15
|
-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
16
|
-
|
17
|
-
#define QK4_1 32
|
18
|
-
typedef struct {
|
19
|
-
ggml_fp16_t d; // delta
|
20
|
-
ggml_fp16_t m; // min
|
21
|
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
22
|
-
} block_q4_1;
|
23
|
-
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
24
|
-
|
25
|
-
#define QK5_0 32
|
26
|
-
typedef struct {
|
27
|
-
ggml_fp16_t d; // delta
|
28
|
-
uint8_t qh[4]; // 5-th bit of quants
|
29
|
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
30
|
-
} block_q5_0;
|
31
|
-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
32
|
-
|
33
|
-
#define QK5_1 32
|
34
|
-
typedef struct {
|
35
|
-
ggml_fp16_t d; // delta
|
36
|
-
ggml_fp16_t m; // min
|
37
|
-
uint8_t qh[4]; // 5-th bit of quants
|
38
|
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
39
|
-
} block_q5_1;
|
40
|
-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
41
|
-
|
42
|
-
#define QK8_0 32
|
43
|
-
typedef struct {
|
44
|
-
ggml_fp16_t d; // delta
|
45
|
-
int8_t qs[QK8_0]; // quants
|
46
|
-
} block_q8_0;
|
47
|
-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
48
|
-
|
49
|
-
#define QK8_1 32
|
50
|
-
typedef struct {
|
51
|
-
float d; // delta
|
52
|
-
float s; // d * sum(qs[i])
|
53
|
-
int8_t qs[QK8_1]; // quants
|
54
|
-
} block_q8_1;
|
55
|
-
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
56
|
-
|
57
|
-
//
|
58
|
-
// Super-block quantization structures
|
59
|
-
//
|
60
|
-
|
61
|
-
// Super-block size
|
62
|
-
#ifdef GGML_QKK_64
|
63
|
-
#define QK_K 64
|
64
|
-
#define K_SCALE_SIZE 4
|
65
|
-
#else
|
66
|
-
#define QK_K 256
|
67
|
-
#define K_SCALE_SIZE 12
|
68
|
-
#endif
|
69
|
-
|
70
|
-
// 2-bit quantization
|
71
|
-
// weight is represented as x = a * q + b
|
72
|
-
// 16 blocks of 16 elements each
|
73
|
-
// Effectively 2.625 bits per weight
|
74
|
-
typedef struct {
|
75
|
-
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
76
|
-
uint8_t qs[QK_K/4]; // quants
|
77
|
-
ggml_fp16_t d; // super-block scale for quantized scales
|
78
|
-
ggml_fp16_t dmin; // super-block scale for quantized mins
|
79
|
-
} block_q2_K;
|
80
|
-
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
81
|
-
|
82
|
-
// 3-bit quantization
|
83
|
-
// weight is represented as x = a * q
|
84
|
-
// 16 blocks of 16 elements each
|
85
|
-
// Effectively 3.4375 bits per weight
|
86
|
-
#ifdef GGML_QKK_64
|
87
|
-
typedef struct {
|
88
|
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
89
|
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
90
|
-
uint8_t scales[2];
|
91
|
-
ggml_fp16_t d; // super-block scale
|
92
|
-
} block_q3_K;
|
93
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
94
|
-
#else
|
95
|
-
typedef struct {
|
96
|
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
97
|
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
98
|
-
uint8_t scales[12]; // scales, quantized with 6 bits
|
99
|
-
ggml_fp16_t d; // super-block scale
|
100
|
-
} block_q3_K;
|
101
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
102
|
-
#endif
|
103
|
-
|
104
|
-
// 4-bit quantization
|
105
|
-
// 8 blocks of 32 elements each
|
106
|
-
// weight is represented as x = a * q + b
|
107
|
-
// Effectively 4.5 bits per weight
|
108
|
-
#ifdef GGML_QKK_64
|
109
|
-
typedef struct {
|
110
|
-
ggml_fp16_t d[2]; // super-block scales/mins
|
111
|
-
uint8_t scales[2]; // 4-bit block scales/mins
|
112
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
113
|
-
} block_q4_K;
|
114
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
115
|
-
#else
|
116
|
-
typedef struct {
|
117
|
-
ggml_fp16_t d; // super-block scale for quantized scales
|
118
|
-
ggml_fp16_t dmin; // super-block scale for quantized mins
|
119
|
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
120
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
121
|
-
} block_q4_K;
|
122
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
123
|
-
#endif
|
124
|
-
|
125
|
-
// 5-bit quantization
|
126
|
-
// 8 blocks of 32 elements each
|
127
|
-
// weight is represented as x = a * q + b
|
128
|
-
// Effectively 5.5 bits per weight
|
129
|
-
#ifdef GGML_QKK_64
|
130
|
-
typedef struct {
|
131
|
-
ggml_fp16_t d; // super-block scale
|
132
|
-
int8_t scales[QK_K/16]; // 8-bit block scales
|
133
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
134
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
135
|
-
} block_q5_K;
|
136
|
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
137
|
-
#else
|
138
|
-
typedef struct {
|
139
|
-
ggml_fp16_t d; // super-block scale for quantized scales
|
140
|
-
ggml_fp16_t dmin; // super-block scale for quantized mins
|
141
|
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
142
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
143
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
144
|
-
} block_q5_K;
|
145
|
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
146
|
-
#endif
|
147
|
-
|
148
|
-
// 6-bit quantization
|
149
|
-
// weight is represented as x = a * q
|
150
|
-
// 16 blocks of 16 elements each
|
151
|
-
// Effectively 6.5625 bits per weight
|
152
|
-
typedef struct {
|
153
|
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
154
|
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
155
|
-
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
156
|
-
ggml_fp16_t d; // super-block scale
|
157
|
-
} block_q6_K;
|
158
|
-
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
|
159
|
-
|
160
|
-
// This is only used for intermediate quantization and dot products
|
161
|
-
typedef struct {
|
162
|
-
float d; // delta
|
163
|
-
int8_t qs[QK_K]; // quants
|
164
|
-
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
|
165
|
-
} block_q8_K;
|
166
|
-
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
167
|
-
|
168
|
-
// (Almost) "true" 2-bit quantization.
|
169
|
-
// Due to the need to use blocks as per ggml design, it ends up using
|
170
|
-
// 2.0625 bpw because of the 16-bit scale for each block of 256.
|
171
|
-
typedef struct {
|
172
|
-
ggml_fp16_t d;
|
173
|
-
uint16_t qs[QK_K/8];
|
174
|
-
} block_iq2_xxs;
|
175
|
-
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
176
|
-
|
177
|
-
// 2.3125 bpw quants
|
178
|
-
typedef struct {
|
179
|
-
ggml_fp16_t d;
|
180
|
-
uint16_t qs[QK_K/8];
|
181
|
-
uint8_t scales[QK_K/32];
|
182
|
-
} block_iq2_xs;
|
183
|
-
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
184
|
-
|
185
|
-
// 2.5625 bpw quants
|
186
|
-
typedef struct {
|
187
|
-
ggml_fp16_t d;
|
188
|
-
uint8_t qs[QK_K/4];
|
189
|
-
uint8_t qh[QK_K/32];
|
190
|
-
uint8_t scales[QK_K/32];
|
191
|
-
} block_iq2_s;
|
192
|
-
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
193
|
-
|
194
|
-
// (Almost) "true" 3-bit quantization.
|
195
|
-
// Due to the need to use blocks as per ggml design, it ends up using
|
196
|
-
// 3.0625 bpw because of the 16-bit scale for each block of 256.
|
197
|
-
typedef struct {
|
198
|
-
ggml_fp16_t d;
|
199
|
-
uint8_t qs[3*QK_K/8];
|
200
|
-
} block_iq3_xxs;
|
201
|
-
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
202
|
-
|
203
|
-
// 3.4375 bpw
|
204
|
-
#if QK_K == 64
|
205
|
-
#define IQ3S_N_SCALE 2
|
206
|
-
#else
|
207
|
-
#define IQ3S_N_SCALE QK_K/64
|
208
|
-
#endif
|
209
|
-
typedef struct {
|
210
|
-
ggml_fp16_t d;
|
211
|
-
uint8_t qs[QK_K/4];
|
212
|
-
uint8_t qh[QK_K/32];
|
213
|
-
uint8_t signs[QK_K/8];
|
214
|
-
uint8_t scales[IQ3S_N_SCALE];
|
215
|
-
} block_iq3_s;
|
216
|
-
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
6
|
+
#include "ggml.h"
|
217
7
|
|
218
|
-
|
219
|
-
ggml_fp16_t d;
|
220
|
-
uint8_t qs[QK_K/8];
|
221
|
-
uint8_t scales[QK_K/16];
|
222
|
-
} block_iq1_s;
|
223
|
-
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
224
|
-
|
225
|
-
// Non-linear quants
|
226
|
-
#define QK4_NL 32
|
227
|
-
typedef struct {
|
228
|
-
ggml_fp16_t d;
|
229
|
-
uint8_t qs[QK4_NL/2];
|
230
|
-
} block_iq4_nl;
|
231
|
-
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
232
|
-
|
233
|
-
#if QK_K == 64
|
234
|
-
#define block_iq4_xs block_iq4_nl
|
235
|
-
//typedef struct block_iq4_nl block_iq4_xs;
|
236
|
-
#else
|
237
|
-
typedef struct {
|
238
|
-
ggml_fp16_t d;
|
239
|
-
uint16_t scales_h;
|
240
|
-
uint8_t scales_l[QK_K/64];
|
241
|
-
uint8_t qs[QK_K/2];
|
242
|
-
} block_iq4_xs;
|
243
|
-
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
244
|
-
#endif
|
8
|
+
// GGML internal header
|
245
9
|
|
246
10
|
#ifdef __cplusplus
|
247
11
|
extern "C" {
|
@@ -261,6 +25,7 @@ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGM
|
|
261
25
|
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
262
26
|
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
263
27
|
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
28
|
+
|
264
29
|
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
265
30
|
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
266
31
|
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
|
@@ -280,6 +45,7 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|
280
45
|
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
281
46
|
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
282
47
|
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
48
|
+
|
283
49
|
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
284
50
|
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
285
51
|
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
@@ -300,6 +66,7 @@ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRI
|
|
300
66
|
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
301
67
|
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
302
68
|
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
69
|
+
|
303
70
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
304
71
|
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
305
72
|
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
@@ -321,6 +88,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
321
88
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
322
89
|
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
323
90
|
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
91
|
+
|
324
92
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
325
93
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
326
94
|
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
@@ -330,26 +98,26 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
330
98
|
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
331
99
|
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
332
100
|
|
333
|
-
//
|
334
101
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
335
|
-
|
336
|
-
size_t
|
337
|
-
size_t
|
338
|
-
size_t
|
339
|
-
size_t
|
340
|
-
size_t
|
341
|
-
size_t
|
342
|
-
size_t
|
343
|
-
|
344
|
-
size_t quantize_q2_K
|
345
|
-
size_t quantize_q3_K
|
346
|
-
size_t quantize_q4_K
|
347
|
-
size_t quantize_q5_K
|
348
|
-
size_t quantize_q6_K
|
349
|
-
size_t quantize_q4_0
|
350
|
-
size_t quantize_q4_1
|
351
|
-
size_t quantize_q5_0
|
352
|
-
size_t quantize_q5_1
|
102
|
+
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
103
|
+
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
104
|
+
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
105
|
+
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
106
|
+
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
107
|
+
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
108
|
+
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
109
|
+
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
110
|
+
|
111
|
+
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
112
|
+
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
113
|
+
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
114
|
+
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
115
|
+
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
116
|
+
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
117
|
+
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
118
|
+
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
119
|
+
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
120
|
+
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
353
121
|
|
354
122
|
void iq2xs_init_impl(enum ggml_type type);
|
355
123
|
void iq2xs_free_impl(enum ggml_type type);
|