llama_cpp 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
@@ -1,11 +1,63 @@
|
|
1
1
|
#pragma once
|
2
2
|
|
3
|
-
#include "ggml.h"
|
3
|
+
#include "ggml-impl.h"
|
4
|
+
|
5
|
+
// GGML internal header
|
4
6
|
|
5
7
|
#include <stdint.h>
|
6
|
-
#include <assert.h>
|
7
8
|
#include <stddef.h>
|
8
9
|
|
10
|
+
#define QK4_0 32
|
11
|
+
typedef struct {
|
12
|
+
ggml_fp16_t d; // delta
|
13
|
+
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
14
|
+
} block_q4_0;
|
15
|
+
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
16
|
+
|
17
|
+
#define QK4_1 32
|
18
|
+
typedef struct {
|
19
|
+
ggml_fp16_t d; // delta
|
20
|
+
ggml_fp16_t m; // min
|
21
|
+
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
22
|
+
} block_q4_1;
|
23
|
+
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
24
|
+
|
25
|
+
#define QK5_0 32
|
26
|
+
typedef struct {
|
27
|
+
ggml_fp16_t d; // delta
|
28
|
+
uint8_t qh[4]; // 5-th bit of quants
|
29
|
+
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
30
|
+
} block_q5_0;
|
31
|
+
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
32
|
+
|
33
|
+
#define QK5_1 32
|
34
|
+
typedef struct {
|
35
|
+
ggml_fp16_t d; // delta
|
36
|
+
ggml_fp16_t m; // min
|
37
|
+
uint8_t qh[4]; // 5-th bit of quants
|
38
|
+
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
39
|
+
} block_q5_1;
|
40
|
+
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
41
|
+
|
42
|
+
#define QK8_0 32
|
43
|
+
typedef struct {
|
44
|
+
ggml_fp16_t d; // delta
|
45
|
+
int8_t qs[QK8_0]; // quants
|
46
|
+
} block_q8_0;
|
47
|
+
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
48
|
+
|
49
|
+
#define QK8_1 32
|
50
|
+
typedef struct {
|
51
|
+
float d; // delta
|
52
|
+
float s; // d * sum(qs[i])
|
53
|
+
int8_t qs[QK8_1]; // quants
|
54
|
+
} block_q8_1;
|
55
|
+
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
56
|
+
|
57
|
+
//
|
58
|
+
// Super-block quantization structures
|
59
|
+
//
|
60
|
+
|
9
61
|
// Super-block size
|
10
62
|
#ifdef GGML_QKK_64
|
11
63
|
#define QK_K 64
|
@@ -15,18 +67,6 @@
|
|
15
67
|
#define K_SCALE_SIZE 12
|
16
68
|
#endif
|
17
69
|
|
18
|
-
#ifndef static_assert
|
19
|
-
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
20
|
-
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
21
|
-
#else
|
22
|
-
#define static_assert(cond, msg) struct global_scope_noop_trick
|
23
|
-
#endif
|
24
|
-
#endif
|
25
|
-
|
26
|
-
//
|
27
|
-
// Super-block quantization structures
|
28
|
-
//
|
29
|
-
|
30
70
|
// 2-bit quantization
|
31
71
|
// weight is represented as x = a * q + b
|
32
72
|
// 16 blocks of 16 elements each
|
@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
|
|
127
167
|
|
128
168
|
|
129
169
|
// Quantization
|
170
|
+
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
171
|
+
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
|
172
|
+
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
|
173
|
+
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
|
174
|
+
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
|
175
|
+
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
|
176
|
+
|
130
177
|
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
131
178
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
132
179
|
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
134
181
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
135
182
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
136
183
|
|
184
|
+
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
185
|
+
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
186
|
+
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
|
187
|
+
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
|
188
|
+
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
|
189
|
+
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
|
190
|
+
|
137
191
|
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
138
192
|
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
139
193
|
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
|
142
196
|
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
143
197
|
|
144
198
|
// Dequantization
|
199
|
+
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
200
|
+
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
|
201
|
+
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
|
202
|
+
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
|
203
|
+
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
|
204
|
+
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
|
205
|
+
|
145
206
|
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
146
207
|
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
147
208
|
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
|
|
150
211
|
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
151
212
|
|
152
213
|
// Dot product
|
214
|
+
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
215
|
+
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
216
|
+
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
217
|
+
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
218
|
+
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
219
|
+
|
153
220
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
154
221
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
155
222
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
156
223
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
157
224
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
158
|
-
|
159
|
-
// Quantization with histogram collection
|
160
|
-
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
161
|
-
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
162
|
-
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
163
|
-
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
164
|
-
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
165
|
-
|