llama_cpp 0.14.4 → 0.14.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +11 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +7 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +155 -155
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +878 -216
- data/vendor/tmp/llama.cpp/ggml.c +8 -8
- data/vendor/tmp/llama.cpp/ggml.h +7 -7
- data/vendor/tmp/llama.cpp/llama.cpp +686 -124
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- metadata +2 -2
@@ -12,70 +12,70 @@ extern "C" {
|
|
12
12
|
#endif
|
13
13
|
|
14
14
|
// Quantization
|
15
|
-
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y,
|
16
|
-
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y,
|
17
|
-
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y,
|
18
|
-
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y,
|
19
|
-
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y,
|
20
|
-
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y,
|
21
|
-
|
22
|
-
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y,
|
23
|
-
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y,
|
24
|
-
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y,
|
25
|
-
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y,
|
26
|
-
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y,
|
27
|
-
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y,
|
28
|
-
|
29
|
-
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y,
|
30
|
-
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y,
|
31
|
-
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y,
|
32
|
-
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y,
|
33
|
-
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y,
|
34
|
-
|
35
|
-
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
36
|
-
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
37
|
-
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
38
|
-
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
39
|
-
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
40
|
-
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
41
|
-
|
42
|
-
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
43
|
-
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
44
|
-
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
45
|
-
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
46
|
-
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
47
|
-
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
48
|
-
|
49
|
-
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
50
|
-
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
51
|
-
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
52
|
-
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
53
|
-
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
15
|
+
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
16
|
+
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
17
|
+
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
18
|
+
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
19
|
+
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
20
|
+
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
21
|
+
|
22
|
+
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
23
|
+
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
24
|
+
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
25
|
+
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
26
|
+
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
27
|
+
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
28
|
+
|
29
|
+
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
30
|
+
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
31
|
+
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
32
|
+
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
33
|
+
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
34
|
+
|
35
|
+
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
36
|
+
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
37
|
+
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
38
|
+
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
39
|
+
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
40
|
+
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
41
|
+
|
42
|
+
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
43
|
+
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
44
|
+
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
45
|
+
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
46
|
+
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
47
|
+
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
48
|
+
|
49
|
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
50
|
+
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
51
|
+
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
52
|
+
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
53
|
+
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
54
54
|
|
55
55
|
// Dequantization
|
56
|
-
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
57
|
-
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
58
|
-
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
59
|
-
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
60
|
-
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
61
|
-
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
62
|
-
|
63
|
-
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
64
|
-
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
65
|
-
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
66
|
-
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
67
|
-
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
68
|
-
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
69
|
-
|
70
|
-
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
71
|
-
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
72
|
-
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
73
|
-
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
74
|
-
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
75
|
-
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
76
|
-
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
77
|
-
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
78
|
-
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
56
|
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
57
|
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
58
|
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
59
|
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
60
|
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
61
|
+
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
62
|
+
|
63
|
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
64
|
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
65
|
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
66
|
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
67
|
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
68
|
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
69
|
+
|
70
|
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
71
|
+
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
72
|
+
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
73
|
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
74
|
+
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
75
|
+
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
76
|
+
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
77
|
+
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
78
|
+
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
79
79
|
|
80
80
|
// Dot product
|
81
81
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
101
101
|
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
102
102
|
|
103
103
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
104
|
-
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
105
|
-
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
106
|
-
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
107
|
-
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
108
|
-
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
109
|
-
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
110
|
-
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
111
|
-
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
112
|
-
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
113
|
-
|
114
|
-
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
115
|
-
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
116
|
-
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
117
|
-
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
118
|
-
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
119
|
-
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
120
|
-
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
121
|
-
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
122
|
-
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
123
|
-
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
104
|
+
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
105
|
+
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
106
|
+
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
107
|
+
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
108
|
+
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
109
|
+
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
110
|
+
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
111
|
+
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
112
|
+
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
113
|
+
|
114
|
+
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
115
|
+
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
116
|
+
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
117
|
+
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
118
|
+
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
119
|
+
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
120
|
+
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
121
|
+
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
122
|
+
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
123
|
+
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
124
124
|
|
125
125
|
void iq2xs_init_impl(enum ggml_type type);
|
126
126
|
void iq2xs_free_impl(enum ggml_type type);
|