llama_cpp 0.7.1 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -231,8 +231,9 @@
|
|
231
231
|
#define GGML_EXIT_SUCCESS 0
|
232
232
|
#define GGML_EXIT_ABORTED 1
|
233
233
|
|
234
|
-
#define GGUF_MAGIC
|
235
|
-
|
234
|
+
#define GGUF_MAGIC "GGUF"
|
235
|
+
|
236
|
+
#define GGUF_VERSION 3
|
236
237
|
|
237
238
|
#define GGUF_DEFAULT_ALIGNMENT 32
|
238
239
|
|
@@ -400,15 +401,16 @@ extern "C" {
|
|
400
401
|
GGML_OP_ALIBI,
|
401
402
|
GGML_OP_CLAMP,
|
402
403
|
GGML_OP_CONV_1D,
|
403
|
-
|
404
|
+
GGML_OP_CONV_1D_STAGE_0, // internal
|
405
|
+
GGML_OP_CONV_1D_STAGE_1, // internal
|
404
406
|
GGML_OP_CONV_TRANSPOSE_1D,
|
407
|
+
GGML_OP_CONV_2D,
|
408
|
+
GGML_OP_CONV_2D_STAGE_0, // internal
|
409
|
+
GGML_OP_CONV_2D_STAGE_1, // internal
|
405
410
|
GGML_OP_CONV_TRANSPOSE_2D,
|
406
411
|
GGML_OP_POOL_1D,
|
407
412
|
GGML_OP_POOL_2D,
|
408
413
|
|
409
|
-
GGML_OP_CONV_1D_STAGE_0, // internal
|
410
|
-
GGML_OP_CONV_1D_STAGE_1, // internal
|
411
|
-
|
412
414
|
GGML_OP_UPSCALE, // nearest interpolate
|
413
415
|
|
414
416
|
GGML_OP_FLASH_ATTN,
|
@@ -1019,9 +1021,9 @@ extern "C" {
|
|
1019
1021
|
struct ggml_tensor * b,
|
1020
1022
|
float eps);
|
1021
1023
|
|
1022
|
-
// A:
|
1023
|
-
// B:
|
1024
|
-
// result is
|
1024
|
+
// A: k columns, n rows => [ne03, ne02, n, k]
|
1025
|
+
// B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
|
1026
|
+
// result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
|
1025
1027
|
GGML_API struct ggml_tensor * ggml_mul_mat(
|
1026
1028
|
struct ggml_context * ctx,
|
1027
1029
|
struct ggml_tensor * a,
|
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
|
|
46
46
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
47
47
|
#include <intrin.h>
|
48
48
|
#else
|
49
|
-
#if !defined(__riscv)
|
49
|
+
#if !defined(__riscv) && !defined(__s390__)
|
50
50
|
#include <immintrin.h>
|
51
51
|
#endif
|
52
52
|
#endif
|
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
|
462
462
|
}
|
463
463
|
|
464
464
|
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
465
|
-
|
466
|
-
|
467
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
468
|
-
(void)hist;
|
465
|
+
(void)hist; // TODO: collect histograms
|
469
466
|
|
470
|
-
for (int j = 0; j < nb; j += k) {
|
467
|
+
for (int j = 0; j < n; j += k) {
|
471
468
|
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
472
469
|
quantize_row_q2_K_reference(src + j, y, k);
|
473
470
|
}
|
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
|
678
675
|
}
|
679
676
|
|
680
677
|
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
681
|
-
|
682
|
-
|
683
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
684
|
-
(void)hist;
|
678
|
+
(void)hist; // TODO: collect histograms
|
685
679
|
|
686
|
-
for (int j = 0; j < nb; j += k) {
|
680
|
+
for (int j = 0; j < n; j += k) {
|
687
681
|
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
688
682
|
quantize_row_q3_K_reference(src + j, y, k);
|
689
683
|
}
|
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
|
846
840
|
|
847
841
|
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
848
842
|
assert(k % QK_K == 0);
|
849
|
-
const int nb = k / QK_K;
|
850
843
|
(void)hist; // TODO: collect histograms
|
851
|
-
|
844
|
+
|
845
|
+
for (int j = 0; j < n; j += k) {
|
852
846
|
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
853
847
|
quantize_row_q4_K_reference(src + j, y, k);
|
854
848
|
}
|
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
1052
1046
|
|
1053
1047
|
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
1054
1048
|
assert(k % QK_K == 0);
|
1055
|
-
|
1056
|
-
|
1057
|
-
for (int j = 0; j < nb; j += k) {
|
1049
|
+
(void)hist; // TODO: collect histograms
|
1050
|
+
|
1051
|
+
for (int j = 0; j < n; j += k) {
|
1058
1052
|
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
1059
1053
|
quantize_row_q5_K_reference(src + j, y, k);
|
1060
1054
|
}
|
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
1200
1194
|
|
1201
1195
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
1202
1196
|
assert(k % QK_K == 0);
|
1203
|
-
|
1204
|
-
|
1205
|
-
(void)hist; // TODO
|
1197
|
+
(void)hist; // TODO: collect histograms
|
1206
1198
|
|
1207
|
-
for (int j = 0; j < nb; j += k) {
|
1199
|
+
for (int j = 0; j < n; j += k) {
|
1208
1200
|
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
1209
1201
|
quantize_row_q6_K_reference(src + j, y, k);
|
1210
1202
|
}
|