llama_cpp 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -231,8 +231,9 @@
|
|
231
231
|
#define GGML_EXIT_SUCCESS 0
|
232
232
|
#define GGML_EXIT_ABORTED 1
|
233
233
|
|
234
|
-
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
235
|
-
|
234
|
+
#define GGUF_MAGIC "GGUF"
|
235
|
+
|
236
|
+
#define GGUF_VERSION 3
|
236
237
|
|
237
238
|
#define GGUF_DEFAULT_ALIGNMENT 32
|
238
239
|
|
@@ -400,15 +401,16 @@ extern "C" {
|
|
400
401
|
GGML_OP_ALIBI,
|
401
402
|
GGML_OP_CLAMP,
|
402
403
|
GGML_OP_CONV_1D,
|
403
|
-
|
404
|
+
GGML_OP_CONV_1D_STAGE_0, // internal
|
405
|
+
GGML_OP_CONV_1D_STAGE_1, // internal
|
404
406
|
GGML_OP_CONV_TRANSPOSE_1D,
|
407
|
+
GGML_OP_CONV_2D,
|
408
|
+
GGML_OP_CONV_2D_STAGE_0, // internal
|
409
|
+
GGML_OP_CONV_2D_STAGE_1, // internal
|
405
410
|
GGML_OP_CONV_TRANSPOSE_2D,
|
406
411
|
GGML_OP_POOL_1D,
|
407
412
|
GGML_OP_POOL_2D,
|
408
413
|
|
409
|
-
GGML_OP_CONV_1D_STAGE_0, // internal
|
410
|
-
GGML_OP_CONV_1D_STAGE_1, // internal
|
411
|
-
|
412
414
|
GGML_OP_UPSCALE, // nearest interpolate
|
413
415
|
|
414
416
|
GGML_OP_FLASH_ATTN,
|
@@ -1019,9 +1021,9 @@ extern "C" {
|
|
1019
1021
|
struct ggml_tensor * b,
|
1020
1022
|
float eps);
|
1021
1023
|
|
1022
|
-
// A: n columns, m rows
|
1023
|
-
// B: n columns, p rows (i.e. we transpose it internally)
|
1024
|
-
// result is m columns, p rows
|
1024
|
+
// A: k columns, n rows => [ne03, ne02, n, k]
|
1025
|
+
// B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
|
1026
|
+
// result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
|
1025
1027
|
GGML_API struct ggml_tensor * ggml_mul_mat(
|
1026
1028
|
struct ggml_context * ctx,
|
1027
1029
|
struct ggml_tensor * a,
|
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
|
|
46
46
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
47
47
|
#include <intrin.h>
|
48
48
|
#else
|
49
|
-
#if !defined(__riscv)
|
49
|
+
#if !defined(__riscv) && !defined(__s390__)
|
50
50
|
#include <immintrin.h>
|
51
51
|
#endif
|
52
52
|
#endif
|
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
|
462
462
|
}
|
463
463
|
|
464
464
|
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
465
|
-
|
466
|
-
|
467
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
468
|
-
(void)hist;
|
465
|
+
(void)hist; // TODO: collect histograms
|
469
466
|
|
470
|
-
for (int j = 0; j < nb; j += k) {
|
467
|
+
for (int j = 0; j < n; j += k) {
|
471
468
|
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
472
469
|
quantize_row_q2_K_reference(src + j, y, k);
|
473
470
|
}
|
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
|
678
675
|
}
|
679
676
|
|
680
677
|
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
681
|
-
|
682
|
-
|
683
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
684
|
-
(void)hist;
|
678
|
+
(void)hist; // TODO: collect histograms
|
685
679
|
|
686
|
-
for (int j = 0; j < nb; j += k) {
|
680
|
+
for (int j = 0; j < n; j += k) {
|
687
681
|
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
688
682
|
quantize_row_q3_K_reference(src + j, y, k);
|
689
683
|
}
|
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
|
846
840
|
|
847
841
|
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
848
842
|
assert(k % QK_K == 0);
|
849
|
-
const int nb = k / QK_K;
|
850
843
|
(void)hist; // TODO: collect histograms
|
851
|
-
|
844
|
+
|
845
|
+
for (int j = 0; j < n; j += k) {
|
852
846
|
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
853
847
|
quantize_row_q4_K_reference(src + j, y, k);
|
854
848
|
}
|
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
1052
1046
|
|
1053
1047
|
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
1054
1048
|
assert(k % QK_K == 0);
|
1055
|
-
|
1056
|
-
|
1057
|
-
for (int j = 0; j < nb; j += k) {
|
1049
|
+
(void)hist; // TODO: collect histograms
|
1050
|
+
|
1051
|
+
for (int j = 0; j < n; j += k) {
|
1058
1052
|
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
1059
1053
|
quantize_row_q5_K_reference(src + j, y, k);
|
1060
1054
|
}
|
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
1200
1194
|
|
1201
1195
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
1202
1196
|
assert(k % QK_K == 0);
|
1203
|
-
|
1204
|
-
|
1205
|
-
(void)hist; // TODO
|
1197
|
+
(void)hist; // TODO: collect histograms
|
1206
1198
|
|
1207
|
-
for (int j = 0; j < nb; j += k) {
|
1199
|
+
for (int j = 0; j < n; j += k) {
|
1208
1200
|
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
1209
1201
|
quantize_row_q6_K_reference(src + j, y, k);
|
1210
1202
|
}
|