llama_cpp 0.7.1 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-metal.m +44 -3
- data/ext/llama_cpp/src/ggml-metal.metal +162 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +30 -56
- data/ext/llama_cpp/src/ggml.c +13 -9
- data/ext/llama_cpp/src/ggml.h +3 -2
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +359 -58
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -13537,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
|
|
13537
13537
|
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
13538
13538
|
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
13539
13539
|
}
|
13540
|
-
} if (!is_neox) {
|
13540
|
+
} else if (!is_neox) {
|
13541
13541
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
13542
13542
|
const float cos_theta = cosf(theta);
|
13543
13543
|
const float sin_theta = sinf(theta);
|
@@ -19170,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
19170
19170
|
|
19171
19171
|
if (idx == -1) {
|
19172
19172
|
fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
|
19173
|
+
fclose(fout);
|
19173
19174
|
return;
|
19174
19175
|
}
|
19175
19176
|
|
@@ -20844,7 +20845,7 @@ struct gguf_kv {
|
|
20844
20845
|
};
|
20845
20846
|
|
20846
20847
|
struct gguf_header {
|
20847
|
-
|
20848
|
+
char magic[4];
|
20848
20849
|
uint32_t version;
|
20849
20850
|
uint64_t n_tensors; // GGUFv2
|
20850
20851
|
uint64_t n_kv; // GGUFv2
|
@@ -20914,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
|
|
20914
20915
|
struct gguf_context * gguf_init_empty(void) {
|
20915
20916
|
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
20916
20917
|
|
20917
|
-
ctx->header.magic = GGUF_MAGIC;
|
20918
|
+
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
|
20918
20919
|
ctx->header.version = GGUF_VERSION;
|
20919
20920
|
ctx->header.n_tensors = 0;
|
20920
20921
|
ctx->header.n_kv = 0;
|
@@ -20940,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
20940
20941
|
// offset from start of file
|
20941
20942
|
size_t offset = 0;
|
20942
20943
|
|
20943
|
-
|
20944
|
+
char magic[4];
|
20944
20945
|
|
20945
20946
|
// check the magic before making allocations
|
20946
20947
|
{
|
20947
20948
|
gguf_fread_el(file, &magic, sizeof(magic), &offset);
|
20948
20949
|
|
20949
|
-
|
20950
|
-
|
20951
|
-
|
20952
|
-
|
20950
|
+
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
20951
|
+
if (magic[i] != GGUF_MAGIC[i]) {
|
20952
|
+
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
|
20953
|
+
fclose(file);
|
20954
|
+
return NULL;
|
20955
|
+
}
|
20953
20956
|
}
|
20954
20957
|
}
|
20955
20958
|
|
@@ -20959,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
20959
20962
|
|
20960
20963
|
// read the header
|
20961
20964
|
{
|
20962
|
-
ctx->header.magic = magic;
|
20965
|
+
strncpy(ctx->header.magic, magic, 4);
|
20966
|
+
|
20963
20967
|
|
20964
20968
|
ctx->kv = NULL;
|
20965
20969
|
ctx->infos = NULL;
|
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
|
|
46
46
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
47
47
|
#include <intrin.h>
|
48
48
|
#else
|
49
|
-
#if !defined(__riscv)
|
49
|
+
#if !defined(__riscv) && !defined(__s390__)
|
50
50
|
#include <immintrin.h>
|
51
51
|
#endif
|
52
52
|
#endif
|
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
|
462
462
|
}
|
463
463
|
|
464
464
|
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
465
|
-
|
466
|
-
|
467
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
468
|
-
(void)hist;
|
465
|
+
(void)hist; // TODO: collect histograms
|
469
466
|
|
470
|
-
for (int j = 0; j < nb; j += k) {
|
467
|
+
for (int j = 0; j < n; j += k) {
|
471
468
|
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
472
469
|
quantize_row_q2_K_reference(src + j, y, k);
|
473
470
|
}
|
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
|
678
675
|
}
|
679
676
|
|
680
677
|
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
681
|
-
|
682
|
-
|
683
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
684
|
-
(void)hist;
|
678
|
+
(void)hist; // TODO: collect histograms
|
685
679
|
|
686
|
-
for (int j = 0; j < nb; j += k) {
|
680
|
+
for (int j = 0; j < n; j += k) {
|
687
681
|
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
688
682
|
quantize_row_q3_K_reference(src + j, y, k);
|
689
683
|
}
|
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
|
846
840
|
|
847
841
|
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
848
842
|
assert(k % QK_K == 0);
|
849
|
-
const int nb = k / QK_K;
|
850
843
|
(void)hist; // TODO: collect histograms
|
851
|
-
|
844
|
+
|
845
|
+
for (int j = 0; j < n; j += k) {
|
852
846
|
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
853
847
|
quantize_row_q4_K_reference(src + j, y, k);
|
854
848
|
}
|
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
1052
1046
|
|
1053
1047
|
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
1054
1048
|
assert(k % QK_K == 0);
|
1055
|
-
|
1056
|
-
|
1057
|
-
for (int j = 0; j < nb; j += k) {
|
1049
|
+
(void)hist; // TODO: collect histograms
|
1050
|
+
|
1051
|
+
for (int j = 0; j < n; j += k) {
|
1058
1052
|
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
1059
1053
|
quantize_row_q5_K_reference(src + j, y, k);
|
1060
1054
|
}
|
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
1200
1194
|
|
1201
1195
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
1202
1196
|
assert(k % QK_K == 0);
|
1203
|
-
|
1204
|
-
|
1205
|
-
(void)hist; // TODO
|
1197
|
+
(void)hist; // TODO: collect histograms
|
1206
1198
|
|
1207
|
-
for (int j = 0; j < nb; j += k) {
|
1199
|
+
for (int j = 0; j < n; j += k) {
|
1208
1200
|
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
1209
1201
|
quantize_row_q6_K_reference(src + j, y, k);
|
1210
1202
|
}
|