llama_cpp 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-metal.m +44 -3
- data/ext/llama_cpp/src/ggml-metal.metal +162 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +30 -56
- data/ext/llama_cpp/src/ggml.c +13 -9
- data/ext/llama_cpp/src/ggml.h +3 -2
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +359 -58
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -13537,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
|
|
13537
13537
|
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
13538
13538
|
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
13539
13539
|
}
|
13540
|
-
} if (!is_neox) {
|
13540
|
+
} else if (!is_neox) {
|
13541
13541
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
13542
13542
|
const float cos_theta = cosf(theta);
|
13543
13543
|
const float sin_theta = sinf(theta);
|
@@ -19170,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
19170
19170
|
|
19171
19171
|
if (idx == -1) {
|
19172
19172
|
fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
|
19173
|
+
fclose(fout);
|
19173
19174
|
return;
|
19174
19175
|
}
|
19175
19176
|
|
@@ -20844,7 +20845,7 @@ struct gguf_kv {
|
|
20844
20845
|
};
|
20845
20846
|
|
20846
20847
|
struct gguf_header {
|
20847
|
-
|
20848
|
+
char magic[4];
|
20848
20849
|
uint32_t version;
|
20849
20850
|
uint64_t n_tensors; // GGUFv2
|
20850
20851
|
uint64_t n_kv; // GGUFv2
|
@@ -20914,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
|
|
20914
20915
|
struct gguf_context * gguf_init_empty(void) {
|
20915
20916
|
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
20916
20917
|
|
20917
|
-
ctx->header.magic = GGUF_MAGIC;
|
20918
|
+
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
|
20918
20919
|
ctx->header.version = GGUF_VERSION;
|
20919
20920
|
ctx->header.n_tensors = 0;
|
20920
20921
|
ctx->header.n_kv = 0;
|
@@ -20940,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
20940
20941
|
// offset from start of file
|
20941
20942
|
size_t offset = 0;
|
20942
20943
|
|
20943
|
-
|
20944
|
+
char magic[4];
|
20944
20945
|
|
20945
20946
|
// check the magic before making allocations
|
20946
20947
|
{
|
20947
20948
|
gguf_fread_el(file, &magic, sizeof(magic), &offset);
|
20948
20949
|
|
20949
|
-
|
20950
|
-
|
20951
|
-
|
20952
|
-
|
20950
|
+
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
20951
|
+
if (magic[i] != GGUF_MAGIC[i]) {
|
20952
|
+
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
|
20953
|
+
fclose(file);
|
20954
|
+
return NULL;
|
20955
|
+
}
|
20953
20956
|
}
|
20954
20957
|
}
|
20955
20958
|
|
@@ -20959,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
20959
20962
|
|
20960
20963
|
// read the header
|
20961
20964
|
{
|
20962
|
-
ctx->header.magic = magic;
|
20965
|
+
strncpy(ctx->header.magic, magic, 4);
|
20966
|
+
|
20963
20967
|
|
20964
20968
|
ctx->kv = NULL;
|
20965
20969
|
ctx->infos = NULL;
|
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
|
|
46
46
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
47
47
|
#include <intrin.h>
|
48
48
|
#else
|
49
|
-
#if !defined(__riscv)
|
49
|
+
#if !defined(__riscv) && !defined(__s390__)
|
50
50
|
#include <immintrin.h>
|
51
51
|
#endif
|
52
52
|
#endif
|
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
|
462
462
|
}
|
463
463
|
|
464
464
|
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
465
|
-
|
466
|
-
|
467
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
468
|
-
(void)hist;
|
465
|
+
(void)hist; // TODO: collect histograms
|
469
466
|
|
470
|
-
for (int j = 0; j < nb; j += k) {
|
467
|
+
for (int j = 0; j < n; j += k) {
|
471
468
|
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
472
469
|
quantize_row_q2_K_reference(src + j, y, k);
|
473
470
|
}
|
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
|
678
675
|
}
|
679
676
|
|
680
677
|
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
681
|
-
|
682
|
-
|
683
|
-
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
684
|
-
(void)hist;
|
678
|
+
(void)hist; // TODO: collect histograms
|
685
679
|
|
686
|
-
for (int j = 0; j < nb; j += k) {
|
680
|
+
for (int j = 0; j < n; j += k) {
|
687
681
|
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
688
682
|
quantize_row_q3_K_reference(src + j, y, k);
|
689
683
|
}
|
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
|
846
840
|
|
847
841
|
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
848
842
|
assert(k % QK_K == 0);
|
849
|
-
const int nb = k / QK_K;
|
850
843
|
(void)hist; // TODO: collect histograms
|
851
|
-
|
844
|
+
|
845
|
+
for (int j = 0; j < n; j += k) {
|
852
846
|
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
853
847
|
quantize_row_q4_K_reference(src + j, y, k);
|
854
848
|
}
|
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
1052
1046
|
|
1053
1047
|
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
1054
1048
|
assert(k % QK_K == 0);
|
1055
|
-
|
1056
|
-
|
1057
|
-
for (int j = 0; j < nb; j += k) {
|
1049
|
+
(void)hist; // TODO: collect histograms
|
1050
|
+
|
1051
|
+
for (int j = 0; j < n; j += k) {
|
1058
1052
|
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
1059
1053
|
quantize_row_q5_K_reference(src + j, y, k);
|
1060
1054
|
}
|
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
1200
1194
|
|
1201
1195
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
1202
1196
|
assert(k % QK_K == 0);
|
1203
|
-
|
1204
|
-
|
1205
|
-
(void)hist; // TODO
|
1197
|
+
(void)hist; // TODO: collect histograms
|
1206
1198
|
|
1207
|
-
for (int j = 0; j < nb; j += k) {
|
1199
|
+
for (int j = 0; j < n; j += k) {
|
1208
1200
|
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
1209
1201
|
quantize_row_q6_K_reference(src + j, y, k);
|
1210
1202
|
}
|