llama_cpp 0.14.4 → 0.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +29 -9
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +142 -49
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +156 -156
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +942 -267
- data/vendor/tmp/llama.cpp/ggml.c +161 -95
- data/vendor/tmp/llama.cpp/ggml.h +12 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1577 -274
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif
 
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -338,14 +343,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return GGML_FP32_TO_FP16(x);
 }
 
-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-    for (int i = 0; i < n; i++) {
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
     }
 }
 
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-    int i = 0;
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
 #if defined(__F16C__)
     for (; i + 7 < n; i += 8) {
         __m256 x_vec = _mm256_loadu_ps(x + i);
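These row converters are public API; the only semantic change is widening the element count from `int` to `int64_t` so conversions over more than 2^31 elements no longer truncate. A minimal usage sketch (the helper name here is illustrative, not from the gem):

```c
#include "ggml.h"

// sketch: round-trip one row through fp16; the count parameter is now int64_t
static void fp16_roundtrip_row(const float * src, ggml_fp16_t * tmp, float * dst, int64_t n) {
    ggml_fp32_to_fp16_row(src, tmp, n); // float -> fp16 (F16C-vectorized where available)
    ggml_fp16_to_fp32_row(tmp, dst, n); // fp16 -> float
}
```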
@@ -4573,21 +4578,32 @@ void ggml_mul_mat_set_prec(
 
 // ggml_mul_mat_id
 
-// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
-
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor  * as,
-        struct ggml_tensor  * ids,
-        int                   id,
-        struct ggml_tensor  * b) {
-
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[1]);
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
 
     bool is_node = false;
 
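The new comment block documents the changed calling convention: ids now selects, per token, which expert matrices of as to apply, and all selected experts are evaluated in a single op instead of one call per expert id. A sketch of how a caller might build the op for a MoE layer under the new signature (the helper name and shapes are illustrative, not from this diff):

```c
#include "ggml.h"

// sketch: route each token through its selected experts with one op
// as:  [n_embd, n_ff, n_expert]       -- expert weights, one 2d matrix per expert
// cur: [n_embd, n_tokens]             -- input activations
// ids: [n_expert_used, n_tokens], I32 -- expert indices chosen by the router
static struct ggml_tensor * moe_mul_mat(struct ggml_context * ctx,
                                        struct ggml_tensor  * as,
                                        struct ggml_tensor  * cur,
                                        struct ggml_tensor  * ids) {
    // view cur as [n_embd, 1, n_tokens]; the single b row broadcasts across
    // ids->ne[0] experts, which the new assert permits (ids->ne[0] % 1 == 0)
    struct ggml_tensor * b = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
    return ggml_mul_mat_id(ctx, as, b, ids); // -> [n_ff, n_expert_used, n_tokens]
}
```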
@@ -4595,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id(
         is_node = true;
     }
 
-    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    ggml_set_op_params_i32(result, 0, id);
-
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = as;
@@ -10810,6 +10824,28 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif
 
+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type)) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -10841,6 +10877,30 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + ggml_row_size(vec_dot_type,
+                                         nb12/ggml_type_size(src1->type)*i12 +
+                                         nb13/ggml_type_size(src1->type)*i13),
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
     const int64_t nr0 = ne01;          // src0 rows
     const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
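Both insertions follow the same dispatch pattern: offer every 2-D slice of the batch to llamafile_sgemm, and if it declines any slice (it returns false for type/stride combinations it cannot handle), jump over the early return and fall through to the pre-existing generic path. A stripped-down, self-contained sketch of that control flow (the names are stand-ins, not the real kernel):

```c
#include <stdbool.h>
#include <stdint.h>

// stand-in for llamafile_sgemm(): returns false when it cannot handle a slice
static bool fast_gemm_slice(int64_t i12, int64_t i13) {
    (void) i12; (void) i13;
    return false; // pretend the fast path declined
}

static void mul_mat_dispatch(int64_t ne12, int64_t ne13) {
    for (int64_t i13 = 0; i13 < ne13; i13++)
        for (int64_t i12 = 0; i12 < ne12; i12++)
            if (!fast_gemm_slice(i12, i13))
                goto UseGgmlGemm; // one refusal sends the whole op to the generic path
    return;                       // every slice was handled by the fast kernel
UseGgmlGemm:;
    // ... the generic ggml matmul would continue from here ...
}
```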
@@ -10958,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id(
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
     // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = src0->ne[2];
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
             (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };
 
-    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
@@ -11012,16 +11066,20 @@ static void ggml_compute_forward_mul_mat_id(
         }
 
         // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
         // group rows by src0 matrix
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
 
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-            matrix_row_counts[row_id] += 1;
+                assert(i02 >= 0 && i02 < n_as);
+
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
         }
 
         return;
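This INIT pass is effectively a counting sort: each (expert slot, token) pair read from ids is appended to the bucket of the expert it names, so the compute pass can then walk one expert's rows contiguously. The same idea in isolation, as a sketch with illustrative names:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

struct row_mapping { int32_t i1; int32_t i2; }; // mirrors mmid_row_mapping: (slot, token)

// bucket every (slot, token) pair by the expert it selects;
// ids is row-major [n_tokens][n_ids]; rows is [n_as][n_tokens] (stride n_tokens),
// which assumes each token picks a given expert at most once (distinct top-k routing)
static void group_rows_by_expert(const int32_t * ids, int32_t n_ids, int32_t n_tokens,
                                 int32_t n_as, int64_t * counts, struct row_mapping * rows) {
    memset(counts, 0, n_as * sizeof(int64_t));
    for (int32_t t = 0; t < n_tokens; ++t) {
        for (int32_t slot = 0; slot < n_ids; ++slot) {
            const int32_t expert = ids[t*n_ids + slot];
            assert(expert >= 0 && expert < n_as);
            rows[expert*n_tokens + counts[expert]] = (struct row_mapping){slot, t};
            counts[expert] += 1;
        }
    }
}
```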
@@ -11039,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }
 
-        size_t src0_offset = cur_a*src0->nb[2];
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-        const int64_t nr0 = ne01;           // src0 rows
-        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows
 
         // distribute the thread work across the inner or outer loop based on which one is larger
 
@@ -11066,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id(
         const int64_t ir110 = dr1*ith1;
         const int64_t ir111 = MIN(ir110 + dr1, nr1);
 
-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
         // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}
 
         // block-tiling attempt
         const int64_t blck_0 = 16;
@@ -11084,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id(
         for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
             for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t  i13 = (ir1/(ne12*cne1));
-                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert
 
-                    // broadcast src0 into src1
-                    //const int64_t i03 = i13/r3;
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index
 
-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
+                    const int64_t  i11 = id % ne11;
+                    const int64_t  i12 = row_mapping.i2; // row index in src1
 
-                    const char * src0_row = (const char *) src0->data + src0_offset;
+                    const int64_t  i1 = id;  // selected expert index
+                    const int64_t  i2 = i12; // row
 
                     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11155,26 @@ static void ggml_compute_forward_mul_mat_id(
                     // TODO: this is a bit of a hack, we should probably have a better way to handle this
                     const char * src1_col = (const char *) wdata +
                         (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12 + i13*nb13));
+                        ? (i11      + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));
 
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
 
                     //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                     //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                     //}
 
                     for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                     }
+
                     memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                 }
             }
         }
     }
 
-    #undef MMID_MATRIX_ROW
+#undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
@@ -18462,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                         const int n_as = src0->ne[2];
                         cur += GGML_PAD(cur, sizeof(int64_t));       // align
                         cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                        cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+                        cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                     } break;
                 case GGML_OP_OUT_PROD:
                     {
@@ -20331,11 +20382,11 @@ size_t ggml_quantize_chunk(
         enum ggml_type   type,
         const float    * src,
         void           * dst,
-        int              start,
-        int              nrows,
-        int              n_per_row,
+        int64_t          start,
+        int64_t          nrows,
+        int64_t          n_per_row,
         const float    * imatrix) {
-    const int n = nrows * n_per_row;
+    const int64_t n = (int64_t) nrows * n_per_row;
 
     if (ggml_quantize_requires_imatrix(type)) {
         GGML_ASSERT(imatrix != NULL);
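Widening these counts matters in practice: with plain `int`, `nrows * n_per_row` is a 32-bit multiply that overflows once a tensor exceeds 2^31 elements, which the embedding and output matrices of large-vocabulary models can reach. A toy illustration of the failure mode (the numbers are chosen for illustration only):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int nrows     = 524288; // 2^19 rows
    const int n_per_row = 8192;   // 2^13 elements per row -> 2^32 elements total
    const int64_t n_bad  = nrows * n_per_row;           // 32-bit multiply overflows first (UB in C)
    const int64_t n_good = (int64_t) nrows * n_per_row; // the widened form used in this release
    printf("bad: %lld, good: %lld\n", (long long) n_bad, (long long) n_good);
    return 0;
}
```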
@@ -20550,6 +20601,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
     return ok;
 }
 
+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -20862,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
             ok = ok && cur != NULL;
 
-            ggml_set_name(cur, ctx->infos[i].name.data);
-
             if (!ok) {
                 break;
             }
 
+            ggml_set_name(cur, ctx->infos[i].name.data);
+
             // point the data member to the appropriate location in the binary blob using the tensor infos
             if (!params.no_alloc) {
                 //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20976,7 @@ void gguf_free(struct gguf_context * ctx) {
     if (ctx->kv) {
         // free string memory - not great..
         for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
         }
 
         GGML_FREE(ctx->kv);
@@ -21148,6 +21201,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     return n_kv;
 }
 
+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
     const int idx = gguf_get_or_add_key(ctx, key);
 
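gguf_remove_key rounds out the existing find/set API: it frees the kv pair in place (via the new gguf_free_kv above), shifts the remaining entries down, and shrinks the array. A minimal usage sketch (the key names are examples, not mandated by this diff):

```c
#include "ggml.h"

// sketch: drop a metadata key if present, then note the edit
static void strip_metadata_key(struct gguf_context * ctx) {
    if (gguf_find_key(ctx, "general.url") >= 0) {
        gguf_remove_key(ctx, "general.url"); // no-op for missing keys, so the check is optional
    }
    gguf_set_val_str(ctx, "general.comment", "url stripped"); // example follow-up edit
}
```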
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -332,8 +332,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
 
     struct ggml_object;
     struct ggml_context;
@@ -1161,13 +1161,11 @@ extern "C" {
             enum ggml_prec       prec);
 
     // indirect matrix multiplication
-    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor  * as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);
 
     // A: m columns, n rows,
     // B: p columns, n rows,
@@ -2210,9 +2208,9 @@ extern "C" {
             enum ggml_type   type,
             const float    * src,
             void           * dst,
-            int              start,
-            int              nrows,
-            int              n_per_row,
+            int64_t          start,
+            int64_t          nrows,
+            int64_t          n_per_row,
             const float    * imatrix);
 
     //
@@ -2289,6 +2287,9 @@ extern "C" {
     GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
     GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
 
+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
     GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
@@ -2377,8 +2378,8 @@ extern "C" {
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);
 