llama_cpp 0.14.5 → 0.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +24 -7
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +263 -5
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -294
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +151 -99
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +1308 -254
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +999 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif
 
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -853,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -4573,21 +4566,32 @@ void ggml_mul_mat_set_prec(
 
 // ggml_mul_mat_id
 
-// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
-//       this will allow computing all the used experts in a single matrix multiplication
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor  * as,
-        struct ggml_tensor  * ids,
-        int                   id,
-        struct ggml_tensor  * b) {
-
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
 
     bool is_node = false;
 
@@ -4595,11 +4599,9 @@ struct ggml_tensor * ggml_mul_mat_id(
         is_node = true;
     }
 
-    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    ggml_set_op_params_i32(result, 0, id);
-
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = as;
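The new contract is spelled out in the comment block above: `as` packs every expert into a single 3D tensor and `ids` picks `n_expert_used` experts per token, replacing the old scalar `id` parameter (and the `ggml_set_op_params_i32` call that carried it). A minimal usage sketch under assumed shapes — the `moe_ffn` helper and its tensor names are hypothetical, not part of the library:

```c
#include "ggml.h"

// Hypothetical helper: apply a mixture-of-experts weight tensor to the
// current activations. experts is [cols, rows, n_expert], cur is
// [cols, n_tokens], ids is [n_expert_used, n_tokens] with GGML_TYPE_I32.
static struct ggml_tensor * moe_ffn(
        struct ggml_context * ctx,
        struct ggml_tensor  * experts,
        struct ggml_tensor  * cur,
        struct ggml_tensor  * ids) {
    // b must be 3d; the single row per token is broadcast across the
    // n_expert_used experts selected by ids (ids->ne[0] % b->ne[1] == 0)
    struct ggml_tensor * b = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);

    // result is [rows, n_expert_used, n_tokens]
    return ggml_mul_mat_id(ctx, experts, b, ids);
}
```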
@@ -10810,6 +10812,28 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif
 
+#if GGML_USE_LLAMAFILE
+    if (src1_cont) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -10841,6 +10865,28 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
+#if GGML_USE_LLAMAFILE
+    if (src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
     const int64_t nr0 = ne01;          // src0 rows
     const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
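Both insertions use the same control flow: `llamafile_sgemm` (declared in the newly vendored sgemm.h) returns false for any shape, type, or task phase it cannot handle, and the `goto` jumps past the `return` so the whole op falls back to the generic path. A standalone sketch of that pattern, with a hypothetical `fast_gemm` standing in for `llamafile_sgemm`:

```c
#include <stdbool.h>
#include <stdio.h>

// Hypothetical stand-in for llamafile_sgemm: returns false when it
// cannot handle a slice (here, every third one, just for illustration).
static bool fast_gemm(int slice) {
    return slice % 3 != 0;
}

static void compute(int n_slices) {
    for (int i = 0; i < n_slices; i++)
        if (!fast_gemm(i))
            goto UseGenericGemm; // one unsupported slice disables the fast path
    return;                      // every slice was handled by the fast kernel

UseGenericGemm:;
    printf("falling back to the generic path\n");
}

int main(void) {
    compute(4);
    return 0;
}
```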
@@ -10958,11 +11004,6 @@ static void ggml_compute_forward_mul_mat_id(
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11014,21 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
     // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = src0->ne[2];
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
             (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };
 
-#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
@@ -11012,16 +11052,20 @@ static void ggml_compute_forward_mul_mat_id(
     }
 
     // initialize matrix_row_counts
-    GGML_ASSERT(wdata == wdata_src1_end);
     memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
     // group rows by src0 matrix
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+        for (int id = 0; id < n_ids; ++id) {
+            const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+            assert(i02 >= 0 && i02 < n_as);
 
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
-        MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-        matrix_row_counts[row_id] += 1;
+            MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+            matrix_row_counts[i02] += 1;
+        }
     }
 
     return;
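The INIT phase above buckets every (expert slot, token) pair from `ids` by its selected expert, so the compute phase can walk one expert's rows contiguously instead of re-scanning `ids`. A self-contained sketch of that grouping with assumed sizes (the real code places the counts and mappings in the cplan work buffer):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct mmid_row_mapping { int32_t i1; int32_t i2; }; // expert slot, token

int main(void) {
    enum { n_as = 4, n_ids = 2, n_tokens = 3 }; // n_expert, n_expert_used, tokens
    const int32_t ids[n_tokens][n_ids] = { {0, 2}, {2, 1}, {0, 2} };

    int64_t counts[n_as] = {0};
    struct mmid_row_mapping rows[n_as][n_tokens]; // assumes each expert is used at most once per token

    for (int32_t t = 0; t < n_tokens; ++t) {
        for (int32_t e = 0; e < n_ids; ++e) {
            const int32_t i02 = ids[t][e]; // expert chosen for slot e of token t
            assert(i02 >= 0 && i02 < n_as);
            rows[i02][counts[i02]++] = (struct mmid_row_mapping){e, t};
        }
    }

    for (int a = 0; a < n_as; ++a) {
        printf("expert %d handles %d row(s)\n", a, (int) counts[a]);
    }
    return 0;
}
```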
@@ -11039,15 +11083,13 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }
 
-        size_t src0_offset = cur_a*src0->nb[2];
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-        const int64_t nr0 = ne01;
-        const int64_t nr1 = cne1*ne12*ne13;
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows
 
         // distribute the thread work across the inner or outer loop based on which one is larger
 
|
@@ -11066,13 +11108,11 @@ static void ggml_compute_forward_mul_mat_id(
|
|
11066
11108
|
const int64_t ir110 = dr1*ith1;
|
11067
11109
|
const int64_t ir111 = MIN(ir110 + dr1, nr1);
|
11068
11110
|
|
11069
|
-
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
|
11070
|
-
|
11071
11111
|
// threads with no work simply yield (not sure if it helps)
|
11072
|
-
if (ir010 >= ir011 || ir110 >= ir111) {
|
11073
|
-
|
11074
|
-
|
11075
|
-
}
|
11112
|
+
//if (ir010 >= ir011 || ir110 >= ir111) {
|
11113
|
+
// sched_yield();
|
11114
|
+
// continue;
|
11115
|
+
//}
|
11076
11116
|
|
11077
11117
|
// block-tiling attempt
|
11078
11118
|
const int64_t blck_0 = 16;
|
@@ -11084,20 +11124,16 @@ static void ggml_compute_forward_mul_mat_id(
         for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
             for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t  i13 = (ir1/(ne12*cne1));
-                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert
 
-                    // broadcast src0 into src1
-                    //const int64_t i03 = i13/r3;
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index
 
-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
+                    const int64_t  i11 = id % ne11;
+                    const int64_t  i12 = row_mapping.i2; // row index in src1
 
-                    const char * src0_row = (const char *) src0->data + src0_offset;
+                    const int64_t  i1 = id;  // selected expert index
+                    const int64_t  i2 = i12; // row
 
                     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11141,26 @@ static void ggml_compute_forward_mul_mat_id(
                     // TODO: this is a bit of a hack, we should probably have a better way to handle this
                     const char * src1_col = (const char *) wdata +
                         (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12 + i13*nb13));
+                        ? (i11      + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));
 
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
 
                     //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                     //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                     //}
 
                     for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                     }
+
                     memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                 }
             }
         }
     }
 
-
+#undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
@@ -18462,7 +18499,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     const int n_as = src0->ne[2];
                     cur += GGML_PAD(cur, sizeof(int64_t));       // align
                     cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
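The sizing fix matches the new layout: with the two-level `ids` indexing, `matrix_rows` needs one 8-byte `{i1, i2}` mapping per (expert, token), and the token count now lives in `src1->ne[2]` rather than `src1->ne[1]`. A worked example with assumed sizes:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // assumed sizes: 8 experts, 32 tokens in the batch
    const size_t n_as     = 8;  // n_expert (src0->ne[2])
    const size_t n_tokens = 32; // src1->ne[2]

    size_t cur = 0;
    cur += n_as * sizeof(int64_t);            // matrix_row_counts: 8 * 8      =   64 bytes
    cur += n_as * n_tokens * sizeof(int64_t); // matrix_rows:       8 * 32 * 8 = 2048 bytes

    printf("MUL_MAT_ID work buffer: %zu bytes\n", cur); // 2112
    return 0;
}
```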
@@ -20550,6 +20587,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
     return ok;
 }
 
+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -20862,12 +20925,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
 
         ok = ok && cur != NULL;
 
-        ggml_set_name(cur, ctx->infos[i].name.data);
-
         if (!ok) {
             break;
         }
 
+        ggml_set_name(cur, ctx->infos[i].name.data);
+
         // point the data member to the appropriate location in the binary blob using the tensor infos
         if (!params.no_alloc) {
             //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20962,7 @@ void gguf_free(struct gguf_context * ctx) {
     if (ctx->kv) {
         // free string memory - not great..
         for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
         }
 
         GGML_FREE(ctx->kv);
@@ -21148,6 +21187,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     return n_kv;
 }
 
+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
     const int idx = gguf_get_or_add_key(ctx, key);
 
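A small usage sketch of the new entry point (the key names are arbitrary; `gguf_remove_key` is simply a no-op when the key is absent):

```c
#include "ggml.h"

int main(void) {
    struct gguf_context * ctx = gguf_init_empty();

    gguf_set_val_u32(ctx, "general.quantization_version", 2);
    gguf_remove_key(ctx, "general.quantization_version"); // found: freed and compacted
    gguf_remove_key(ctx, "general.name");                 // absent: nothing happens

    gguf_free(ctx);
    return 0;
}
```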
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -1161,13 +1161,11 @@ extern "C" {
|
|
1161
1161
|
enum ggml_prec prec);
|
1162
1162
|
|
1163
1163
|
// indirect matrix multiplication
|
1164
|
-
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1165
1164
|
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1166
1165
|
struct ggml_context * ctx,
|
1167
1166
|
struct ggml_tensor * as,
|
1168
|
-
struct ggml_tensor *
|
1169
|
-
|
1170
|
-
struct ggml_tensor * b);
|
1167
|
+
struct ggml_tensor * b,
|
1168
|
+
struct ggml_tensor * ids);
|
1171
1169
|
|
1172
1170
|
// A: m columns, n rows,
|
1173
1171
|
// B: p columns, n rows,
|
@@ -2289,6 +2287,9 @@ extern "C" {
|
|
2289
2287
|
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
2290
2288
|
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
|
2291
2289
|
|
2290
|
+
// removes key if it exists
|
2291
|
+
GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
|
2292
|
+
|
2292
2293
|
// overrides existing values or adds a new one
|
2293
2294
|
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
2294
2295
|
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|