llama_cpp 0.14.5 → 0.14.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +24 -7
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +263 -5
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -294
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +151 -99
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +1308 -254
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +999 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif
 
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -853,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -4573,21 +4566,32 @@ void ggml_mul_mat_set_prec(
 
 // ggml_mul_mat_id
 
-
-
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor * as,
-        struct ggml_tensor * ids,
-        int                   id,
-        struct ggml_tensor * b) {
-
+        struct ggml_tensor * b,
+        struct ggml_tensor * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[1]);
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
 
     bool is_node = false;
 
@@ -4595,11 +4599,9 @@ struct ggml_tensor * ggml_mul_mat_id(
         is_node = true;
     }
 
-    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    ggml_set_op_params_i32(result, 0, id);
-
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = as;
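Taken together, the two hunks above replace the per-expert call ggml_mul_mat_id(ctx, as, ids, id, b) with a single node covering every selected expert. A minimal sketch of the new call shape, assuming the tensors are built elsewhere (the moe_matmul helper name is illustrative, not from the diff):

// Sketch: shapes follow the comment block added in the hunk above.
//   as  = [cols, rows, n_expert]           expert weight stack
//   b   = [cols, n_expert_used, n_tokens]  per-slot inputs
//   ids = [n_expert_used, n_tokens]        router output, GGML_TYPE_I32
static struct ggml_tensor * moe_matmul(
        struct ggml_context * ctx,
        struct ggml_tensor * as,
        struct ggml_tensor * b,
        struct ggml_tensor * ids) {
    // 0.14.5 callers invoked ggml_mul_mat_id once per expert id;
    // 0.14.7 applies all selected experts in one call:
    return ggml_mul_mat_id(ctx, as, b, ids); // c = [rows, n_expert_used, n_tokens]
}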
@@ -10810,6 +10812,28 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif
 
+#if GGML_USE_LLAMAFILE
+    if (src1_cont) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -10841,6 +10865,28 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
+#if GGML_USE_LLAMAFILE
+    if (src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
     const int64_t nr0 = ne01; // src0 rows
     const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
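Both llamafile hunks follow the same dispatch pattern: attempt llamafile_sgemm on every 2-D slice, and if any slice reports it cannot handle the case, fall through to ggml's generic GEMM via goto. A self-contained sketch of that control flow, with an invented stub standing in for llamafile_sgemm (its divisibility constraint is made up for the demo):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Stand-in for llamafile_sgemm: returns false when the case is unsupported.
static bool fast_gemm_stub(int64_t m, int64_t n, int64_t k) {
    return (m % 8 == 0) && (n % 8 == 0) && (k % 8 == 0); // hypothetical constraint
}

static void matmul_dispatch(int64_t m, int64_t n, int64_t k) {
    if (!fast_gemm_stub(m, n, k))
        goto UseGenericGemm; // fast path refused: fall back exactly once
    printf("fast path: %lld x %lld x %lld\n", (long long)m, (long long)n, (long long)k);
    return;
UseGenericGemm:;
    printf("generic path: %lld x %lld x %lld\n", (long long)m, (long long)n, (long long)k);
}

int main(void) {
    matmul_dispatch(64, 64, 64); // handled by the fast path
    matmul_dispatch(3, 5, 7);    // falls back to the generic path
    return 0;
}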
@@ -10958,11 +11004,6 @@ static void ggml_compute_forward_mul_mat_id(
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11014,21 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
     // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ne02;
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
             (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows = matrix_row_counts + n_as;           // [n_as][ne11]
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };
 
-    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
@@ -11012,16 +11052,20 @@ static void ggml_compute_forward_mul_mat_id(
         }
 
         // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
         // group rows by src0 matrix
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                assert(i02 >= 0 && i02 < n_as);
 
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-            matrix_row_counts[row_id] += 1;
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
         }
 
         return;
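The INIT pass above buckets every (expert slot, token) pair by the expert it routes to, so each expert's rows can later be processed as one batch. A standalone sketch of the same counts-plus-mappings scheme; sizes and routing ids are invented for the demo:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct mmid_row_mapping { int32_t i1; int32_t i2; }; // (expert slot, token)

int main(void) {
    enum { N_AS = 4, N_IDS = 2, N_TOKENS = 3 }; // n_expert, n_expert_used, tokens
    // ids[token][slot]: expert chosen for each slot of each token (demo data)
    const int32_t ids[N_TOKENS][N_IDS] = {{0, 2}, {2, 3}, {0, 2}};

    int64_t counts[N_AS] = {0};                           // matrix_row_counts
    struct mmid_row_mapping rows[N_AS][N_TOKENS * N_IDS]; // matrix_rows

    for (int32_t t = 0; t < N_TOKENS; ++t) {
        for (int32_t s = 0; s < N_IDS; ++s) {
            const int32_t e = ids[t][s];
            assert(e >= 0 && e < N_AS);
            rows[e][counts[e]++] = (struct mmid_row_mapping) {s, t};
        }
    }

    for (int e = 0; e < N_AS; ++e) {
        printf("expert %d: %lld row(s)\n", e, (long long) counts[e]);
    }
    return 0; // prints 2, 0, 3, 1 rows for experts 0..3
}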
@@ -11039,15 +11083,13 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }
 
-        size_t src0_offset = cur_a*src0->nb[2];
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
         const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-        const int64_t nr0 = ne01;
-        const int64_t nr1 = cne1*ne12*ne13;
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows
 
         // distribute the thread work across the inner or outer loop based on which one is larger
@@ -11066,13 +11108,11 @@ static void ggml_compute_forward_mul_mat_id(
         const int64_t ir110 = dr1*ith1;
         const int64_t ir111 = MIN(ir110 + dr1, nr1);
 
-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
         // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}
 
         // block-tiling attempt
         const int64_t blck_0 = 16;
@@ -11084,20 +11124,16 @@ static void ggml_compute_forward_mul_mat_id(
         for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
             for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t  i13 = (ir1/(ne12*cne1));
-                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert
 
-                    // broadcast src0 into src1
-                    //const int64_t i03 = i13/r3;
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index
 
-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
+                    const int64_t i11 = id % ne11;
+                    const int64_t i12 = row_mapping.i2; // row index in src1
 
-                    const char * src0_row = (const char *) src0->data + src0_offset;
+                    const int64_t i1 = id;  // selected expert index
+                    const int64_t i2 = i12; // row
 
                     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
|
11105
11141
|
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
11106
11142
|
const char * src1_col = (const char *) wdata +
|
11107
11143
|
(src1_cont || src1->type != vec_dot_type
|
11108
|
-
? (i11 + i12*ne11
|
11109
|
-
: (i11*nb11 + i12*nb12
|
11144
|
+
? (i11 + i12*ne11)*row_size
|
11145
|
+
: (i11*nb11 + i12*nb12));
|
11110
11146
|
|
11111
|
-
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2
|
11147
|
+
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
|
11112
11148
|
|
11113
11149
|
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
11114
11150
|
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
11115
11151
|
//}
|
11116
11152
|
|
11117
11153
|
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
11118
|
-
vec_dot(ne00, &tmp[ir0 - iir0], 0,
|
11154
|
+
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
|
11119
11155
|
}
|
11156
|
+
|
11120
11157
|
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
11121
11158
|
}
|
11122
11159
|
}
|
11123
11160
|
}
|
11124
11161
|
}
|
11125
11162
|
|
11126
|
-
|
11163
|
+
#undef MMID_MATRIX_ROW
|
11127
11164
|
}
|
11128
11165
|
|
11129
11166
|
// ggml_compute_forward_out_prod
|
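The src1_col expression above selects between two addressing modes: packed rows in scratch memory (stride row_size) when src1 was converted to vec_dot_type, and raw byte strides (nb11/nb12) when reading the tensor in place. A tiny sketch of the two offset computations; every number is illustrative:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t i11 = 3, i12 = 5;        // row within token, token index
    const int64_t ne11 = 8;                // rows per token in the packed layout
    const size_t  row_size = 144;          // bytes per converted row (example)
    const size_t  nb11 = 160, nb12 = 1280; // byte strides of a padded tensor

    const size_t packed  = (i11 + i12*ne11)*row_size; // contiguous scratch buffer
    const size_t strided = i11*nb11 + i12*nb12;       // original tensor memory

    printf("packed offset:  %zu bytes\n", packed);  // 6192
    printf("strided offset: %zu bytes\n", strided); // 6880
    return 0;
}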
@@ -18462,7 +18499,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     const int n_as = src0->ne[2];
                     cur += GGML_PAD(cur, sizeof(int64_t));       // align
                     cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
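For the workspace arithmetic above, a worked byte count under invented sizes (plain modulo alignment stands in for GGML_PAD, and the new src1->ne[2] factor is the token count):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t n_as = 8, n_tokens = 32; // n_expert, src1->ne[2] (illustrative)

    size_t cur = 1000;                                                  // bytes already planned
    cur += (sizeof(int64_t) - cur % sizeof(int64_t)) % sizeof(int64_t); // align to 8 -> +0 here
    cur += n_as * sizeof(int64_t);                                      // matrix_row_counts: +64
    cur += n_as * n_tokens * sizeof(int64_t);                           // matrix_rows: +2048

    printf("planned workspace: %zu bytes\n", cur); // 3112
    return 0;
}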
@@ -20550,6 +20587,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
     return ok;
 }
 
+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -20862,12 +20925,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
 
         ok = ok && cur != NULL;
 
-        ggml_set_name(cur, ctx->infos[i].name.data);
-
         if (!ok) {
             break;
         }
 
+        ggml_set_name(cur, ctx->infos[i].name.data);
+
         // point the data member to the appropriate location in the binary blob using the tensor infos
         if (!params.no_alloc) {
             //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20962,7 @@ void gguf_free(struct gguf_context * ctx) {
     if (ctx->kv) {
         // free string memory - not great..
         for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
         }
 
         GGML_FREE(ctx->kv);
@@ -21148,6 +21187,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     return n_kv;
 }
 
+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
     const int idx = gguf_get_or_add_key(ctx, key);
 
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -1161,13 +1161,11 @@ extern "C" {
             enum ggml_prec        prec);
 
     // indirect matrix multiplication
-    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor * as,
-            struct ggml_tensor * ids,
-            int                   id,
-            struct ggml_tensor * b);
+            struct ggml_tensor * b,
+            struct ggml_tensor * ids);
 
     // A: m columns, n rows,
     // B: p columns, n rows,
@@ -2289,6 +2287,9 @@ extern "C" {
     GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
     GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
 
+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
     GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);