llama_cpp 0.14.5 → 0.14.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif
 
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -853,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -4573,21 +4566,32 @@ void ggml_mul_mat_set_prec(
 
 // ggml_mul_mat_id
 
-// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
-// this will allow computing all the used experts in a single matrix multiplication
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor  * as,
-        struct ggml_tensor  * ids,
-        int                   id,
-        struct ggml_tensor  * b) {
-
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
 
     bool is_node = false;
 
@@ -4595,11 +4599,9 @@ struct ggml_tensor * ggml_mul_mat_id(
         is_node = true;
     }
 
-    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    ggml_set_op_params_i32(result, 0, id);
-
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = as;
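
The two hunks above rework ggml_mul_mat_id: the scalar id argument is gone, the argument order changes to (ctx, as, b, ids), and one call now computes every expert listed in ids. A minimal caller-side sketch of the new signature; ctx is assumed to be an initialized ggml_context and all sizes are toy values, not taken from this diff:

    // Hypothetical MoE-style call under the 0.14.7 signature (all sizes assumed).
    const int64_t n_embd = 8, n_ff = 16, n_expert = 4, n_expert_used = 2, n_tokens = 3;

    // as  : [n_embd, n_ff, n_expert]        -- one weight matrix per expert
    // ids : [n_expert_used, n_tokens] (I32) -- experts chosen by the router
    // b   : [n_embd, 1, n_tokens]           -- one row per token, broadcast across the selected experts
    struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);
    struct ggml_tensor * b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens);

    // 0.14.5: cur = ggml_mul_mat_id(ctx, as, ids, id, b);  // one expert per call
    // 0.14.7: all selected experts in a single op
    struct ggml_tensor * c = ggml_mul_mat_id(ctx, as, b, ids); // [n_ff, n_expert_used, n_tokens]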
@@ -10810,6 +10812,28 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif
 
+#if GGML_USE_LLAMAFILE
+    if (src1_cont) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -10841,6 +10865,28 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
+#if GGML_USE_LLAMAFILE
+    if (src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
     const int64_t nr0 = ne01;          // src0 rows
     const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
@@ -10958,11 +11004,6 @@ static void ggml_compute_forward_mul_mat_id(
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));
    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11014,21 @@ static void ggml_compute_forward_mul_mat_id(
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);
 
-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
    // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = src0->ne[2];
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert
 
    char * wdata_src1_end = (src1->type == vec_dot_type) ?
            (char *) params->wdata :
            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows = matrix_row_counts + n_as;           // [n_as][ne11]
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };
 
-    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
    if (params->type == GGML_TASK_TYPE_INIT) {
        if (ith != 0) {
@@ -11012,16 +11052,20 @@ static void ggml_compute_forward_mul_mat_id(
        }
 
        // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
        // group rows by src0 matrix
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                assert(i02 >= 0 && i02 < n_as);
 
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-            matrix_row_counts[row_id] += 1;
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
        }
 
        return;
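
The INIT pass above is essentially a bucket sort: each (expert slot, token) pair in ids is appended to the bucket of the expert it routes to, so the compute pass can then process one expert's rows at a time. A standalone sketch of the same grouping with toy sizes (everything here is illustrative, not the ggml work-buffer code):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    enum { N_AS = 4, N_IDS = 2, N_TOKENS = 3 }; // toy sizes: n_expert, n_expert_used, n_tokens

    struct mmid_row_mapping { int32_t i1; int32_t i2; }; // (expert slot, token row), as in the diff

    int main(void) {
        // Router output: for each token, the indices of the experts it uses.
        const int32_t ids[N_TOKENS][N_IDS] = { {0, 2}, {1, 2}, {0, 1} };

        int64_t counts[N_AS] = {0};
        struct mmid_row_mapping rows[N_AS][N_TOKENS]; // ggml sizes each bucket by tokens (ne12)

        for (int t = 0; t < N_TOKENS; ++t) {
            for (int id = 0; id < N_IDS; ++id) {
                const int32_t e = ids[t][id]; // expert chosen for this (token, slot)
                assert(e >= 0 && e < N_AS);
                rows[e][counts[e]++] = (struct mmid_row_mapping){id, t};
            }
        }

        for (int e = 0; e < N_AS; ++e) // e.g. expert 2 collects rows from tokens 0 and 1
            printf("expert %d: %lld rows\n", e, (long long) counts[e]);
        return 0;
    }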
@@ -11039,15 +11083,13 @@ static void ggml_compute_forward_mul_mat_id(
            continue;
        }
 
-        size_t src0_offset = cur_a*src0->nb[2];
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-        const int64_t nr0 = ne01;           // src0 rows
-        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows
 
        // distribute the thread work across the inner or outer loop based on which one is larger
@@ -11066,13 +11108,11 @@ static void ggml_compute_forward_mul_mat_id(
        const int64_t ir110 = dr1*ith1;
        const int64_t ir111 = MIN(ir110 + dr1, nr1);
 
-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
        // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}
 
        // block-tiling attempt
        const int64_t blck_0 = 16;
@@ -11084,20 +11124,16 @@ static void ggml_compute_forward_mul_mat_id(
        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
-                    const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert
 
-                    // broadcast src0 into src1
-                    //const int64_t i03 = i13/r3;
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index
 
-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
+                    const int64_t  i11 = id % ne11;
+                    const int64_t  i12 = row_mapping.i2; // row index in src1
 
-                    const char * src0_row = (const char *) src0->data + src0_offset;
+                    const int64_t  i1 = id;  // selected expert index
+                    const int64_t  i2 = i12; // row
 
                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                    // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11141,26 @@ static void ggml_compute_forward_mul_mat_id(
                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
                    const char * src1_col = (const char *) wdata +
                        (src1_cont || src1->type != vec_dot_type
-                         ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                         : (i11*nb11 + i12*nb12 + i13*nb13));
+                         ? (i11      + i12*ne11)*row_size
+                         : (i11*nb11 + i12*nb12));
 
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
 
                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                    //}
 
                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                    }
+
                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                }
            }
        }
    }
 
-    #undef MMID_MATRIX_ROW
+#undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
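
The id % ne11 broadcast introduced above is easier to see with concrete numbers (assumed for illustration, not taken from the diff):

    // Worked example of i11 = id % ne11 (toy values).
    // Case 1: b has ne11 == 1, i.e. a single activation row per token shared by
    // all selected experts (n_ids == 2 here):
    //   id = 0 -> i11 = 0 % 1 = 0   // first selected expert reads b[:,0,t]
    //   id = 1 -> i11 = 1 % 1 = 0   // second selected expert reads the same row
    // Case 2: b has ne11 == n_ids == 2, one row per selected expert:
    //   id = 0 -> i11 = 0           // each expert reads its own row of b
    //   id = 1 -> i11 = 1
    // This matches the construction-time check ids->ne[0] % b->ne[1] == 0.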
@@ -18462,7 +18499,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                    const int n_as = src0->ne[2];
                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                } break;
            case GGML_OP_OUT_PROD:
                {
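
The one-line fix above resizes the matrix_rows scratch for the new ids layout: with b now [cols, n_expert_used, n_tokens], the per-expert bucket is bounded by src1->ne[2] (tokens) rather than src1->ne[1]. A small self-contained sketch of the same arithmetic, with assumed sizes and ggml's GGML_PAD definition inlined (each mmid_row_mapping is two int32s, i.e. sizeof(int64_t)):

    #include <stdint.h>
    #include <stdio.h>

    // Same definition as ggml's GGML_PAD (power-of-two alignment).
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main(void) {
        const size_t n_as     = 8;   // n_expert (src0->ne[2]), assumed
        const size_t n_tokens = 512; // src1->ne[2] under the new layout, assumed

        size_t cur = 123;                         // pretend scratch already in use
        cur += GGML_PAD(cur, sizeof(int64_t));    // align (as in the diff)
        cur += n_as * sizeof(int64_t);            // matrix_row_counts: [n_as]
        cur += n_as * n_tokens * sizeof(int64_t); // matrix_rows: [n_as][n_tokens], 8 bytes per mapping
        printf("MUL_MAT_ID work buffer: %zu bytes\n", cur);
        return 0;
    }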
@@ -20550,6 +20587,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    return ok;
 }
 
+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -20862,12 +20925,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
 
            ok = ok && cur != NULL;
 
-            ggml_set_name(cur, ctx->infos[i].name.data);
-
            if (!ok) {
                break;
            }
 
+            ggml_set_name(cur, ctx->infos[i].name.data);
+
            // point the data member to the appropriate location in the binary blob using the tensor infos
            if (!params.no_alloc) {
                //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20962,7 @@ void gguf_free(struct gguf_context * ctx) {
    if (ctx->kv) {
        // free string memory - not great..
        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
        }
 
        GGML_FREE(ctx->kv);
@@ -21148,6 +21187,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
    return n_kv;
 }
 
+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);
 
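A short usage sketch for the new gguf_remove_key; the key names are arbitrary examples, not keys this diff touches:

    // Build a small GGUF context, then drop one key before writing it out.
    struct gguf_context * gctx = gguf_init_empty();

    gguf_set_val_u32(gctx, "general.quantization_version", 2);
    gguf_set_val_str(gctx, "example.scratch_note", "temporary");

    gguf_remove_key(gctx, "example.scratch_note"); // freed and compacted; n_kv shrinks by one
    gguf_remove_key(gctx, "no.such.key");          // absent key: silently a no-op

    // gguf_write_to_file(gctx, "out.gguf", /*only_meta =*/ true);
    gguf_free(gctx);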
@@ -1161,13 +1161,11 @@ extern "C" {
            enum ggml_prec       prec);
 
    // indirect matrix multiplication
-    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
            struct ggml_tensor  * as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);
 
    // A: m columns, n rows,
    // B: p columns, n rows,
@@ -2289,6 +2287,9 @@ extern "C" {
    GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
    GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
 
+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
    // overrides existing values or adds a new one
    GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
    GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);