llama_cpp 0.14.5 → 0.14.7

@@ -4,6 +4,7 @@
  #include "ggml-impl.h"
  #include "ggml-quants.h"
  #include "ggml.h"
+ #include "sgemm.h"
 
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
  #include <unistd.h>
  #endif
 
+ #ifdef __ARM_FEATURE_MATMUL_INT8
+ #undef GGML_USE_LLAMAFILE
+ #endif
+
  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid hundreds of casts
  // we should just be careful :)
@@ -853,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
  // simd mappings
  //
 
- #if defined(__ARM_NEON)
- #if !defined(__aarch64__)
-
- // 64-bit compatibility
-
- inline static float vaddvq_f32(float32x4_t v) {
-     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
- }
-
- #endif
- #endif
-
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
  // we then implement the fundamental computation operations below using only these macros
  // adding support for new architectures requires to define the corresponding SIMD macros
@@ -4573,21 +4566,32 @@ void ggml_mul_mat_set_prec(
 
  // ggml_mul_mat_id
 
- // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
- //       this will allow computing all the used experts in a single matrix multiplication
+ /*
+     c = ggml_mul_mat_id(ctx, as, b, ids);
+
+     as  -> [cols, rows, n_expert]
+     ids -> [n_experts_used, n_tokens] (i32)
+     b   -> [cols, n_expert_used, n_tokens]
+     c   -> [cols, n_expert_used, n_tokens]
+
+     in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+     c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+ */
  struct ggml_tensor * ggml_mul_mat_id(
          struct ggml_context * ctx,
          struct ggml_tensor  * as,
-         struct ggml_tensor  * ids,
-         int                   id,
-         struct ggml_tensor  * b) {
-
+         struct ggml_tensor  * b,
+         struct ggml_tensor  * ids) {
+     GGML_ASSERT(!ggml_is_transposed(as));
      GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+     GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+     GGML_ASSERT(b->ne[3] == 1); // b is 3d
      GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-     GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
-     GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-     GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+     GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
      GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+     GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
 
      bool is_node = false;
 
@@ -4595,11 +4599,9 @@ struct ggml_tensor * ggml_mul_mat_id(
          is_node = true;
      }
 
-     const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+     const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
      struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-     ggml_set_op_params_i32(result, 0, id);
-
      result->op   = GGML_OP_MUL_MAT_ID;
      result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
      result->src[0] = as;
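
The user-facing change here is the new argument order and the removal of the scalar `id`: one call now computes every expert listed in `ids`. A minimal usage sketch against the new signature (the wrapper name and dimension names are illustrative, not from this diff):

    #include "ggml.h"

    // Hypothetical MoE routing step: `as` stacks one weight matrix per expert,
    // `ids` holds the experts the router picked for each token.
    struct ggml_tensor * route_experts(struct ggml_context * ctx,
                                       struct ggml_tensor  * as,   // [n_embd, n_rows, n_expert]
                                       struct ggml_tensor  * cur,  // [n_embd, n_expert_used, n_tokens]
                                       struct ggml_tensor  * ids)  // [n_expert_used, n_tokens], GGML_TYPE_I32
    {
        // old API computed one expert per call, selected by a scalar id:
        //     ggml_mul_mat_id(ctx, as, ids, id, cur);
        // new API computes all experts listed in ids in a single op:
        return ggml_mul_mat_id(ctx, as, cur, ids);
    }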
@@ -10810,6 +10812,28 @@ static void ggml_compute_forward_mul_mat(
      }
  #endif
 
+ #if GGML_USE_LLAMAFILE
+     if (src1_cont) {
+         for (int64_t i13 = 0; i13 < ne13; i13++)
+             for (int64_t i12 = 0; i12 < ne12; i12++)
+                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                      nb01/ggml_type_size(src0->type),
+                                      (const char *)src1->data + i12*nb12 + i13*nb13,
+                                      nb11/ggml_type_size(src1->type),
+                                      (char *)dst->data + i12*nb2 + i13*nb3,
+                                      nb1/ggml_type_size(dst->type),
+                                      ith, nth,
+                                      params->type,
+                                      src0->type,
+                                      src1->type,
+                                      dst->type))
+                     goto UseGgmlGemm1;
+         return;
+     }
+ UseGgmlGemm1:;
+ #endif
+
      if (params->type == GGML_TASK_TYPE_INIT) {
          if (ith != 0) {
              return;
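
This dispatch is easy to misread: `llamafile_sgemm` returns false for a slice it cannot handle, the `goto` then abandons the fast path for the whole tensor, and the early `return` fires only once every 2-D slice has been handled. A stripped-down, self-contained sketch of the same control flow (all names here are hypothetical stand-ins):

    #include <stdbool.h>
    #include <stdio.h>

    static bool fast_gemm(int slice) { return slice != 2; /* pretend slice 2 is unsupported */ }
    static void slow_gemm(void)      { puts("reference gemm"); }

    static void mul_mat_dispatch(int n_slices) {
        for (int slice = 0; slice < n_slices; slice++) {
            if (!fast_gemm(slice)) {
                goto UseGgmlGemm; // first unsupported slice abandons the fast path
            }
        }
        return;                   // every slice handled: skip the fallback entirely
    UseGgmlGemm:;
        slow_gemm();              // otherwise fall through to the reference kernel
    }

    int main(void) {
        mul_mat_dispatch(2); // fast path handles all slices, prints nothing
        mul_mat_dispatch(4); // slice 2 is refused -> "reference gemm"
        return 0;
    }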
@@ -10841,6 +10865,28 @@ static void ggml_compute_forward_mul_mat(
      const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
      const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
+ #if GGML_USE_LLAMAFILE
+     if (src1->type != vec_dot_type) {
+         for (int64_t i13 = 0; i13 < ne13; i13++)
+             for (int64_t i12 = 0; i12 < ne12; i12++)
+                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                      nb01/ggml_type_size(src0->type),
+                                      (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
+                                      row_size/ggml_type_size(vec_dot_type),
+                                      (char *)dst->data + i12*nb2 + i13*nb3,
+                                      nb1/ggml_type_size(dst->type),
+                                      ith, nth,
+                                      params->type,
+                                      src0->type,
+                                      vec_dot_type,
+                                      dst->type))
+                     goto UseGgmlGemm2;
+         return;
+     }
+ UseGgmlGemm2:;
+ #endif
+
      const int64_t nr0 = ne01;          // src0 rows
      const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
@@ -10958,11 +11004,6 @@ static void ggml_compute_forward_mul_mat_id(
      enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
      ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
-     GGML_ASSERT(ne0 == ne01);
-     GGML_ASSERT(ne1 == ne11);
-     GGML_ASSERT(ne2 == ne12);
-     GGML_ASSERT(ne3 == ne13);
-
      // we don't support permuted src0 or src1
      GGML_ASSERT(nb00 == ggml_type_size(type));
      GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11014,21 @@ static void ggml_compute_forward_mul_mat_id(
      GGML_ASSERT(nb1 <= nb2);
      GGML_ASSERT(nb2 <= nb3);
 
-     // broadcast is not supported with mmid
-     assert(ne12 == 1);
-     assert(ne13 == 1);
-
      // row groups
-     const int id   = ggml_get_op_params_i32(dst, 0);
-     const int n_as = src0->ne[2];
+     const int n_ids = ids->ne[0]; // n_expert_used
+     const int n_as  = ne02;       // n_expert
 
      char * wdata_src1_end = (src1->type == vec_dot_type) ?
              (char *) params->wdata :
              (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
-     int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-     int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+     struct mmid_row_mapping {
+         int32_t i1;
+         int32_t i2;
+     };
 
-     #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+     int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+     struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
      if (params->type == GGML_TASK_TYPE_INIT) {
          if (ith != 0) {
@@ -11012,16 +11052,20 @@ static void ggml_compute_forward_mul_mat_id(
          }
 
          // initialize matrix_row_counts
-         GGML_ASSERT(wdata == wdata_src1_end);
          memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
          // group rows by src0 matrix
-         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-             const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+         for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+             for (int id = 0; id < n_ids; ++id) {
+                 const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                 assert(i02 >= 0 && i02 < n_as);
 
-             GGML_ASSERT(row_id >= 0 && row_id < n_as);
-             MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-             matrix_row_counts[row_id] += 1;
+                 MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                 matrix_row_counts[i02] += 1;
+             }
          }
 
          return;
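
The grouping pass now records which expert slot of which token produced each row, rather than a bare row index. A self-contained toy version of the same pass, using a hypothetical 4-expert model where 2 tokens each use 2 experts, shows what lands in the per-expert row lists:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct mmid_row_mapping { int32_t i1; int32_t i2; }; // (expert slot, token)

    int main(void) {
        enum { n_as = 4, n_ids = 2, n_tokens = 2 };
        // ids[token][slot]: token 0 uses experts 1 and 3, token 1 uses 1 and 0
        const int32_t ids[n_tokens][n_ids] = { {1, 3}, {1, 0} };

        int64_t counts[n_as];
        struct mmid_row_mapping rows[n_as][n_tokens * n_ids];
        memset(counts, 0, sizeof(counts));

        for (int32_t iid1 = 0; iid1 < n_tokens; ++iid1) {
            for (int32_t id = 0; id < n_ids; ++id) {
                const int32_t i02 = ids[iid1][id]; // expert chosen by this slot
                assert(i02 >= 0 && i02 < n_as);
                rows[i02][counts[i02]++] = (struct mmid_row_mapping){id, iid1};
            }
        }

        // expert 1 owns two rows: slot 0 of token 0 and slot 0 of token 1
        printf("expert 1: %lld rows\n", (long long) counts[1]); // prints 2
        return 0;
    }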
@@ -11039,15 +11083,13 @@ static void ggml_compute_forward_mul_mat_id(
              continue;
          }
 
-         size_t src0_offset = cur_a*src0->nb[2];
+         const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
          const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
          const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-         const int64_t nr0 = ne01;           // src0 rows
-         const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
-         //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+         const int64_t nr0 = ne01; // src0 rows
+         const int64_t nr1 = cne1; // src1 rows
 
          // distribute the thread work across the inner or outer loop based on which one is larger
 
@@ -11066,13 +11108,11 @@ static void ggml_compute_forward_mul_mat_id(
          const int64_t ir110 = dr1*ith1;
          const int64_t ir111 = MIN(ir110 + dr1, nr1);
 
-         //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
          // threads with no work simply yield (not sure if it helps)
-         if (ir010 >= ir011 || ir110 >= ir111) {
-             sched_yield();
-             continue;
-         }
+         //if (ir010 >= ir011 || ir110 >= ir111) {
+         //    sched_yield();
+         //    continue;
+         //}
 
          // block-tiling attempt
          const int64_t blck_0 = 16;
@@ -11084,20 +11124,16 @@ static void ggml_compute_forward_mul_mat_id(
          for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
              for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                     const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
-                     const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
-                     const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                     const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                     const int64_t _i12 = ir1; // logical row index for this expert
 
-                     // broadcast src0 into src1
-                     //const int64_t i03 = i13/r3;
-                     //const int64_t i02 = i12/r2;
+                     struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                     const int id       = row_mapping.i1; // selected expert index
 
-                     const int64_t i1 = i11;
-                     const int64_t i2 = i12;
-                     const int64_t i3 = i13;
+                     const int64_t  i11 = id % ne11;
+                     const int64_t  i12 = row_mapping.i2; // row index in src1
 
-                     const char * src0_row = (const char *) src0->data + src0_offset;
+                     const int64_t  i1 = id;  // selected expert index
+                     const int64_t  i2 = i12; // row
 
                      // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                      // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11141,26 @@ static void ggml_compute_forward_mul_mat_id(
                      // TODO: this is a bit of a hack, we should probably have a better way to handle this
                      const char * src1_col = (const char *) wdata +
                          (src1_cont || src1->type != vec_dot_type
-                         ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                         : (i11*nb11 + i12*nb12 + i13*nb13));
+                         ? (i11      + i12*ne11)*row_size
+                         : (i11*nb11 + i12*nb12));
 
-                     float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+                     float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
 
                      //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                      //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                      //}
 
                      for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                         vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+                         vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                      }
+
                      memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                  }
              }
          }
      }
 
-     #undef MMID_MATRIX_ROW
+ #undef MMID_MATRIX_ROW
  }
 
  // ggml_compute_forward_out_prod
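
The `i11 = id % ne11` line above implements the broadcast described in the new doc comment (`b[:,i%r,t]`): when `src1` carries one row per used expert, each expert slot reads its own row; when `src1` is broadcast with `ne11 == 1`, every slot reads row 0. A trivial illustration:

    #include <stdio.h>

    int main(void) {
        const int n_expert_used = 2;
        for (int ne11 = 1; ne11 <= n_expert_used; ne11++) {
            for (int id = 0; id < n_expert_used; id++) {
                // ne11 == 1 maps every expert slot to src1 row 0 (broadcast);
                // ne11 == n_expert_used gives each slot its own row
                printf("ne11=%d id=%d -> i11=%d\n", ne11, id, id % ne11);
            }
        }
        return 0;
    }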
@@ -18462,7 +18499,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                      const int n_as = src0->ne[2];
                      cur += GGML_PAD(cur, sizeof(int64_t));       // align
                      cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                     cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+                     cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                  } break;
              case GGML_OP_OUT_PROD:
                  {
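
The workspace math still holds after the type change: `matrix_rows` now stores `struct mmid_row_mapping` entries (two `int32_t`), which are exactly as wide as the `int64_t` slots the plan reserves, and the count switches from `src1->ne[1]` to `src1->ne[2]` because with the new layout the token dimension of `src1` is `ne[2]` rather than `ne[1]`. A C11 sketch of the size invariant (the struct is redeclared here only for the check):

    #include <assert.h>
    #include <stdint.h>

    struct mmid_row_mapping { int32_t i1; int32_t i2; };

    // one mapping entry must fit the int64_t-sized slot ggml_graph_plan reserves
    static_assert(sizeof(struct mmid_row_mapping) == sizeof(int64_t),
                  "mmid_row_mapping must not outgrow its reserved slot");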
@@ -20550,6 +20587,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
      return ok;
  }
 
+ static void gguf_free_kv(struct gguf_kv * kv) {
+     if (kv->key.data) {
+         GGML_FREE(kv->key.data);
+     }
+
+     if (kv->type == GGUF_TYPE_STRING) {
+         if (kv->value.str.data) {
+             GGML_FREE(kv->value.str.data);
+         }
+     }
+
+     if (kv->type == GGUF_TYPE_ARRAY) {
+         if (kv->value.arr.data) {
+             if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                 for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                     struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                     if (str->data) {
+                         GGML_FREE(str->data);
+                     }
+                 }
+             }
+             GGML_FREE(kv->value.arr.data);
+         }
+     }
+ }
+
  struct gguf_context * gguf_init_empty(void) {
      struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -20862,12 +20925,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
              ok = ok && cur != NULL;
 
-             ggml_set_name(cur, ctx->infos[i].name.data);
-
              if (!ok) {
                  break;
              }
 
+             ggml_set_name(cur, ctx->infos[i].name.data);
+
              // point the data member to the appropriate location in the binary blob using the tensor infos
              if (!params.no_alloc) {
                  //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20962,7 @@ void gguf_free(struct gguf_context * ctx) {
      if (ctx->kv) {
          // free string memory - not great..
          for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-             struct gguf_kv * kv = &ctx->kv[i];
-
-             if (kv->key.data) {
-                 GGML_FREE(kv->key.data);
-             }
-
-             if (kv->type == GGUF_TYPE_STRING) {
-                 if (kv->value.str.data) {
-                     GGML_FREE(kv->value.str.data);
-                 }
-             }
-
-             if (kv->type == GGUF_TYPE_ARRAY) {
-                 if (kv->value.arr.data) {
-                     if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                         for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                             struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                             if (str->data) {
-                                 GGML_FREE(str->data);
-                             }
-                         }
-                     }
-                     GGML_FREE(kv->value.arr.data);
-                 }
-             }
+             gguf_free_kv(&ctx->kv[i]);
          }
 
          GGML_FREE(ctx->kv);
@@ -21148,6 +21187,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
      return n_kv;
  }
 
+ void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+     const int idx = gguf_find_key(ctx, key);
+     if (idx >= 0) {
+         const int n_kv = gguf_get_n_kv(ctx);
+         gguf_free_kv(&ctx->kv[idx]);
+         for (int i = idx; i < n_kv-1; ++i) {
+             ctx->kv[i] = ctx->kv[i+1];
+         }
+         ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+         ctx->header.n_kv--;
+     }
+ }
+
  void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
      const int idx = gguf_get_or_add_key(ctx, key);
 
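
`gguf_remove_key` frees the key-value pair in place and shifts the remaining entries down, so any previously obtained KV indices are invalidated by the call. A hedged usage sketch (the wrapper name, file name, and key are made up; the function is a no-op when the key is absent):

    #include "ggml.h"

    static void strip_url_metadata(const char * fname) {
        struct gguf_init_params params = {
            /*.no_alloc =*/ false,
            /*.ctx      =*/ NULL,
        };
        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (ctx == NULL) {
            return;
        }

        gguf_remove_key(ctx, "general.url"); // example key only

        gguf_free(ctx);
    }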
@@ -1161,13 +1161,11 @@ extern "C" {
              enum ggml_prec       prec);
 
      // indirect matrix multiplication
-     // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
      GGML_API struct ggml_tensor * ggml_mul_mat_id(
              struct ggml_context * ctx,
              struct ggml_tensor  * as,
-             struct ggml_tensor  * ids,
-             int                   id,
-             struct ggml_tensor  * b);
+             struct ggml_tensor  * b,
+             struct ggml_tensor  * ids);
 
      // A: m columns, n rows,
      // B: p columns, n rows,
@@ -2289,6 +2287,9 @@ extern "C" {
      GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
      GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
 
+     // removes key if it exists
+     GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
      // overrides existing values or adds a new one
      GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
      GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);