llama_cpp 0.14.4 → 0.14.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif

+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -338,14 +343,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return GGML_FP32_TO_FP16(x);
 }

-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-    for (int i = 0; i < n; i++) {
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
     }
 }

-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-    int i = 0;
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
 #if defined(__F16C__)
     for (; i + 7 < n; i += 8) {
         __m256 x_vec = _mm256_loadu_ps(x + i);
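Note: the row converters above (and, later in this diff, ggml_quantize_chunk and the ggml_to_float_t / ggml_from_float_t typedefs) widen their element counts from int to int64_t. With a 32-bit count, any call covering more than 2^31 - 1 elements truncates. A minimal standalone sketch of the failure mode, using a hypothetical shape chosen only to overflow:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // ~4.19e9 elements: more than INT32_MAX (2147483647)
        const int64_t n64 = 32000LL * 8192 * 16;
        const int32_t n32 = (int32_t) n64; // truncates; wraps negative on two's-complement targets

        printf("int64_t count: %lld\n", (long long) n64); // 4194304000
        printf("int32_t count: %d\n",   n32);             // negative, so a `for (i = 0; i < n; i++)` body never runs
        return 0;
    }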
@@ -4573,21 +4578,32 @@ void ggml_mul_mat_set_prec(

 // ggml_mul_mat_id

-// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
-// this will allow computing all the used experts in a single matrix multiplication
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor  * as,
-        struct ggml_tensor  * ids,
-        int                   id,
-        struct ggml_tensor  * b) {
-
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast

     bool is_node = false;

@@ -4595,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id(
         is_node = true;
     }

-    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

-    ggml_set_op_params_i32(result, 0, id);
-
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = as;
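Note: this is a breaking change for direct callers of ggml_mul_mat_id: the per-call id argument is gone, b and ids swap positions, and one call now computes every expert listed in ids instead of one expert per call. The result's second dimension accordingly comes from ids->ne[0] rather than from b (the new ne[4] line above). A minimal sketch of the new call shape, with hypothetical MoE dimensions:

    // hypothetical sizes, for illustration only
    const int n_embd        = 4096;  // cols
    const int n_ff          = 11008; // rows produced per expert
    const int n_expert      = 8;
    const int n_expert_used = 2;
    const int n_tokens      = 32;

    // as  -> [cols, rows, n_expert]
    struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
    // ids -> [n_expert_used, n_tokens], i32 expert indices from the router
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);
    // b   -> [cols, n_expert_used, n_tokens]; ne[1] == 1 also works and is broadcast
    struct ggml_tensor * b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);

    // old: ggml_mul_mat_id(ctx, as, ids, id, b) computed a single expert per call
    struct ggml_tensor * c   = ggml_mul_mat_id(ctx, as, b, ids); // [n_ff, n_expert_used, n_tokens]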
@@ -10810,6 +10824,28 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif

+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type)) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -10841,6 +10877,30 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);

+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + ggml_row_size(vec_dot_type,
+                                         nb12/ggml_type_size(src1->type)*i12 +
+                                         nb13/ggml_type_size(src1->type)*i13),
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
     const int64_t nr0 = ne01;          // src0 rows
     const int64_t nr1 = ne1*ne12*ne13; // src1 rows

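Note: both insertions follow the same try-then-fall-through shape: llamafile_sgemm reports through its return value whether it supports the given type/stride combination, and on false the goto skips the early return so the pre-existing ggml path still runs; the `#undef GGML_USE_LLAMAFILE` under `__ARM_FEATURE_MATMUL_INT8` near the top disables this fast path entirely on CPUs with the ARM int8 matmul extension. Distilled to its control flow (names here are illustrative, not from the diff):

    #include <stdbool.h>

    // stand-in for llamafile_sgemm: false means "unsupported case, use the fallback"
    static bool fast_gemm(long m, long n, long k) { (void)m; (void)n; (void)k; return false; }

    static void mul_mat(long m, long n, long k) {
    #ifdef USE_FAST_GEMM
        if (!fast_gemm(m, n, k))
            goto UseGenericGemm; // unsupported: fall through to the reference path
        return;                  // fast path already wrote dst
    UseGenericGemm:;
    #endif
        // reference GEMM continues here ...
        (void)m; (void)n; (void)k;
    }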
@@ -10958,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id(
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);

-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
     // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = src0->ne[2];
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert

     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
             (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows = matrix_row_counts + n_as;           // [n_as][ne11]
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };

-    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
@@ -11012,16 +11066,20 @@ static void ggml_compute_forward_mul_mat_id(
         }

         // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
         // group rows by src0 matrix
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);

-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-            matrix_row_counts[row_id] += 1;
+                assert(i02 >= 0 && i02 < n_as);
+
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
         }

         return;
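Note: INIT now buckets every (expert slot, token) pair from ids under the expert it selects, replacing the old one-id-per-call row list, so the compute phase can walk each expert's rows contiguously. The same counting-then-bucketing idea in standalone form (sizes and the ids table are hypothetical):

    #include <assert.h>
    #include <stdint.h>

    struct row_mapping { int32_t i1; int32_t i2; }; // (expert slot, token)

    enum { N_AS = 4, N_IDS = 2, N_TOKENS = 3 }; // n_expert, n_expert_used, tokens

    int main(void) {
        // ids[t][e]: expert chosen for slot e of token t
        const int32_t ids[N_TOKENS][N_IDS] = { {0, 2}, {2, 3}, {0, 2} };

        int64_t counts[N_AS] = {0};
        struct row_mapping rows[N_AS][N_TOKENS * N_IDS];

        for (int32_t t = 0; t < N_TOKENS; ++t) {
            for (int32_t e = 0; e < N_IDS; ++e) {
                const int32_t a = ids[t][e];
                assert(a >= 0 && a < N_AS);
                rows[a][counts[a]++] = (struct row_mapping) {e, t};
            }
        }
        // counts is now {2, 0, 3, 1}; expert 2 owns (1,0), (0,1), (1,2)
        return 0;
    }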
@@ -11039,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }

-        size_t src0_offset = cur_a*src0->nb[2];
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;

         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);

-        const int64_t nr0 = ne01;           // src0 rows
-        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows

         // distribute the thread work across the inner or outer loop based on which one is larger

@@ -11066,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id(
         const int64_t ir110 = dr1*ith1;
         const int64_t ir111 = MIN(ir110 + dr1, nr1);

-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
         // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}

         // block-tiling attempt
         const int64_t blck_0 = 16;
@@ -11084,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id(
         for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
             for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
-                    const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert

-                    // broadcast src0 into src1
-                    //const int64_t i03 = i13/r3;
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index

-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
+                    const int64_t  i11 = id % ne11;
+                    const int64_t  i12 = row_mapping.i2; // row index in src1

-                    const char * src0_row = (const char *) src0->data + src0_offset;
+                    const int64_t  i1 = id;  // selected expert index
+                    const int64_t  i2 = i12; // row

                     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11155,26 @@ static void ggml_compute_forward_mul_mat_id(
                     // TODO: this is a bit of a hack, we should probably have a better way to handle this
                     const char * src1_col = (const char *) wdata +
                         (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12 + i13*nb13));
+                         ? (i11      + i12*ne11)*row_size
+                         : (i11*nb11 + i12*nb12));

-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));

                     //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                     //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                     //}

                     for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                     }
+
                     memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                 }
             }
         }
     }

-    #undef MMID_MATRIX_ROW
+#undef MMID_MATRIX_ROW
 }

 // ggml_compute_forward_out_prod
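Note: with the mapping in place, the old flattened i13/i12/_i11 arithmetic disappears from the compute loop: each bucket entry supplies the expert slot (row_mapping.i1, reused as the dst row i1 within a token's block of n_expert_used outputs) and the source token (row_mapping.i2, reused as i2), while i11 = id % ne11 implements the broadcast of b when it carries fewer rows per token than ids has slots.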
@@ -18462,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                 const int n_as = src0->ne[2];
                 cur += GGML_PAD(cur, sizeof(int64_t));       // align
                 cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+                cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
             } break;
         case GGML_OP_OUT_PROD:
             {
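Note: the work-buffer sizing still charges sizeof(int64_t) per matrix_rows entry even though the entries are now struct mmid_row_mapping; two int32_t fields also occupy 8 bytes, so the budget is unchanged, and the ne[1] → ne[2] switch tracks b gaining a token dimension. A compile-time guard along these lines (not present in the diff) would make that coupling explicit:

    _Static_assert(sizeof(struct mmid_row_mapping) == sizeof(int64_t),
                   "cplan sizing assumes 8-byte row mappings");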
@@ -20331,11 +20382,11 @@ size_t ggml_quantize_chunk(
         enum ggml_type   type,
         const float    * src,
         void           * dst,
-        int              start,
-        int              nrows,
-        int              n_per_row,
+        int64_t          start,
+        int64_t          nrows,
+        int64_t          n_per_row,
         const float    * imatrix) {
-    const int n = nrows * n_per_row;
+    const int64_t n = (int64_t) nrows * n_per_row;

     if (ggml_quantize_requires_imatrix(type)) {
         GGML_ASSERT(imatrix != NULL);
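Note: same widening theme as the row converters above: `nrows * n_per_row` was previously evaluated in 32-bit int, so a hypothetical chunk of 655360 rows of 8192 elements each (about 5.4e9 values) would overflow before the assignment; promoting the parameters and the product to int64_t removes that ceiling.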
@@ -20550,6 +20601,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
     return ok;
 }

+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

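Note: gguf_free_kv lifts the per-KV cleanup verbatim out of gguf_free (see the -20899 hunk below) so the same logic now backs both gguf_free and the new gguf_remove_key instead of being duplicated.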
@@ -20862,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

             ok = ok && cur != NULL;

-            ggml_set_name(cur, ctx->infos[i].name.data);
-
             if (!ok) {
                 break;
             }

+            ggml_set_name(cur, ctx->infos[i].name.data);
+
             // point the data member to the appropriate location in the binary blob using the tensor infos
             if (!params.no_alloc) {
                 //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
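Note: this reorder fixes a latent NULL dereference: `ok` goes false when `cur` is NULL (among other failures), yet the old code passed `cur` to ggml_set_name before the `!ok` check. The tensor is now named only after the early exit.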
@@ -20899,31 +20976,7 @@ void gguf_free(struct gguf_context * ctx) {
     if (ctx->kv) {
         // free string memory - not great..
         for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
         }

         GGML_FREE(ctx->kv);
@@ -21148,6 +21201,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     return n_kv;
 }

+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
     const int idx = gguf_get_or_add_key(ctx, key);

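Note: gguf_remove_key frees the KV in place, shifts the tail of the array down, shrinks it with realloc, and decrements header.n_kv; a missing key is a silent no-op. A minimal usage sketch (the key names are illustrative only):

    #include "ggml.h"

    // rewrite metadata before saving: drop one key, set another
    void retag(struct gguf_context * ctx) {
        gguf_remove_key(ctx, "general.source.url");        // no-op if the key is absent
        gguf_set_val_str(ctx, "general.name", "my-model");
    }

The remaining hunks are the public-header side (ggml.h in the vendored sources) of the changes already seen above.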
@@ -332,8 +332,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);

     struct ggml_object;
     struct ggml_context;
@@ -1161,13 +1161,11 @@ extern "C" {
             enum ggml_prec       prec);

     // indirect matrix multiplication
-    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor  * as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);

     // A: m columns, n rows,
     // B: p columns, n rows,
@@ -2210,9 +2208,9 @@ extern "C" {
             enum ggml_type   type,
             const float    * src,
             void           * dst,
-            int              start,
-            int              nrows,
-            int              n_per_row,
+            int64_t          start,
+            int64_t          nrows,
+            int64_t          n_per_row,
             const float    * imatrix);

     //
@@ -2289,6 +2287,9 @@ extern "C" {
     GGML_API char         * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
     GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);

+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
     GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
@@ -2377,8 +2378,8 @@ extern "C" {
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);
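Note: widening ggml_to_float_t / ggml_from_float_t keeps the conversion-callback typedefs consistent with the int64_t row helpers and ggml_quantize_chunk above; any out-of-tree code that installs its own converters into the type traits must update those function signatures to match.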