llama_cpp 0.14.5 → 0.14.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }

  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
+ #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
  const __m256i zero = _mm256_setzero_si256();
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
  return _mm256_cvtepi32_ps(summed_pairs);
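
Note: the tightened guard matters because _mm256_dpbusd_epi32 operates on 256-bit registers. AVX512VNNI by itself only guarantees the 512-bit form of the instruction; the 256-bit encoding additionally requires AVX512VL, or the separate AVX-VNNI extension. A minimal sketch of the same dispatch is shown below; the AVX2 fallback body is illustrative and not copied from the package.

    #include <immintrin.h>

    // Illustrative helper mirroring the guard above. The #else branch is an
    // assumed generic AVX2 path, not the package's exact fallback code.
    static inline __m256i dot_us8_pairs_i32(const __m256i ax, const __m256i sy) {
    #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
        // 256-bit VNNI: u8 x s8 multiply-accumulate straight into 32-bit lanes
        return _mm256_dpbusd_epi32(_mm256_setzero_si256(), ax, sy);
    #else
        // AVX2 fallback: widen to 16-bit products, then pairwise-add into 32-bit lanes
        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
        return _mm256_madd_epi16(_mm256_set1_epi16(1), dot);
    #endif
    }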
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
  #define SYCL_SCALE_BLOCK_SIZE 256
  #define SYCL_CLAMP_BLOCK_SIZE 256
  #define SYCL_ROPE_BLOCK_SIZE 256
- #define SYCL_SOFT_MAX_BLOCK_SIZE 1024
  #define SYCL_ALIBI_BLOCK_SIZE 32
  #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
  #define SYCL_QUANTIZE_BLOCK_SIZE 256
@@ -13080,11 +13079,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
  const int nrows_y, const float scale, const float max_bias,
  dpct::queue_ptr stream) {
  int nth = WARP_SIZE;
- while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ int max_block_size = g_work_group_size;
+ while (nth < ncols_x && nth < max_block_size) nth *= 2;
+ if (nth>max_block_size) nth = max_block_size;
+
  const sycl::range<3> block_dims(1, 1, nth);
  const sycl::range<3> block_nums(1, 1, nrows_x);
  const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
- static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

  const uint32_t n_head_kv = nrows_x/nrows_y;
  const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@@ -13094,6 +13095,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *

  const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
  if (n_local_scratch*sizeof(float) < local_mem_size) {
+ if (ncols_x > max_block_size) {
+ soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ return;
+ }
  switch (ncols_x) {
  case 32:
  soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
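
Note: the previous launcher sized its work-group from a hard-coded SYCL_SOFT_MAX_BLOCK_SIZE of 1024, which some SYCL devices cannot satisfy. The patched version clamps nth to g_work_group_size (a backend global holding the device limit) and routes oversized rows (ncols_x > max_block_size) to the generic <true, 0, 0> submitter. A hedged sketch of querying and applying that limit with standard SYCL follows; the function and variable names are illustrative.

    #include <sycl/sycl.hpp>
    #include <algorithm>

    // Sketch only: grow the block size in powers of two, but never past the
    // device's maximum work-group size (the quantity g_work_group_size caches).
    static int pick_soft_max_block_size(sycl::queue & q, int ncols, int warp_size) {
        const int max_wg = (int) q.get_device()
            .get_info<sycl::info::device::max_work_group_size>();
        int nth = warp_size;
        while (nth < ncols && nth < max_wg) nth *= 2;
        return std::min(nth, max_wg);
    }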
@@ -15989,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
  static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  const ggml_tensor *src1,
  ggml_tensor *dst) try {
- #if 0
- ggml_sycl_mul_mat_id_sycl(dst);
- // TODO: mmq/mmv support
- #endif
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
+ "mul_mat_id does not support split buffers");
+ const ggml_tensor *ids = dst->src[2];
+ const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];

- const int64_t nb11 = src1->nb[1];
- const int64_t nb1 = dst->nb[1];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb1 = dst->nb[1];

- const struct ggml_tensor * ids = src0;
- const int32_t id = ((int32_t *) dst->op_params)[0];
- const int32_t n_as = ((int32_t *) dst->op_params)[1];
+ const int32_t id = ((int32_t *)dst->op_params)[0];
+ const int32_t n_as = src0->ne[2];

  std::vector<char> ids_host(ggml_nbytes(ids));
+ const char *ids_dev = (const char *)ids->data;

- const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
-
- if (ids->backend == GGML_BACKEND_TYPE_GPU) {
- const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
- SYCL_CHECK(CHECK_TRY_ERROR(
- stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait()));
- // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
- } else {
- memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
- }
+ SYCL_CHECK(CHECK_TRY_ERROR(
+ stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+ SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));

- const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
- const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+ const ggml_tensor_extra_gpu *src0_extra =
+ (const ggml_tensor_extra_gpu *)src0->extra;
+ const ggml_tensor_extra_gpu *src1_extra =
+ (const ggml_tensor_extra_gpu *)src1->extra;
+ const ggml_tensor_extra_gpu *dst_extra =
+ (const ggml_tensor_extra_gpu *)dst->extra;

+ ggml_tensor_extra_gpu src0_row_extra;
  ggml_tensor_extra_gpu src1_row_extra;
  ggml_tensor_extra_gpu dst_row_extra;

+ ggml_tensor src0_row = *src0;
  ggml_tensor src1_row = *src1;
  ggml_tensor dst_row = *dst;

  src1_row.backend = GGML_BACKEND_TYPE_GPU;
  dst_row.backend = GGML_BACKEND_TYPE_GPU;

+ src0_row.extra = &src0_row_extra;
  src1_row.extra = &src1_row_extra;
  dst_row.extra = &dst_row_extra;

- char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
- (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
- char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
- (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+ char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
+ ? (char *)src0->data
+ : (char *)src0_extra->data_device[g_main_device];
+ char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
+ ? (char *)src1->data
+ : (char *)src1_extra->data_device[g_main_device];
+ char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
+ ? (char *)dst->data
+ : (char *)dst_extra->data_device[g_main_device];

- if (src1->ne[1] == 1) {
- GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
- GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
+ src0_row.ne[2] = 1;
+ src0_row.ne[3] = 1;
+ src0_row.nb[3] = src0->nb[2];

+ if (src1->ne[1] == 1) {
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- //int32_t row_id;
- //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0]));
- //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0]));
-
- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+ const int32_t row_id =
+ *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
+ id * ids->nb[0]);

  GGML_ASSERT(row_id >= 0 && row_id < n_as);

- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
- src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
- src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+ src0_row_extra.data_device[g_main_device] =
+ src0_original + row_id * src0->nb[2];
+ src1_row_extra.data_device[g_main_device] =
+ src1_original + i01 * src1->nb[1];
+ dst_row_extra.data_device[g_main_device] =
+ dst_original + i01 * dst->nb[1];

- dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
- dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
- ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+ ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
  }
  } else {
  sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
@@ -16065,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  dst_row_extra.data_device[g_main_device] = dst_contiguous.get();

  for (int32_t row_id = 0; row_id < n_as; ++row_id) {
- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
  int64_t num_src1_rows = 0;
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
  const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -16079,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

  SYCL_CHECK(CHECK_TRY_ERROR(
  stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
- src1_original + i01 * nb11, nb11).wait()));
+ src1_original + i01 * nb11, nb11)));
  num_src1_rows++;
  }

@@ -16087,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  continue;
  }

+ src0_row_extra.data_device[g_main_device] =
+ src0_original + row_id * src0->nb[2];
+
  src1_row.ne[1] = num_src1_rows;
  dst_row.ne[1] = num_src1_rows;

@@ -16098,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  dst_row.nb[2] = num_src1_rows*nb1;
  dst_row.nb[3] = num_src1_rows*nb1;

- ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+ ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);

  num_src1_rows = 0;
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -16112,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

  SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
  dst_original + i01 * nb1,
- dst_contiguous.get() + num_src1_rows * nb1, nb1).wait()));
+ dst_contiguous.get() + num_src1_rows * nb1, nb1)));
  num_src1_rows++;
  }
  }
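
Note: the SYCL mul_mat_id path no longer fetches each expert from a separate dst->src[row_id + 2] tensor; all expert matrices now live in one 3D src0, so a single expert's weights are reached by the byte offset row_id * src0->nb[2] into the same buffer (src0_row.ne[2] and ne[3] are forced to 1 so the slice behaves like an ordinary 2D matrix). A small sketch of that addressing, with an invented helper name:

    #include "ggml.h"
    #include <stddef.h>
    #include <stdint.h>

    // Sketch: view expert `row_id` of a 3D weight tensor as[ne0, ne1, n_expert]
    // by pointer arithmetic, as the patched code does with
    // src0_original + row_id * src0->nb[2]. The helper name is made up.
    static const char * expert_matrix(const struct ggml_tensor * as,
                                      const char * as_data, int32_t row_id) {
        // nb[2] is the byte stride between consecutive expert matrices
        return as_data + (size_t) row_id * as->nb[2];
    }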
@@ -16814,11 +16825,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
  const dpct::queue_ptr stream = g_syclStreams[ctx->device][0];
  SYCL_CHECK(
  CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
-
+ char* host_buf = (char*)malloc(size);
+ memcpy(host_buf, data, size);
  SYCL_CHECK(
  CHECK_TRY_ERROR((*stream)
- .memcpy((char *)tensor->data + offset, data, size)
+ .memcpy((char *)tensor->data + offset, host_buf, size)
  .wait()));
+ free(host_buf);
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -17739,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons

  GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
  const int min_batch_size = 32;
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+ return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
  GGML_UNUSED(backend);
  }

@@ -4,6 +4,7 @@
  #include "ggml-impl.h"
  #include "ggml-quants.h"
  #include "ggml.h"
+ #include "sgemm.h"

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
  #include <unistd.h>
  #endif

+ #ifdef __ARM_FEATURE_MATMUL_INT8
+ #undef GGML_USE_LLAMAFILE
+ #endif
+
  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid hundreds of casts
  // we should just be careful :)
@@ -4573,21 +4578,32 @@ void ggml_mul_mat_set_prec(

  // ggml_mul_mat_id

- // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
- // this will allow computing all the used experts in a single matrix multiplication
+ /*
+ c = ggml_mul_mat_id(ctx, as, b, ids);
+
+ as -> [cols, rows, n_expert]
+ ids -> [n_experts_used, n_tokens] (i32)
+ b -> [cols, n_expert_used, n_tokens]
+ c -> [cols, n_expert_used, n_tokens]
+
+ in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+ c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+ */
  struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
  struct ggml_tensor * as,
- struct ggml_tensor * ids,
- int id,
- struct ggml_tensor * b) {
-
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids) {
+ GGML_ASSERT(!ggml_is_transposed(as));
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+ GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+ GGML_ASSERT(b->ne[3] == 1); // b is 3d
  GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
- GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
- GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
- GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+ GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
  GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+ GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast

  bool is_node = false;

@@ -4595,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id(
  is_node = true;
  }

- const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+ const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

- ggml_set_op_params_i32(result, 0, id);
-
  result->op = GGML_OP_MUL_MAT_ID;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = as;
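
Note: with the reworked operator the routing tensor is passed directly and every selected expert is computed in one call; the result shape comes from { as->ne[1], ids->ne[0], b->ne[2], 1 }. A usage sketch with made-up mixture-of-experts dimensions follows (tensor names and sizes are invented, and ids/b would normally be filled by a router rather than left uninitialized).

    #include "ggml.h"

    void mul_mat_id_example(struct ggml_context * ctx) {
        const int64_t n_embd = 64, n_ff = 128, n_expert = 8;
        const int64_t n_expert_used = 2, n_tokens = 4;

        // as:  [n_embd, n_ff, n_expert]            - one weight matrix per expert
        // b:   [n_embd, n_expert_used, n_tokens]   - inputs for each selected expert slot
        // ids: [n_expert_used, n_tokens] (i32)     - which expert each slot routes to
        struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
        struct ggml_tensor * b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);
        struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);

        // c has shape { as->ne[1], ids->ne[0], b->ne[2], 1 } = [n_ff, n_expert_used, n_tokens]
        struct ggml_tensor * c = ggml_mul_mat_id(ctx, as, b, ids);
        (void) c;
    }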
@@ -10810,6 +10824,28 @@ static void ggml_compute_forward_mul_mat(
  }
  #endif

+ #if GGML_USE_LLAMAFILE
+ if (nb10 == ggml_type_size(src1->type)) {
+ for (int64_t i13 = 0; i13 < ne13; i13++)
+ for (int64_t i12 = 0; i12 < ne12; i12++)
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+ (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+ nb01/ggml_type_size(src0->type),
+ (const char *)src1->data + i12*nb12 + i13*nb13,
+ nb11/ggml_type_size(src1->type),
+ (char *)dst->data + i12*nb2 + i13*nb3,
+ nb1/ggml_type_size(dst->type),
+ ith, nth,
+ params->type,
+ src0->type,
+ src1->type,
+ dst->type))
+ goto UseGgmlGemm1;
+ return;
+ }
+ UseGgmlGemm1:;
+ #endif
+
  if (params->type == GGML_TASK_TYPE_INIT) {
  if (ith != 0) {
  return;
@@ -10841,6 +10877,30 @@ static void ggml_compute_forward_mul_mat(
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);

+ #if GGML_USE_LLAMAFILE
+ if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+ for (int64_t i13 = 0; i13 < ne13; i13++)
+ for (int64_t i12 = 0; i12 < ne12; i12++)
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+ (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+ nb01/ggml_type_size(src0->type),
+ (const char *)wdata + ggml_row_size(vec_dot_type,
+ nb12/ggml_type_size(src1->type)*i12 +
+ nb13/ggml_type_size(src1->type)*i13),
+ row_size/ggml_type_size(vec_dot_type),
+ (char *)dst->data + i12*nb2 + i13*nb3,
+ nb1/ggml_type_size(dst->type),
+ ith, nth,
+ params->type,
+ src0->type,
+ vec_dot_type,
+ dst->type))
+ goto UseGgmlGemm2;
+ return;
+ }
+ UseGgmlGemm2:;
+ #endif
+
  const int64_t nr0 = ne01; // src0 rows
  const int64_t nr1 = ne1*ne12*ne13; // src1 rows

@@ -10958,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id(
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

- GGML_ASSERT(ne0 == ne01);
- GGML_ASSERT(ne1 == ne11);
- GGML_ASSERT(ne2 == ne12);
- GGML_ASSERT(ne3 == ne13);
-
  // we don't support permuted src0 or src1
  GGML_ASSERT(nb00 == ggml_type_size(type));
  GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id(
  GGML_ASSERT(nb1 <= nb2);
  GGML_ASSERT(nb2 <= nb3);

- // broadcast is not supported with mmid
- assert(ne12 == 1);
- assert(ne13 == 1);
-
  // row groups
- const int id = ggml_get_op_params_i32(dst, 0);
- const int n_as = src0->ne[2];
+ const int n_ids = ids->ne[0]; // n_expert_used
+ const int n_as = ne02; // n_expert

  char * wdata_src1_end = (src1->type == vec_dot_type) ?
  (char *) params->wdata :
  (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

- int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
- int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
+ struct mmid_row_mapping {
+ int32_t i1;
+ int32_t i2;
+ };

- #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+ int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+ struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

  if (params->type == GGML_TASK_TYPE_INIT) {
  if (ith != 0) {
@@ -11012,16 +11066,20 @@ static void ggml_compute_forward_mul_mat_id(
  }

  // initialize matrix_row_counts
- GGML_ASSERT(wdata == wdata_src1_end);
  memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
  // group rows by src0 matrix
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+ for (int id = 0; id < n_ids; ++id) {
+ const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+ assert(i02 >= 0 && i02 < n_as);

- GGML_ASSERT(row_id >= 0 && row_id < n_as);
- MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
- matrix_row_counts[row_id] += 1;
+ MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+ matrix_row_counts[i02] += 1;
+ }
  }

  return;
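
Note: the CPU kernel now records, for every expert, how many rows were routed to it plus an (expert slot, token) pair per row, instead of a flat token index; this is what lets one token feed several experts in the same pass. A standalone sketch of the same grouping on plain arrays (layout and names are illustrative; ggml walks ids via its byte strides instead):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct mmid_row_mapping { int32_t i1; int32_t i2; };

    // Sketch of the grouping pass: ids is treated as a dense [n_tokens][n_ids]
    // array here, and rows is laid out [n_as][n_tokens] like the patch's
    // MMID_MATRIX_ROW(expert, k) = matrix_rows[expert*ne12 + k].
    static void group_rows_by_expert(const int32_t * ids, int n_ids, int n_tokens, int n_as,
                                     int64_t * counts, struct mmid_row_mapping * rows) {
        memset(counts, 0, (size_t) n_as * sizeof(int64_t));
        for (int t = 0; t < n_tokens; ++t) {
            for (int e = 0; e < n_ids; ++e) {
                const int32_t expert = ids[t * n_ids + e];
                rows[expert * n_tokens + counts[expert]] =
                    (struct mmid_row_mapping) { e, t }; // (expert slot, source token)
                counts[expert] += 1;
            }
        }
    }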
@@ -11039,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id(
  continue;
  }

- size_t src0_offset = cur_a*src0->nb[2];
+ const char * src0_cur = (const char *) src0->data + cur_a*nb02;

  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);

- const int64_t nr0 = ne01; // src0 rows
- const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
- //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+ const int64_t nr0 = ne01; // src0 rows
+ const int64_t nr1 = cne1; // src1 rows

  // distribute the thread work across the inner or outer loop based on which one is larger

@@ -11066,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id(
  const int64_t ir110 = dr1*ith1;
  const int64_t ir111 = MIN(ir110 + dr1, nr1);

- //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
  // threads with no work simply yield (not sure if it helps)
- if (ir010 >= ir011 || ir110 >= ir111) {
- sched_yield();
- continue;
- }
+ //if (ir010 >= ir011 || ir110 >= ir111) {
+ // sched_yield();
+ // continue;
+ //}

  // block-tiling attempt
  const int64_t blck_0 = 16;
@@ -11084,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id(
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
- const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
- const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
- const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
- const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+ const int64_t _i12 = ir1; // logical row index for this expert

- // broadcast src0 into src1
- //const int64_t i03 = i13/r3;
- //const int64_t i02 = i12/r2;
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+ const int id = row_mapping.i1; // selected expert index

- const int64_t i1 = i11;
- const int64_t i2 = i12;
- const int64_t i3 = i13;
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = row_mapping.i2; // row index in src1

- const char * src0_row = (const char *) src0->data + src0_offset;
+ const int64_t i1 = id; // selected expert index
+ const int64_t i2 = i12; // row

  // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
  // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11155,26 @@ static void ggml_compute_forward_mul_mat_id(
  // TODO: this is a bit of a hack, we should probably have a better way to handle this
  const char * src1_col = (const char *) wdata +
  (src1_cont || src1->type != vec_dot_type
- ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
- : (i11*nb11 + i12*nb12 + i13*nb13));
+ ? (i11 + i12*ne11)*row_size
+ : (i11*nb11 + i12*nb12));

- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));

  //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
  // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
  //}

  for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
- vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
  }
+
  memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
  }
  }
  }

- #undef MMID_MATRIX_ROW
+ #undef MMID_MATRIX_ROW
  }

  // ggml_compute_forward_out_prod
@@ -18462,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  const int n_as = src0->ne[2];
  cur += GGML_PAD(cur, sizeof(int64_t)); // align
  cur += n_as * sizeof(int64_t); // matrix_row_counts
- cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+ cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
  } break;
  case GGML_OP_OUT_PROD:
  {
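
Note: the MUL_MAT_ID workspace is now sized from src1->ne[2] (the token count) rather than src1->ne[1]. As a worked example with made-up sizes, n_as = 8 experts and src1->ne[2] = 32 tokens reserve 8 * sizeof(int64_t) = 64 bytes for matrix_row_counts plus 8 * 32 * sizeof(int64_t) = 2048 bytes for matrix_rows; each 8-byte matrix_rows slot holds one mmid_row_mapping (two int32 values), so the int64_t-based sizing still fits the new entry type.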
@@ -20550,6 +20601,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
  return ok;
  }

+ static void gguf_free_kv(struct gguf_kv * kv) {
+ if (kv->key.data) {
+ GGML_FREE(kv->key.data);
+ }
+
+ if (kv->type == GGUF_TYPE_STRING) {
+ if (kv->value.str.data) {
+ GGML_FREE(kv->value.str.data);
+ }
+ }
+
+ if (kv->type == GGUF_TYPE_ARRAY) {
+ if (kv->value.arr.data) {
+ if (kv->value.arr.type == GGUF_TYPE_STRING) {
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+ if (str->data) {
+ GGML_FREE(str->data);
+ }
+ }
+ }
+ GGML_FREE(kv->value.arr.data);
+ }
+ }
+ }
+
  struct gguf_context * gguf_init_empty(void) {
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

@@ -20862,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

  ok = ok && cur != NULL;

- ggml_set_name(cur, ctx->infos[i].name.data);
-
  if (!ok) {
  break;
  }

+ ggml_set_name(cur, ctx->infos[i].name.data);
+
  // point the data member to the appropriate location in the binary blob using the tensor infos
  if (!params.no_alloc) {
  //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20976,7 @@ void gguf_free(struct gguf_context * ctx) {
  if (ctx->kv) {
  // free string memory - not great..
  for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
- struct gguf_kv * kv = &ctx->kv[i];
-
- if (kv->key.data) {
- GGML_FREE(kv->key.data);
- }
-
- if (kv->type == GGUF_TYPE_STRING) {
- if (kv->value.str.data) {
- GGML_FREE(kv->value.str.data);
- }
- }
-
- if (kv->type == GGUF_TYPE_ARRAY) {
- if (kv->value.arr.data) {
- if (kv->value.arr.type == GGUF_TYPE_STRING) {
- for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
- struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
- if (str->data) {
- GGML_FREE(str->data);
- }
- }
- }
- GGML_FREE(kv->value.arr.data);
- }
- }
+ gguf_free_kv(&ctx->kv[i]);
  }

  GGML_FREE(ctx->kv);
@@ -21148,6 +21201,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
  return n_kv;
  }

+ void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+ const int idx = gguf_find_key(ctx, key);
+ if (idx >= 0) {
+ const int n_kv = gguf_get_n_kv(ctx);
+ gguf_free_kv(&ctx->kv[idx]);
+ for (int i = idx; i < n_kv-1; ++i) {
+ ctx->kv[i] = ctx->kv[i+1];
+ }
+ ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+ ctx->header.n_kv--;
+ }
+ }
+
  void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
  const int idx = gguf_get_or_add_key(ctx, key);

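Note: gguf_remove_key frees the stored value through the new gguf_free_kv helper, shifts the remaining entries down, shrinks the kv array, and decrements header.n_kv; removing a key that does not exist is a no-op. A hedged usage sketch follows (the file path and key name are placeholders):

    #include "ggml.h"
    #include <stdbool.h>
    #include <stddef.h>

    // Illustrative only: "model.gguf" and "general.name" are placeholders.
    void strip_metadata_example(void) {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == NULL) return;

        gguf_remove_key(ctx, "general.name");              // no-op if the key is absent
        const bool removed = gguf_find_key(ctx, "general.name") < 0;
        (void) removed;

        gguf_free(ctx);
    }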
@@ -1161,13 +1161,11 @@ extern "C" {
  enum ggml_prec prec);

  // indirect matrix multiplication
- // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
  struct ggml_tensor * as,
- struct ggml_tensor * ids,
- int id,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids);

  // A: m columns, n rows,
  // B: p columns, n rows,
@@ -2289,6 +2287,9 @@ extern "C" {
  GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
  GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);

+ // removes key if it exists
+ GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
  // overrides existing values or adds a new one
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
  GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);