llama_cpp 0.14.5 → 0.14.6

@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }

  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
+ #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
  const __m256i zero = _mm256_setzero_si256();
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
  return _mm256_cvtepi32_ps(summed_pairs);
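The tightened guard reflects that the 256-bit _mm256_dpbusd_epi32 intrinsic is only available with AVX-VNNI, or with AVX512VNNI paired with AVX512VL. As a hedged sketch (not shown in this hunk), the non-VNNI path that such a guard typically falls back to builds the dot product from plain AVX2 intrinsics and the sum_i16_pairs_float helper named in the hunk header; the function name below is illustrative:

// Sketch only: AVX2 fallback when neither AVX-VNNI nor AVX512VNNI+AVX512VL is available.
// _mm256_maddubs_epi16 multiplies u8 by s8 and adds adjacent products into i16 lanes;
// sum_i16_pairs_float (declared in the hunk header above) widens those pairs to f32 sums.
static inline __m256 mul_sum_us8_pairs_float_fallback(const __m256i ax, const __m256i sy) {
    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
    return sum_i16_pairs_float(dot);
}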
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
  #define SYCL_SCALE_BLOCK_SIZE 256
  #define SYCL_CLAMP_BLOCK_SIZE 256
  #define SYCL_ROPE_BLOCK_SIZE 256
- #define SYCL_SOFT_MAX_BLOCK_SIZE 1024
  #define SYCL_ALIBI_BLOCK_SIZE 32
  #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
  #define SYCL_QUANTIZE_BLOCK_SIZE 256
@@ -13080,11 +13079,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
  const int nrows_y, const float scale, const float max_bias,
  dpct::queue_ptr stream) {
  int nth = WARP_SIZE;
- while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ int max_block_size = g_work_group_size;
+ while (nth < ncols_x && nth < max_block_size) nth *= 2;
+ if (nth>max_block_size) nth = max_block_size;
+
  const sycl::range<3> block_dims(1, 1, nth);
  const sycl::range<3> block_nums(1, 1, nrows_x);
  const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
- static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

  const uint32_t n_head_kv = nrows_x/nrows_y;
  const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@@ -13094,6 +13095,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *

  const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
  if (n_local_scratch*sizeof(float) < local_mem_size) {
+ if (ncols_x > max_block_size) {
+ soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ return;
+ }
  switch (ncols_x) {
  case 32:
  soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
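The two hunks above replace the fixed SYCL_SOFT_MAX_BLOCK_SIZE cap with the device's actual work-group limit (g_work_group_size) and route rows wider than that limit to the generic soft_max_f32_submitter<true, 0, 0> instantiation. A standalone sketch of the block-size selection, with a hypothetical helper name and example values:

// Sketch of the block-size selection introduced above: nth doubles from the warp
// size until it covers the row width or reaches the device's work-group limit,
// then is clamped to that limit.
static int pick_soft_max_block_size(int ncols_x, int warp_size, int max_block_size) {
    int nth = warp_size;
    while (nth < ncols_x && nth < max_block_size) nth *= 2;
    if (nth > max_block_size) nth = max_block_size;
    return nth; // e.g. ncols_x = 4096, warp_size = 32, max_block_size = 1024 -> 1024
}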
@@ -15989,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
  static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  const ggml_tensor *src1,
  ggml_tensor *dst) try {
- #if 0
- ggml_sycl_mul_mat_id_sycl(dst);
- // TODO: mmq/mmv support
- #endif
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
+ "mul_mat_id does not support split buffers");
+ const ggml_tensor *ids = dst->src[2];
+ const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];

- const int64_t nb11 = src1->nb[1];
- const int64_t nb1 = dst->nb[1];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb1 = dst->nb[1];

- const struct ggml_tensor * ids = src0;
- const int32_t id = ((int32_t *) dst->op_params)[0];
- const int32_t n_as = ((int32_t *) dst->op_params)[1];
+ const int32_t id = ((int32_t *)dst->op_params)[0];
+ const int32_t n_as = src0->ne[2];

  std::vector<char> ids_host(ggml_nbytes(ids));
+ const char *ids_dev = (const char *)ids->data;

- const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
-
- if (ids->backend == GGML_BACKEND_TYPE_GPU) {
- const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
- SYCL_CHECK(CHECK_TRY_ERROR(
- stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait()));
- // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
- } else {
- memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
- }
+ SYCL_CHECK(CHECK_TRY_ERROR(
+ stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+ SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));

- const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
- const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+ const ggml_tensor_extra_gpu *src0_extra =
+ (const ggml_tensor_extra_gpu *)src0->extra;
+ const ggml_tensor_extra_gpu *src1_extra =
+ (const ggml_tensor_extra_gpu *)src1->extra;
+ const ggml_tensor_extra_gpu *dst_extra =
+ (const ggml_tensor_extra_gpu *)dst->extra;

+ ggml_tensor_extra_gpu src0_row_extra;
  ggml_tensor_extra_gpu src1_row_extra;
  ggml_tensor_extra_gpu dst_row_extra;

+ ggml_tensor src0_row = *src0;
  ggml_tensor src1_row = *src1;
  ggml_tensor dst_row = *dst;

  src1_row.backend = GGML_BACKEND_TYPE_GPU;
  dst_row.backend = GGML_BACKEND_TYPE_GPU;

+ src0_row.extra = &src0_row_extra;
  src1_row.extra = &src1_row_extra;
  dst_row.extra = &dst_row_extra;

- char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
- (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
- char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
- (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+ char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
+ ? (char *)src0->data
+ : (char *)src0_extra->data_device[g_main_device];
+ char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
+ ? (char *)src1->data
+ : (char *)src1_extra->data_device[g_main_device];
+ char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
+ ? (char *)dst->data
+ : (char *)dst_extra->data_device[g_main_device];

- if (src1->ne[1] == 1) {
- GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
- GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
+ src0_row.ne[2] = 1;
+ src0_row.ne[3] = 1;
+ src0_row.nb[3] = src0->nb[2];

+ if (src1->ne[1] == 1) {
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- //int32_t row_id;
- //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0]));
- //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0]));
-
- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+ const int32_t row_id =
+ *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
+ id * ids->nb[0]);

  GGML_ASSERT(row_id >= 0 && row_id < n_as);

- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
- src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
- src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+ src0_row_extra.data_device[g_main_device] =
+ src0_original + row_id * src0->nb[2];
+ src1_row_extra.data_device[g_main_device] =
+ src1_original + i01 * src1->nb[1];
+ dst_row_extra.data_device[g_main_device] =
+ dst_original + i01 * dst->nb[1];

- dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
- dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
- ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+ ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
  }
  } else {
  sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
@@ -16065,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  dst_row_extra.data_device[g_main_device] = dst_contiguous.get();

  for (int32_t row_id = 0; row_id < n_as; ++row_id) {
- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
  int64_t num_src1_rows = 0;
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
  const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -16079,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

  SYCL_CHECK(CHECK_TRY_ERROR(
  stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
- src1_original + i01 * nb11, nb11).wait()));
+ src1_original + i01 * nb11, nb11)));
  num_src1_rows++;
  }

@@ -16087,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  continue;
  }

+ src0_row_extra.data_device[g_main_device] =
+ src0_original + row_id * src0->nb[2];
+
  src1_row.ne[1] = num_src1_rows;
  dst_row.ne[1] = num_src1_rows;

@@ -16098,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  dst_row.nb[2] = num_src1_rows*nb1;
  dst_row.nb[3] = num_src1_rows*nb1;

- ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+ ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);

  num_src1_rows = 0;
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -16112,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

  SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
  dst_original + i01 * nb1,
- dst_contiguous.get() + num_src1_rows * nb1, nb1).wait()));
+ dst_contiguous.get() + num_src1_rows * nb1, nb1)));
  num_src1_rows++;
  }
  }
@@ -16814,11 +16825,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
  const dpct::queue_ptr stream = g_syclStreams[ctx->device][0];
  SYCL_CHECK(
  CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
-
+ char* host_buf = (char*)malloc(size);
+ memcpy(host_buf, data, size);
  SYCL_CHECK(
  CHECK_TRY_ERROR((*stream)
- .memcpy((char *)tensor->data + offset, data, size)
+ .memcpy((char *)tensor->data + offset, host_buf, size)
  .wait()));
+ free(host_buf);
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -17739,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons

  GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
  const int min_batch_size = 32;
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+ return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
  GGML_UNUSED(backend);
  }

@@ -4,6 +4,7 @@
  #include "ggml-impl.h"
  #include "ggml-quants.h"
  #include "ggml.h"
+ #include "sgemm.h"

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
  #include <unistd.h>
  #endif

+ #ifdef __ARM_FEATURE_MATMUL_INT8
+ #undef GGML_USE_LLAMAFILE
+ #endif
+
  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid hundreds of casts
  // we should just be careful :)
@@ -4573,21 +4578,32 @@ void ggml_mul_mat_set_prec(

  // ggml_mul_mat_id

- // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
- // this will allow computing all the used experts in a single matrix multiplication
+ /*
+ c = ggml_mul_mat_id(ctx, as, b, ids);
+
+ as -> [cols, rows, n_expert]
+ ids -> [n_experts_used, n_tokens] (i32)
+ b -> [cols, n_expert_used, n_tokens]
+ c -> [cols, n_expert_used, n_tokens]
+
+ in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+ c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+ */
  struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
  struct ggml_tensor * as,
- struct ggml_tensor * ids,
- int id,
- struct ggml_tensor * b) {
-
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids) {
+ GGML_ASSERT(!ggml_is_transposed(as));
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+ GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+ GGML_ASSERT(b->ne[3] == 1); // b is 3d
  GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
- GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
- GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
- GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+ GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
  GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+ GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast

  bool is_node = false;

@@ -4595,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id(
  is_node = true;
  }

- const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+ const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

- ggml_set_op_params_i32(result, 0, id);
-
  result->op = GGML_OP_MUL_MAT_ID;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = as;
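These hunks change the ggml_mul_mat_id signature: the per-call id argument is gone, ids moves to the last position, and the single op now computes all experts listed in ids. A hedged sketch of a call under the new signature follows; the dimensions and the already-initialized ggml_context named ctx are assumptions for illustration only:

// Illustrative MoE routing call with the new signature (all sizes are made up).
const int64_t n_embd = 4096, n_ff = 14336;
const int64_t n_expert = 8, n_expert_used = 2, n_tokens = 32;

struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);  // one weight matrix per expert
struct ggml_tensor * b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens);     // one row per token, broadcast over the used experts
struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens); // expert indices per token

// c has shape [n_ff, n_expert_used, n_tokens], matching
// ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 } in the hunk above.
struct ggml_tensor * c = ggml_mul_mat_id(ctx, as, b, ids);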
@@ -10810,6 +10824,28 @@ static void ggml_compute_forward_mul_mat(
  }
  #endif

+ #if GGML_USE_LLAMAFILE
+ if (nb10 == ggml_type_size(src1->type)) {
+ for (int64_t i13 = 0; i13 < ne13; i13++)
+ for (int64_t i12 = 0; i12 < ne12; i12++)
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+ (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+ nb01/ggml_type_size(src0->type),
+ (const char *)src1->data + i12*nb12 + i13*nb13,
+ nb11/ggml_type_size(src1->type),
+ (char *)dst->data + i12*nb2 + i13*nb3,
+ nb1/ggml_type_size(dst->type),
+ ith, nth,
+ params->type,
+ src0->type,
+ src1->type,
+ dst->type))
+ goto UseGgmlGemm1;
+ return;
+ }
+ UseGgmlGemm1:;
+ #endif
+
  if (params->type == GGML_TASK_TYPE_INIT) {
  if (ith != 0) {
  return;
@@ -10841,6 +10877,30 @@ static void ggml_compute_forward_mul_mat(
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);

+ #if GGML_USE_LLAMAFILE
+ if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+ for (int64_t i13 = 0; i13 < ne13; i13++)
+ for (int64_t i12 = 0; i12 < ne12; i12++)
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+ (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+ nb01/ggml_type_size(src0->type),
+ (const char *)wdata + ggml_row_size(vec_dot_type,
+ nb12/ggml_type_size(src1->type)*i12 +
+ nb13/ggml_type_size(src1->type)*i13),
+ row_size/ggml_type_size(vec_dot_type),
+ (char *)dst->data + i12*nb2 + i13*nb3,
+ nb1/ggml_type_size(dst->type),
+ ith, nth,
+ params->type,
+ src0->type,
+ vec_dot_type,
+ dst->type))
+ goto UseGgmlGemm2;
+ return;
+ }
+ UseGgmlGemm2:;
+ #endif
+
  const int64_t nr0 = ne01; // src0 rows
  const int64_t nr1 = ne1*ne12*ne13; // src1 rows

@@ -10958,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id(
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

- GGML_ASSERT(ne0 == ne01);
- GGML_ASSERT(ne1 == ne11);
- GGML_ASSERT(ne2 == ne12);
- GGML_ASSERT(ne3 == ne13);
-
  // we don't support permuted src0 or src1
  GGML_ASSERT(nb00 == ggml_type_size(type));
  GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id(
  GGML_ASSERT(nb1 <= nb2);
  GGML_ASSERT(nb2 <= nb3);

- // broadcast is not supported with mmid
- assert(ne12 == 1);
- assert(ne13 == 1);
-
  // row groups
- const int id = ggml_get_op_params_i32(dst, 0);
- const int n_as = src0->ne[2];
+ const int n_ids = ids->ne[0]; // n_expert_used
+ const int n_as = ne02; // n_expert

  char * wdata_src1_end = (src1->type == vec_dot_type) ?
  (char *) params->wdata :
  (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

- int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
- int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
+ struct mmid_row_mapping {
+ int32_t i1;
+ int32_t i2;
+ };

- #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+ int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+ struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

  if (params->type == GGML_TASK_TYPE_INIT) {
  if (ith != 0) {
@@ -11012,16 +11066,20 @@ static void ggml_compute_forward_mul_mat_id(
  }

  // initialize matrix_row_counts
- GGML_ASSERT(wdata == wdata_src1_end);
  memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
  // group rows by src0 matrix
- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+ for (int id = 0; id < n_ids; ++id) {
+ const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+ assert(i02 >= 0 && i02 < n_as);

- GGML_ASSERT(row_id >= 0 && row_id < n_as);
- MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
- matrix_row_counts[row_id] += 1;
+ MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+ matrix_row_counts[i02] += 1;
+ }
  }

  return;
@@ -11039,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id(
  continue;
  }

- size_t src0_offset = cur_a*src0->nb[2];
+ const char * src0_cur = (const char *) src0->data + cur_a*nb02;

  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);

- const int64_t nr0 = ne01; // src0 rows
- const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
- //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+ const int64_t nr0 = ne01; // src0 rows
+ const int64_t nr1 = cne1; // src1 rows

  // distribute the thread work across the inner or outer loop based on which one is larger

@@ -11066,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id(
  const int64_t ir110 = dr1*ith1;
  const int64_t ir111 = MIN(ir110 + dr1, nr1);

- //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
  // threads with no work simply yield (not sure if it helps)
- if (ir010 >= ir011 || ir110 >= ir111) {
- sched_yield();
- continue;
- }
+ //if (ir010 >= ir011 || ir110 >= ir111) {
+ // sched_yield();
+ // continue;
+ //}

  // block-tiling attempt
  const int64_t blck_0 = 16;
@@ -11084,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id(
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
- const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
- const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
- const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
- const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+ const int64_t _i12 = ir1; // logical row index for this expert

- // broadcast src0 into src1
- //const int64_t i03 = i13/r3;
- //const int64_t i02 = i12/r2;
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+ const int id = row_mapping.i1; // selected expert index

- const int64_t i1 = i11;
- const int64_t i2 = i12;
- const int64_t i3 = i13;
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = row_mapping.i2; // row index in src1

- const char * src0_row = (const char *) src0->data + src0_offset;
+ const int64_t i1 = id; // selected expert index
+ const int64_t i2 = i12; // row

  // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
  // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11155,26 @@ static void ggml_compute_forward_mul_mat_id(
  // TODO: this is a bit of a hack, we should probably have a better way to handle this
  const char * src1_col = (const char *) wdata +
  (src1_cont || src1->type != vec_dot_type
- ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
- : (i11*nb11 + i12*nb12 + i13*nb13));
+ ? (i11 + i12*ne11)*row_size
+ : (i11*nb11 + i12*nb12));

- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));

  //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
  // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
  //}

  for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
- vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
  }
+
  memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
  }
  }
  }

- #undef MMID_MATRIX_ROW
+ #undef MMID_MATRIX_ROW
  }

  // ggml_compute_forward_out_prod
@@ -18462,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  const int n_as = src0->ne[2];
  cur += GGML_PAD(cur, sizeof(int64_t)); // align
  cur += n_as * sizeof(int64_t); // matrix_row_counts
- cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+ cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
  } break;
  case GGML_OP_OUT_PROD:
  {
@@ -20550,6 +20601,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
  return ok;
  }

+ static void gguf_free_kv(struct gguf_kv * kv) {
+ if (kv->key.data) {
+ GGML_FREE(kv->key.data);
+ }
+
+ if (kv->type == GGUF_TYPE_STRING) {
+ if (kv->value.str.data) {
+ GGML_FREE(kv->value.str.data);
+ }
+ }
+
+ if (kv->type == GGUF_TYPE_ARRAY) {
+ if (kv->value.arr.data) {
+ if (kv->value.arr.type == GGUF_TYPE_STRING) {
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+ if (str->data) {
+ GGML_FREE(str->data);
+ }
+ }
+ }
+ GGML_FREE(kv->value.arr.data);
+ }
+ }
+ }
+
  struct gguf_context * gguf_init_empty(void) {
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

@@ -20862,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

  ok = ok && cur != NULL;

- ggml_set_name(cur, ctx->infos[i].name.data);
-
  if (!ok) {
  break;
  }

+ ggml_set_name(cur, ctx->infos[i].name.data);
+
  // point the data member to the appropriate location in the binary blob using the tensor infos
  if (!params.no_alloc) {
  //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20976,7 @@ void gguf_free(struct gguf_context * ctx) {
  if (ctx->kv) {
  // free string memory - not great..
  for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
- struct gguf_kv * kv = &ctx->kv[i];
-
- if (kv->key.data) {
- GGML_FREE(kv->key.data);
- }
-
- if (kv->type == GGUF_TYPE_STRING) {
- if (kv->value.str.data) {
- GGML_FREE(kv->value.str.data);
- }
- }
-
- if (kv->type == GGUF_TYPE_ARRAY) {
- if (kv->value.arr.data) {
- if (kv->value.arr.type == GGUF_TYPE_STRING) {
- for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
- struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
- if (str->data) {
- GGML_FREE(str->data);
- }
- }
- }
- GGML_FREE(kv->value.arr.data);
- }
- }
+ gguf_free_kv(&ctx->kv[i]);
  }

  GGML_FREE(ctx->kv);
@@ -21148,6 +21201,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
  return n_kv;
  }

+ void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+ const int idx = gguf_find_key(ctx, key);
+ if (idx >= 0) {
+ const int n_kv = gguf_get_n_kv(ctx);
+ gguf_free_kv(&ctx->kv[idx]);
+ for (int i = idx; i < n_kv-1; ++i) {
+ ctx->kv[i] = ctx->kv[i+1];
+ }
+ ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+ ctx->header.n_kv--;
+ }
+ }
+
  void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
  const int idx = gguf_get_or_add_key(ctx, key);
 
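The new gguf_remove_key drops a key/value pair (freeing its storage via gguf_free_kv) when the key exists and is a no-op otherwise. A hedged usage sketch; the file names and the metadata key below are placeholders only:

// Sketch: load a GGUF file's metadata, drop a key if present, and write the
// updated metadata to a new file.
struct gguf_init_params params = { .no_alloc = true, .ctx = NULL };
struct gguf_context * gctx = gguf_init_from_file("model.gguf", params);
if (gctx) {
    gguf_remove_key(gctx, "general.source.url");                    // no-op if the key is absent
    gguf_write_to_file(gctx, "model-meta.gguf", /*only_meta=*/true); // writes only the metadata section
    gguf_free(gctx);
}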
@@ -1161,13 +1161,11 @@ extern "C" {
  enum ggml_prec prec);

  // indirect matrix multiplication
- // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
  struct ggml_tensor * as,
- struct ggml_tensor * ids,
- int id,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids);

  // A: m columns, n rows,
  // B: p columns, n rows,
@@ -2289,6 +2287,9 @@ extern "C" {
  GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
  GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);

+ // removes key if it exists
+ GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
  // overrides existing values or adds a new one
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
  GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);