llama_cpp 0.10.1 → 0.10.2

This diff shows the changes between these publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
@@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
 size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     size_t max_size = 0;
 
-    struct ggml_object * obj = ctx->objects_begin;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TENSOR) {
-            struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
-
-            const size_t size = ggml_nbytes(tensor);
-
-            if (max_size < size) {
-                max_size = size;
-            }
-        }
-
-        obj = obj->next;
+    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+        max_size = MAX(max_size, ggml_nbytes(tensor));
     }
 
     return max_size;
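
Note: the rewritten ggml_get_max_tensor_size above walks the context through the public iteration helpers instead of touching ggml_object nodes directly. A minimal sketch of the same iteration pattern in user code (the ggml_context * ctx is assumed to exist already):

    // sum the byte sizes of all tensors in a context, using the same helpers
    size_t total_bytes = 0;
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        total_bytes += ggml_nbytes(t); // accumulate instead of taking the max
    }
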
@@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
-struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
     char * const mem_buffer = ctx->mem_buffer;
@@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     return NULL;
 }
 
-struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
 
@@ -4098,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }
 
+void ggml_mul_mat_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec       prec) {
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 0, prec_i32);
+}
+
 // ggml_mul_mat_id
 
 struct ggml_tensor * ggml_mul_mat_id(
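
Note: the new ggml_mul_mat_set_prec stores the requested precision in op_params[0] of a GGML_OP_MUL_MAT node. A hedged usage sketch (w and x are placeholder tensors created elsewhere):

    struct ggml_tensor * cur = ggml_mul_mat(ctx, w, x);
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32); // request F32 accumulation for this matmul
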
@@ -4175,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
 static struct ggml_tensor * ggml_scale_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
+        float                 s,
         bool inplace) {
-    GGML_ASSERT(ggml_is_scalar(b));
     GGML_ASSERT(ggml_is_padded_1d(a));
 
     bool is_node = false;
 
-    if (a->grad || b->grad) {
+    if (a->grad) {
         is_node = true;
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_set_op_params(result, &s, sizeof(s));
+
     result->op   = GGML_OP_SCALE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = b;
 
     return result;
 }
@@ -4199,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
 struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_scale_impl(ctx, a, b, false);
+        float                 s) {
+    return ggml_scale_impl(ctx, a, s, false);
 }
 
 struct ggml_tensor * ggml_scale_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-       struct ggml_tensor  * b) {
-    return ggml_scale_impl(ctx, a, b, true);
+       float                 s) {
+    return ggml_scale_impl(ctx, a, s, true);
 }
 
 // ggml_set
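
Note: with the signature change above, the scale factor is now passed as a plain float and stored in op_params instead of being a separate scalar tensor in src[1]. A hedged before/after sketch of caller code (cur and kq_scale are illustrative names):

    // 0.10.1: the factor had to be allocated as a one-element tensor
    // cur = ggml_scale(ctx, cur, ggml_new_f32(ctx, kq_scale));

    // 0.10.2: the factor is passed directly
    cur = ggml_scale(ctx, cur, kq_scale);
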
@@ -9168,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9237,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9580,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
-// off1 = offset in i11 and i1
-// cne1 = ne11 and ne1
-// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
-// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-              struct ggml_tensor * dst,
-              int64_t off1, int64_t cne1) {
+              struct ggml_tensor * dst) {
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9657,9 +9652,9 @@ static void ggml_compute_forward_mul_mat(
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
-                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
-                      float * d = (float *) ((char *)  dst->data + off1*nb1  + i12*nb2  + i13*nb3);
+                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -9676,7 +9671,7 @@ static void ggml_compute_forward_mul_mat(
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        cne1, ne01, ne10,
+                        ne1, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
@@ -9717,8 +9712,8 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-    const int64_t nr0 = ne01;           // src0 rows
-    const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+    const int64_t nr0 = ne01;          // src0 rows
+    const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9760,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*cne1));
-                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
-                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
+                const int64_t i13 = (ir1/(ne12*ne1));
+                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
+                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
 
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ -9802,28 +9797,191 @@ static void ggml_compute_forward_mul_mat(
 
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
+        const struct ggml_tensor * ids,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
 
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
-        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
-        return;
-    }
+    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
 
-    const struct ggml_tensor * ids = src0;
+    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
+    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // row groups
     const int id   = ggml_get_op_params_i32(dst, 0);
     const int n_as = ggml_get_op_params_i32(dst, 1);
 
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+
+    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+
+    if (params->type == GGML_TASK_INIT) {
+        char * wdata = params->wdata;
+        if (src1->type != vec_dot_type) {
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+                        wdata += row_size;
+                    }
+                }
+            }
+        }
+
+        // initialize matrix_row_counts
+        GGML_ASSERT(wdata == wdata_src1_end);
+        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+        // group rows by src0 matrix
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
+            matrix_row_counts[row_id] += 1;
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // compute each matrix multiplication in sequence
+    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+        const int64_t cne1 = matrix_row_counts[cur_a];
+
+        if (cne1 == 0) {
+            continue;
+        }
+
+        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        const int64_t nr0 = ne01;           // src0 rows
+        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+
+        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+        // distribute the thread work across the inner or outer loop based on which one is larger
+
+        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+        const int64_t ith0 = ith % nth0;
+        const int64_t ith1 = ith / nth0;
+
+        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+        const int64_t ir010 = dr0*ith0;
+        const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+        const int64_t ir110 = dr1*ith1;
+        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+        // threads with no work simply yield (not sure if it helps)
+        if (ir010 >= ir011 || ir110 >= ir111) {
+            sched_yield();
+            continue;
+        }
+
+        assert(ne12 % ne02 == 0);
+        assert(ne13 % ne03 == 0);
+
+        // block-tiling attempt
+        const int64_t blck_0 = 16;
+        const int64_t blck_1 = 16;
+
+        // attempt to reduce false-sharing (does not seem to make a difference)
+        float tmp[16];
+
+        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                    const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
+                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
+                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
+                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
 
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+                    // broadcast src0 into src1
+                    const int64_t i03 = i13/r3;
+                    const int64_t i02 = i12/r2;
+
+                    const int64_t i1 = i11;
+                    const int64_t i2 = i12;
+                    const int64_t i3 = i13;
+
+                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
 
-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                    //       the original src1 data pointer, so we should index using the indices directly
+                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                    const char * src1_col = (const char *) wdata +
+                        (src1_cont || src1->type != vec_dot_type
+                         ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                         : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                    //}
+
+                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                        vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                    }
+                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+                }
+            }
+        }
     }
+
+    #undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
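
Note: the rewritten ggml_compute_forward_mul_mat_id no longer re-enters ggml_compute_forward_mul_mat once per row; during GGML_TASK_INIT it buckets the src1 rows by the expert selected in ids, then runs one threaded matmul per expert over just that expert's rows. A standalone sketch of the bucketing step with made-up sizes (4 experts, 8 rows):

    enum { N_AS = 4, NE11 = 8 };
    const int32_t ids_data[NE11] = {0, 2, 2, 1, 0, 3, 2, 0}; // expert id chosen for each src1 row
    int64_t matrix_row_counts[N_AS] = {0};
    int64_t matrix_rows[N_AS*NE11];                          // matrix_rows[e*NE11 + k] = k-th row routed to expert e
    for (int64_t i01 = 0; i01 < NE11; ++i01) {
        const int32_t row_id = ids_data[i01];
        matrix_rows[row_id*NE11 + matrix_row_counts[row_id]] = i01;
        matrix_row_counts[row_id] += 1;
    }
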
@@ -10167,19 +10325,17 @@ static void ggml_compute_forward_out_prod(
 static void ggml_compute_forward_scale_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
     // scale factor
-    const float v = *(float *) src1->data;
+    const float v = *(float *) dst->op_params;
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -10210,12 +10366,11 @@ static void ggml_compute_forward_scale_f32(
 static void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_scale_f32(params, src0, src1, dst);
+                ggml_compute_forward_scale_f32(params, src0, dst);
             } break;
         default:
             {
@@ -11404,10 +11559,13 @@ static void ggml_compute_forward_rope_f32(
                     }
                 } else {
                     // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                    // it seems we have to rope just the first n_dims elements and do nothing with the rest
+                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
                     theta_base *= freq_scale;
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                    for (int64_t ic = 0; ic < ne0; ic += 2) {
+                        if (ic < n_dims) {
+                            const int64_t ib = 0;
+
                             // simplified from `(ib * n_dims + ic) * inv_ndims`
                             float cur_rot = inv_ndims * ic - ib;
 
@@ -11430,6 +11588,14 @@ static void ggml_compute_forward_rope_f32(
 
                             dst_data[0]        = x0*cos_theta - x1*sin_theta;
                             dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                        } else {
+                            const int64_t i0 = ic;
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                                  float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                            dst_data[0] = src[0];
+                            dst_data[1] = src[1];
                         }
                     }
                 }
@@ -11557,10 +11723,13 @@ static void ggml_compute_forward_rope_f16(
                     }
                 } else {
                     // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                    // it seems we have to rope just the first n_dims elements and do nothing with the rest
+                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
                     theta_base *= freq_scale;
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                    for (int64_t ic = 0; ic < ne0; ic += 2) {
+                        if (ic < n_dims) {
+                            const int64_t ib = 0;
+
                             // simplified from `(ib * n_dims + ic) * inv_ndims`
                             float cur_rot = inv_ndims * ic - ib;
 
@@ -11583,6 +11752,14 @@ static void ggml_compute_forward_rope_f16(
 
                             dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                             dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        } else {
+                            const int64_t i0 = ic;
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                                  ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                            dst_data[0] = src[0];
+                            dst_data[1] = src[1];
                         }
                     }
                 }
@@ -14191,7 +14368,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -14203,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SCALE:
             {
-                ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_scale(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SET:
             {
@@ -14659,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
 
 static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
+        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
     } else {
         return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14795,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             src0->grad,
                             ggml_scale(ctx,
                                 ggml_mul(ctx, src0, tensor->grad),
-                                ggml_new_f32(ctx, 2.0f)),
+                                2.0f),
                             zero_table);
                 }
             } break;
@@ -14809,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 ggml_div(ctx,
                                     tensor->grad,
                                     tensor),
-                                ggml_new_f32(ctx, 0.5f)),
+                                0.5f),
                             zero_table);
                 }
             } break;
@@ -14975,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
+                    const float s = ((float *) tensor->op_params)[0];
+
                     src0->grad =
                         ggml_add_or_set(ctx,
                             src0->grad,
-                            ggml_scale_impl(ctx, tensor->grad, src1, false),
-                            zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+                            ggml_scale_impl(ctx, tensor->grad, s, false),
                             zero_table);
                 }
             } break;
@@ -15163,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_past = ((int32_t *) tensor->op_params)[0];
                 src0->grad =
                     ggml_add_or_set(ctx, src0->grad,
+                        /* ggml_diag_mask_inf_impl() shouldn't be here */
+                        /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                         ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                     zero_table);
             }
@@ -15991,7 +16165,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                // FIXME: blas
                 n_tasks = n_threads;
             } break;
         case GGML_OP_OUT_PROD:
@@ -16325,20 +16498,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_MUL_MAT_ID:
                 {
-                    const struct ggml_tensor * a = node->src[2];
-                    const struct ggml_tensor * b = node->src[1];
-                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
-                        if (a->type != GGML_TYPE_F32) {
-                            // here we need memory just for single 2D matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
-                        }
-                    } else
-#endif
-                    if (b->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
+                    const struct ggml_tensor * src0 = node->src[2];
+                    const struct ggml_tensor * src1 = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+                    if (src1->type != vec_dot_type) {
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
+                    const int n_as = ggml_get_op_params_i32(node, 1);
+                    cur = GGML_PAD(cur, sizeof(int64_t));        // align
+                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
+                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
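
Note: the new work-size formula for GGML_OP_MUL_MAT_ID reserves, on top of the optional src1 conversion buffer, the per-expert row bookkeeping used by the rewritten kernel above. As a rough worked example (all values assumed): with n_as = 8 experts, src1 of shape 4096 x 32 in F32, and vec_dot_type = GGML_TYPE_Q8_0 (34-byte blocks of 32 values), the conversion buffer is 32 * (4096/32) * 34 = 139,264 bytes, padding to sizeof(int64_t) adds nothing here, matrix_row_counts adds 8 * 8 = 64 bytes, and matrix_rows adds 8 * 32 * 8 = 2,048 bytes, for roughly 141 KiB of scratch per node.
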
@@ -19026,6 +19195,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
 
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+    return ctx->infos[i].type;
+}
+
 // returns the index
 static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int idx = gguf_find_key(ctx, key);
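
Note: the new gguf_get_tensor_type accessor complements gguf_get_n_tensors and gguf_get_tensor_name. A hedged sketch of listing the tensors of an already-parsed gguf_context * gctx:

    for (int i = 0; i < gguf_get_n_tensors(gctx); ++i) {
        // print each tensor's name and its on-disk ggml type
        printf("%-40s %s\n", gguf_get_tensor_name(gctx, i), ggml_type_name(gguf_get_tensor_type(gctx, i)));
    }
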