llama_cpp 0.10.1 → 0.10.2

@@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
  size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
  size_t max_size = 0;

- struct ggml_object * obj = ctx->objects_begin;
-
- while (obj != NULL) {
- if (obj->type == GGML_OBJECT_TENSOR) {
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
-
- const size_t size = ggml_nbytes(tensor);
-
- if (max_size < size) {
- max_size = size;
- }
- }
-
- obj = obj->next;
+ for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+ max_size = MAX(max_size, ggml_nbytes(tensor));
  }

  return max_size;
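
For reference, a minimal sketch (not part of the diff) of walking every tensor in a context with the iterator pair that ggml_get_max_tensor_size() now relies on; ggml_get_first_tensor() and ggml_get_next_tensor() accept a const context as of this change. The helper name and printed format are illustrative only.

#include <stdio.h>
#include "ggml.h"

// illustrative helper: print each tensor and return the largest allocation,
// mirroring what ggml_get_max_tensor_size() does internally
static size_t print_tensor_sizes(const struct ggml_context * ctx) {
    size_t max_size = 0;
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        const size_t size = ggml_nbytes(t);
        printf("%-24s %zu bytes\n", ggml_get_name(t), size);
        if (size > max_size) {
            max_size = size;
        }
    }
    return max_size;
}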
@@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
  return result;
  }

- struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
  struct ggml_object * obj = ctx->objects_begin;

  char * const mem_buffer = ctx->mem_buffer;
@@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
  return NULL;
  }

- struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
  struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
  obj = obj->next;

@@ -4098,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat(
  return result;
  }

+ void ggml_mul_mat_set_prec(
+ struct ggml_tensor * a,
+ enum ggml_prec prec) {
+ const int32_t prec_i32 = (int32_t) prec;
+
+ ggml_set_op_params_i32(a, 0, prec_i32);
+ }
+
  // ggml_mul_mat_id

  struct ggml_tensor * ggml_mul_mat_id(
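
A hedged usage sketch for the new ggml_mul_mat_set_prec(): it records the requested precision in the op params of a GGML_OP_MUL_MAT node so a backend can accumulate in F32. It assumes the enum values GGML_PREC_DEFAULT/GGML_PREC_F32 from this release's ggml.h; ctx, a and b are placeholder tensors that must already be mul_mat-compatible.

// build the matrix multiplication node as usual
struct ggml_tensor * kq = ggml_mul_mat(ctx, a, b);

// ask for full F32 accumulation on this node
// (GGML_PREC_DEFAULT keeps the backend's default behaviour)
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);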
@@ -4175,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
  static struct ggml_tensor * ggml_scale_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b,
+ float s,
  bool inplace) {
- GGML_ASSERT(ggml_is_scalar(b));
  GGML_ASSERT(ggml_is_padded_1d(a));

  bool is_node = false;

- if (a->grad || b->grad) {
+ if (a->grad) {
  is_node = true;
  }

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

+ ggml_set_op_params(result, &s, sizeof(s));
+
  result->op = GGML_OP_SCALE;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
- result->src[1] = b;

  return result;
  }
@@ -4199,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
  struct ggml_tensor * ggml_scale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b) {
- return ggml_scale_impl(ctx, a, b, false);
+ float s) {
+ return ggml_scale_impl(ctx, a, s, false);
  }

  struct ggml_tensor * ggml_scale_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b) {
- return ggml_scale_impl(ctx, a, b, true);
+ float s) {
+ return ggml_scale_impl(ctx, a, s, true);
  }

  // ggml_set
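
ggml_scale() and ggml_scale_inplace() now take a plain float instead of a 1-element tensor, so callers no longer allocate a ggml_new_f32() node for the factor. A before/after sketch with placeholder names (kq, n_embd_head); sqrtf() comes from <math.h>:

// 0.10.1 and earlier (scale factor passed as a scalar tensor):
// kq = ggml_scale(ctx, kq, ggml_new_f32(ctx, 1.0f/sqrtf((float) n_embd_head)));

// 0.10.2 (scale factor passed directly as a float):
kq = ggml_scale(ctx, kq, 1.0f/sqrtf((float) n_embd_head));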
@@ -9168,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
  float eps;
  memcpy(&eps, dst->op_params, sizeof(float));

+ GGML_ASSERT(eps > 0.0f);
+
  // TODO: optimize
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9237,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
  float eps;
  memcpy(&eps, dst->op_params, sizeof(float));

+ GGML_ASSERT(eps > 0.0f);
+
  // TODO: optimize
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9580,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
  }
  #endif

- // off1 = offset in i11 and i1
- // cne1 = ne11 and ne1
- // in a normal matrix multiplication, off1 = 0 and cne1 = ne1
- // during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
  static void ggml_compute_forward_mul_mat(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
- struct ggml_tensor * dst,
- int64_t off1, int64_t cne1) {
+ struct ggml_tensor * dst) {
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

@@ -9657,9 +9652,9 @@ static void ggml_compute_forward_mul_mat(
  const int64_t i03 = i13/r3;
  const int64_t i02 = i12/r2;

- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
- float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

  if (type != GGML_TYPE_F32) {
  float * const wdata = params->wdata;
@@ -9676,7 +9671,7 @@ static void ggml_compute_forward_mul_mat(
  }

  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
- cne1, ne01, ne10,
+ ne1, ne01, ne10,
  1.0f, y, ne10,
  x, ne00,
  0.0f, d, ne01);
@@ -9717,8 +9712,8 @@ static void ggml_compute_forward_mul_mat(
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);

- const int64_t nr0 = ne01; // src0 rows
- const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+ const int64_t nr0 = ne01; // src0 rows
+ const int64_t nr1 = ne1*ne12*ne13; // src1 rows

  //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

@@ -9760,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
  for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
- const int64_t i13 = (ir1/(ne12*cne1));
- const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
- const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
+ const int64_t i13 = (ir1/(ne12*ne1));
+ const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
+ const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);

  // broadcast src0 into src1
  const int64_t i03 = i13/r3;
@@ -9802,28 +9797,191 @@ static void ggml_compute_forward_mul_mat(

  static void ggml_compute_forward_mul_mat_id(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
+ const struct ggml_tensor * ids,
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {

- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
- // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
- ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
- return;
- }
+ const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const enum ggml_type type = src0->type;
+
+ const bool src1_cont = ggml_is_contiguous(src1);

- const struct ggml_tensor * ids = src0;
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+
+ GGML_ASSERT(ne0 == ne01);
+ GGML_ASSERT(ne1 == ne11);
+ GGML_ASSERT(ne2 == ne12);
+ GGML_ASSERT(ne3 == ne13);
+
+ // we don't support permuted src0 or src1
+ GGML_ASSERT(nb00 == ggml_type_size(type));
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+ // dst cannot be transposed or permuted
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb0 <= nb1);
+ GGML_ASSERT(nb1 <= nb2);
+ GGML_ASSERT(nb2 <= nb3);
+
+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;
+
+ // row groups
  const int id = ggml_get_op_params_i32(dst, 0);
  const int n_as = ggml_get_op_params_i32(dst, 1);

- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+ char * wdata_src1_end = (src1->type == vec_dot_type) ?
+ (char *) params->wdata :
+ (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+ int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+ int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
+
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+
+ if (params->type == GGML_TASK_INIT) {
+ char * wdata = params->wdata;
+ if (src1->type != vec_dot_type) {
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+ assert(params->wsize >= ne11*ne12*ne13*row_size);
+ assert(src1->type == GGML_TYPE_F32);
+
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+ wdata += row_size;
+ }
+ }
+ }
+ }
+
+ // initialize matrix_row_counts
+ GGML_ASSERT(wdata == wdata_src1_end);
+ memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+ // group rows by src0 matrix
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
+ matrix_row_counts[row_id] += 1;
+ }
+
+ return;
+ }
+
+ if (params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // compute each matrix multiplication in sequence
+ for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+ const int64_t cne1 = matrix_row_counts[cur_a];
+
+ if (cne1 == 0) {
+ continue;
+ }
+
+ const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+ const int64_t nr0 = ne01; // src0 rows
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+
+ //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+ // distribute the thread work across the inner or outer loop based on which one is larger
+
+ const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+ const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+ const int64_t ith0 = ith % nth0;
+ const int64_t ith1 = ith / nth0;
+
+ const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+ const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+ const int64_t ir010 = dr0*ith0;
+ const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+ const int64_t ir110 = dr1*ith1;
+ const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+ //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+ // threads with no work simply yield (not sure if it helps)
+ if (ir010 >= ir011 || ir110 >= ir111) {
+ sched_yield();
+ continue;
+ }
+
+ assert(ne12 % ne02 == 0);
+ assert(ne13 % ne03 == 0);
+
+ // block-tiling attempt
+ const int64_t blck_0 = 16;
+ const int64_t blck_1 = 16;
+
+ // attempt to reduce false-sharing (does not seem to make a difference)
+ float tmp[16];
+
+ for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+ for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+ const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+ const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
+ const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+
+ // broadcast src0 into src1
+ const int64_t i03 = i13/r3;
+ const int64_t i02 = i12/r2;
+
+ const int64_t i1 = i11;
+ const int64_t i2 = i12;
+ const int64_t i3 = i13;

- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);

- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
- ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+ // the original src1 data pointer, so we should index using the indices directly
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
+ const char * src1_col = (const char *) wdata +
+ (src1_cont || src1->type != vec_dot_type
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+ : (i11*nb11 + i12*nb12 + i13*nb13));
+
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+ //}
+
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+ vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+ }
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+ }
+ }
+ }
  }
+
+ #undef MMID_MATRIX_ROW
  }

  // ggml_compute_forward_out_prod
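
To make the new GGML_OP_MUL_MAT_ID flow above easier to follow: during GGML_TASK_INIT every src1 row is bucketed by the expert index read from the ids tensor, and the compute phase then runs one matrix multiplication per expert over just its rows. A self-contained, simplified sketch of that grouping step (plain stack arrays instead of the params->wdata layout; all sizes and ids below are made up for illustration):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define N_AS 4   // number of expert matrices (n_as)
#define NE11 8   // number of src1 rows (ne11)

int main(void) {
    // row_ids[i] plays the role of the expert selected for src1 row i
    const int32_t row_ids[NE11] = {2, 0, 2, 3, 0, 1, 2, 0};

    int64_t matrix_row_counts[N_AS];     // [n_as]
    int64_t matrix_rows[N_AS * NE11];    // [n_as][ne11]
    memset(matrix_row_counts, 0, sizeof(matrix_row_counts));

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*NE11 + (i1)]

    // group src1 rows by the expert (src0 matrix) that should process them
    for (int64_t i01 = 0; i01 < NE11; i01++) {
        const int32_t row_id = row_ids[i01];
        MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
        matrix_row_counts[row_id] += 1;
    }

    // each expert then multiplies only its own rows, in sequence
    for (int cur_a = 0; cur_a < N_AS; ++cur_a) {
        printf("expert %d:", cur_a);
        for (int64_t k = 0; k < matrix_row_counts[cur_a]; ++k) {
            printf(" %lld", (long long) MMID_MATRIX_ROW(cur_a, k));
        }
        printf("\n");
    }
    return 0;
}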
@@ -10167,19 +10325,17 @@ static void ggml_compute_forward_out_prod(
  static void ggml_compute_forward_scale_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(dst));
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
- GGML_ASSERT(ggml_is_scalar(src1));

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }

  // scale factor
- const float v = *(float *) src1->data;
+ const float v = *(float *) dst->op_params;

  const int ith = params->ith;
  const int nth = params->nth;
@@ -10210,12 +10366,11 @@ static void ggml_compute_forward_scale_f32(
  static void ggml_compute_forward_scale(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_scale_f32(params, src0, src1, dst);
+ ggml_compute_forward_scale_f32(params, src0, dst);
  } break;
  default:
  {
@@ -11404,10 +11559,13 @@ static void ggml_compute_forward_rope_f32(
  }
  } else {
  // TODO: this might be wrong for ne0 != n_dims - need double check
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
  theta_base *= freq_scale;
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
+ if (ic < n_dims) {
+ const int64_t ib = 0;
+
  // simplified from `(ib * n_dims + ic) * inv_ndims`
  float cur_rot = inv_ndims * ic - ib;

@@ -11430,6 +11588,14 @@ static void ggml_compute_forward_rope_f32(

  dst_data[0] = x0*cos_theta - x1*sin_theta;
  dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+ } else {
+ const int64_t i0 = ic;
+
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
  }
  }
  }
@@ -11557,10 +11723,13 @@ static void ggml_compute_forward_rope_f16(
  }
  } else {
  // TODO: this might be wrong for ne0 != n_dims - need double check
- // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+ // it seems we have to rope just the first n_dims elements and do nothing with the rest
+ // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
  theta_base *= freq_scale;
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
+ for (int64_t ic = 0; ic < ne0; ic += 2) {
+ if (ic < n_dims) {
+ const int64_t ib = 0;
+
  // simplified from `(ib * n_dims + ic) * inv_ndims`
  float cur_rot = inv_ndims * ic - ib;

@@ -11583,6 +11752,14 @@ static void ggml_compute_forward_rope_f16(

  dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
  dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ } else {
+ const int64_t i0 = ic;
+
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
  }
  }
  }
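
In both the f32 and f16 RoPE paths above, the partial-rotation case now rotates only the first n_dims elements of each row and copies the remaining elements through unchanged, instead of repeating the rotation over ne0/n_dims blocks. A simplified scalar model of that per-row behaviour, assuming a single fixed angle for readability (the real kernel derives the angle per position):

#include <math.h>
#include <stdint.h>

// illustrative only: GPT-NeoX style pairing, one row of length ne0, n_dims <= ne0
static void rope_row_sketch(float * dst, const float * src, int64_t ne0, int64_t n_dims, float theta) {
    for (int64_t ic = 0; ic < ne0; ic += 2) {
        if (ic < n_dims) {
            const int64_t i0 = ic/2;
            const float x0 = src[i0];
            const float x1 = src[i0 + n_dims/2];
            dst[i0]            = x0*cosf(theta) - x1*sinf(theta);
            dst[i0 + n_dims/2] = x0*sinf(theta) + x1*cosf(theta);
        } else {
            // past n_dims: values are no longer rotated, just passed through
            dst[ic + 0] = src[ic + 0];
            dst[ic + 1] = src[ic + 1];
        }
    }
}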
@@ -14191,7 +14368,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  } break;
  case GGML_OP_MUL_MAT:
  {
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
  } break;
  case GGML_OP_MUL_MAT_ID:
  {
@@ -14203,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  } break;
  case GGML_OP_SCALE:
  {
- ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_scale(params, tensor->src[0], tensor);
  } break;
  case GGML_OP_SET:
  {
@@ -14659,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg

  static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
  if (ggml_hash_contains(zero_table, a)) {
- struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
+ struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
  } else {
  return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14795,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  src0->grad,
  ggml_scale(ctx,
  ggml_mul(ctx, src0, tensor->grad),
- ggml_new_f32(ctx, 2.0f)),
+ 2.0f),
  zero_table);
  }
  } break;
@@ -14809,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  ggml_div(ctx,
  tensor->grad,
  tensor),
- ggml_new_f32(ctx, 0.5f)),
+ 0.5f),
  zero_table);
  }
  } break;
@@ -14975,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  // necessary for llama
  if (src0->grad) {
+ const float s = ((float *) tensor->op_params)[0];
+
  src0->grad =
  ggml_add_or_set(ctx,
  src0->grad,
- ggml_scale_impl(ctx, tensor->grad, src1, false),
- zero_table);
- }
- if (src1->grad) {
- src1->grad =
- ggml_add_or_set(ctx,
- src1->grad,
- ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+ ggml_scale_impl(ctx, tensor->grad, s, false),
  zero_table);
  }
  } break;
@@ -15163,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  const int n_past = ((int32_t *) tensor->op_params)[0];
  src0->grad =
  ggml_add_or_set(ctx, src0->grad,
+ /* ggml_diag_mask_inf_impl() shouldn't be here */
+ /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
  zero_table);
  }
@@ -15991,7 +16165,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  } break;
  case GGML_OP_MUL_MAT_ID:
  {
- // FIXME: blas
  n_tasks = n_threads;
  } break;
  case GGML_OP_OUT_PROD:
@@ -16325,20 +16498,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  } break;
  case GGML_OP_MUL_MAT_ID:
  {
- const struct ggml_tensor * a = node->src[2];
- const struct ggml_tensor * b = node->src[1];
- const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
- if (a->type != GGML_TYPE_F32) {
- // here we need memory just for single 2D matrix from src0
- cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
- }
- } else
- #endif
- if (b->type != vec_dot_type) {
- cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
+ const struct ggml_tensor * src0 = node->src[2];
+ const struct ggml_tensor * src1 = node->src[1];
+ const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+ if (src1->type != vec_dot_type) {
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
  }
+ const int n_as = ggml_get_op_params_i32(node, 1);
+ cur = GGML_PAD(cur, sizeof(int64_t)); // align
+ cur += n_as * sizeof(int64_t); // matrix_row_counts
+ cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
  } break;
  case GGML_OP_OUT_PROD:
  {
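
The GGML_OP_MUL_MAT_ID work-buffer estimate above now has to cover the converted src1 rows plus the per-expert bookkeeping arrays introduced in ggml_compute_forward_mul_mat_id() (matrix_row_counts and matrix_rows). A sketch of the same computation factored into a helper, reusing only names visible in this file (ggml_row_size, ggml_nelements, ggml_get_op_params_i32, GGML_PAD); the helper itself is illustrative and not part of the diff:

// illustrative helper: work-buffer bytes needed by one GGML_OP_MUL_MAT_ID node
static size_t mul_mat_id_work_size(const struct ggml_tensor * node, enum ggml_type vec_dot_type) {
    const struct ggml_tensor * src1 = node->src[1];

    size_t cur = 0;
    if (src1->type != vec_dot_type) {
        // room for src1 converted row-by-row to vec_dot_type
        cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
    }

    const int n_as = ggml_get_op_params_i32(node, 1);

    cur = GGML_PAD(cur, sizeof(int64_t));          // align the int64_t arrays
    cur += n_as * sizeof(int64_t);                 // matrix_row_counts[n_as]
    cur += n_as * src1->ne[1] * sizeof(int64_t);   // matrix_rows[n_as][ne11]

    return cur;
}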
@@ -19026,6 +19195,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
  return ctx->infos[i].name.data;
  }

+ enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+ return ctx->infos[i].type;
+ }
+
  // returns the index
  static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
  const int idx = gguf_find_key(ctx, key);
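
A small usage sketch for the new gguf_get_tensor_type() accessor, listing tensor names and types from a GGUF file. The file path is a placeholder and error handling is minimal; the rest relies on the existing GGUF API in ggml.h (gguf_init_from_file, gguf_get_n_tensors, gguf_get_tensor_name, ggml_type_name, gguf_free).

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,   // only read metadata, do not allocate tensor data
        /*.ctx      =*/ NULL,
    };

    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to open model.gguf\n");
        return 1;
    }

    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; i++) {
        printf("%-40s %s\n", gguf_get_tensor_name(ctx, i), ggml_type_name(gguf_get_tensor_type(ctx, i)));
    }

    gguf_free(ctx);
    return 0;
}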