llama_cpp 0.3.3 → 0.3.4

@@ -31,11 +31,17 @@
  #include <unistd.h>
  #endif

+ // static_assert should be a #define, but if it's not,
+ // fall back to the _Static_assert C11 keyword.
  // if C99 - static_assert is noop
  // ref: https://stackoverflow.com/a/53923785/4039976
  #ifndef static_assert
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+ #define static_assert(cond, msg) _Static_assert(cond, msg)
+ #else
  #define static_assert(cond, msg) struct global_scope_noop_trick
  #endif
+ #endif

  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid hundreds of casts
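
The guarded block above keeps the existing no-op fallback for C99 while preferring the C11 _Static_assert keyword whenever <assert.h> has not already provided static_assert as a macro. A minimal standalone sketch of the same pattern (the assertion text below is illustrative, not taken from the gem):

    #include <assert.h>  /* on C11 this usually defines static_assert already */

    #ifndef static_assert
    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
    #define static_assert(cond, msg) _Static_assert(cond, msg)      /* C11 keyword */
    #else
    #define static_assert(cond, msg) struct global_scope_noop_trick /* no-op on C99 */
    #endif
    #endif

    /* usable at file scope under any of the three definitions */
    static_assert(sizeof(float) == 4, "32-bit float assumed");

    int main(void) { return 0; }
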
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
  #endif
  #endif

- #ifdef __HAIKU__
- #define static_assert(cond, msg) _Static_assert(cond, msg)
- #endif
-
  /*#define GGML_PERF*/
  #define GGML_DEBUG 0
  #define GGML_GELU_FP16
@@ -4410,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
  if (&g_state.contexts[i].context == ctx) {
  g_state.contexts[i].used = false;

- GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
- __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+ GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+ __func__, i, ggml_used_mem(ctx));

  if (ctx->mem_buffer_owned) {
  GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -6955,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
  int n_dims,
  int mode,
  int n_ctx,
+ float freq_base,
+ float freq_scale,
  bool inplace) {
  GGML_ASSERT(n_past >= 0);
  bool is_node = false;
@@ -6967,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl(

  ggml_scratch_save(ctx);

- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);

  ((int32_t *) b->data)[0] = n_past;
  ((int32_t *) b->data)[1] = n_dims;
  ((int32_t *) b->data)[2] = mode;
  ((int32_t *) b->data)[3] = n_ctx;
+ memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
+ memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));

  ggml_scratch_load(ctx);
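
freq_base and freq_scale are floats, but the parameter tensor b is typed GGML_TYPE_I32, so the diff stores their raw bit patterns with memcpy rather than an assignment (which would convert the values to integers). A small round-trip sketch of that trick, assuming float and int32_t are both 4 bytes as ggml does:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        // stand-in for the I32 parameter tensor: 6 int32 slots
        int32_t data[6] = {0};

        float freq_base  = 10000.0f;
        float freq_scale = 1.0f;

        // write: copy the raw float bits into slots 4 and 5 (no integer conversion)
        memcpy(data + 4, &freq_base,  sizeof(float));
        memcpy(data + 5, &freq_scale, sizeof(float));

        // read: copy the bits back out, as the rope kernels do on the other side
        float fb, fs;
        memcpy(&fb, data + 4, sizeof(float));
        memcpy(&fs, data + 5, sizeof(float));

        assert(fb == freq_base && fs == freq_scale);  // lossless round trip
        printf("freq_base = %.1f, freq_scale = %.1f\n", fb, fs);
        return 0;
    }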

@@ -6991,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
  int n_dims,
  int mode,
  int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
  }

  struct ggml_tensor * ggml_rope_inplace(
@@ -7001,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
  int n_dims,
  int mode,
  int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+ }
+
+ struct ggml_tensor * ggml_rope_custom_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
  }

  // ggml_rope_back
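
A hedged usage sketch of the new entry point: a caller that wants linearly scaled RoPE (for example freq_scale = 0.5 to stretch a model to roughly twice its trained context) can swap ggml_rope_inplace for the call below. The wrapper function and the chosen values are illustrative, not part of the gem:

    #include "ggml.h"

    // illustrative helper: apply RoPE with an explicit base and linear scale factor
    static struct ggml_tensor * apply_scaled_rope(struct ggml_context * ctx,
                                                  struct ggml_tensor  * cur,
                                                  int n_past, int n_dims, int n_ctx) {
        const float freq_base  = 10000.0f; // same default that ggml_rope_inplace passes
        const float freq_scale = 0.5f;     // e.g. linear scaling for ~2x context

        // mode 0 = original LLaMA-style RoPE; identical to ggml_rope_inplace
        // except that freq_base/freq_scale are exposed to the caller
        return ggml_rope_custom_inplace(ctx, cur, n_past, n_dims, 0, n_ctx,
                                        freq_base, freq_scale);
    }
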
@@ -7011,7 +7029,8 @@ struct ggml_tensor * ggml_rope_back(
  struct ggml_tensor * a,
  int n_past,
  int n_dims,
- int mode) {
+ int mode,
+ int n_ctx) {
  GGML_ASSERT(n_past >= 0);
  GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");

@@ -7025,12 +7044,13 @@ struct ggml_tensor * ggml_rope_back(

  ggml_scratch_save(ctx);

- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
  ggml_set_name(b, "n_past, n_dims, mode");

  ((int32_t *) b->data)[0] = n_past;
  ((int32_t *) b->data)[1] = n_dims;
  ((int32_t *) b->data)[2] = mode;
+ ((int32_t *) b->data)[3] = n_ctx;

  ggml_scratch_load(ctx);

@@ -10684,6 +10704,8 @@ static void ggml_compute_forward_mul_mat(

  const enum ggml_type type = src0->type;

+ const bool src1_cont = ggml_is_contiguous(src1);
+
  ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10769,7 @@ static void ggml_compute_forward_mul_mat(
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

  if (type != GGML_TYPE_F32) {
- float * const wdata = params->wdata;
+ float * const wdata = params->wdata;
  ggml_to_float_t const to_float = type_traits[type].to_float;

  size_t id = 0;
@@ -10805,7 +10827,7 @@ static void ggml_compute_forward_mul_mat(
  // src1 rows
  const int64_t nr1 = ne11*ne12*ne13;

- void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

  for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10850,15 @@ static void ggml_compute_forward_mul_mat(
  const int64_t i3 = i13;

  const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
- const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
+
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+ // the original src1 data pointer, so we should index using the indices directly
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
+ const char * src1_col = (const char *) wdata +
+ (src1_cont || src1->type != vec_dot_type
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+ : (i11*nb11 + i12*nb12 + i13*nb13));

  float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
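
A small sketch of the two addressing schemes that the new branch above chooses between: when the rows have been packed back to back (contiguous src1, or data converted into wdata) the flat row index times the row size is enough, otherwise the original byte strides nb11/nb12/nb13 must be used. The shapes and strides below are local stand-ins for ggml's ne*/nb* fields:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // toy shape: ne11 rows per matrix, ne12 matrices per batch, ne13 batches
        const int64_t ne11 = 4, ne12 = 2, ne13 = 3;
        const size_t  row_size = 128;            // bytes per packed row

        // byte strides of a hypothetical non-contiguous src1 (e.g. a transposed view)
        const size_t nb11 = 512, nb12 = 4096, nb13 = 16384;

        const int64_t i11 = 1, i12 = 1, i13 = 2; // element coordinates

        // contiguous / packed case: rows laid out back to back in wdata
        const size_t off_packed  = (size_t)(i11 + i12*ne11 + i13*ne12*ne11) * row_size;

        // non-contiguous case: walk the original tensor using its byte strides
        const size_t off_strided = (size_t)i11*nb11 + (size_t)i12*nb12 + (size_t)i13*nb13;

        printf("packed offset  = %zu bytes\n", off_packed);
        printf("strided offset = %zu bytes\n", off_strided);
        return 0;
    }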

@@ -12062,16 +12092,21 @@ static void ggml_compute_forward_rope_f32(
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_nelements(src1) == 4);
+ GGML_ASSERT(ggml_nelements(src1) == 6);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }

+ float freq_base;
+ float freq_scale;
+
  const int n_past = ((int32_t *) src1->data)[0];
  const int n_dims = ((int32_t *) src1->data)[1];
  const int mode = ((int32_t *) src1->data)[2];
  const int n_ctx = ((int32_t *) src1->data)[3];
+ memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

  assert(n_past >= 0);

@@ -12100,7 +12135,7 @@ static void ggml_compute_forward_rope_f32(
  // row index used to determine which thread to use
  int ir = 0;

- const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);

  const bool is_neox = mode & 2;
  const bool is_glm = mode & 4;
@@ -12112,7 +12147,7 @@ static void ggml_compute_forward_rope_f32(
  if (ir++ < ir0) continue;
  if (ir > ir1) break;

- float theta = (float)p;
+ float theta = freq_scale * (float)p;

  if (is_glm) {
  theta = MIN(p, n_ctx - 2);
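
Putting the two changes together, the rotation angle for dimension pair i at position p becomes theta_i = freq_scale * p * freq_base^(-2i/n_dims); with freq_base = 10000 and freq_scale = 1 this reduces to the original formula. A tiny sketch of that loop, with illustrative values:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int   n_dims     = 128;      // head dimension
        const int   p          = 42;       // absolute position of the token
        const float freq_base  = 10000.0f; // default base
        const float freq_scale = 1.0f;     // 1.0 = no context scaling

        const float theta_scale = powf(freq_base, -2.0f/n_dims);

        float theta = freq_scale * (float)p;
        for (int i = 0; i < n_dims; i += 2) {
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);
            // each (x0, x1) pair of the row would be rotated by (cos_theta, sin_theta) here
            if (i < 8) {
                printf("i = %3d  theta = %.6f  cos = %.6f  sin = %.6f\n",
                       i, theta, cos_theta, sin_theta);
            }
            theta *= theta_scale; // frequency decays geometrically across dimension pairs
        }
        return 0;
    }
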
@@ -12189,16 +12224,21 @@ static void ggml_compute_forward_rope_f16(
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_nelements(src1) == 4);
+ GGML_ASSERT(ggml_nelements(src1) == 6);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }

+ float freq_base;
+ float freq_scale;
+
  const int n_past = ((int32_t *) src1->data)[0];
  const int n_dims = ((int32_t *) src1->data)[1];
  const int mode = ((int32_t *) src1->data)[2];
  const int n_ctx = ((int32_t *) src1->data)[3];
+ memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

  assert(n_past >= 0);

@@ -12227,7 +12267,7 @@ static void ggml_compute_forward_rope_f16(
  // row index used to determine which thread to use
  int ir = 0;

- const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);

  const bool is_neox = mode & 2;
  const bool is_glm = mode & 4;
@@ -12239,7 +12279,7 @@ static void ggml_compute_forward_rope_f16(
  if (ir++ < ir0) continue;
  if (ir > ir1) break;

- float theta = (float)p;
+ float theta = freq_scale * (float)p;

  if (is_glm) {
  theta = MIN(p, n_ctx - 2);
@@ -12300,7 +12340,7 @@ static void ggml_compute_forward_rope_f16(
  const float x0 = GGML_FP16_TO_FP32(src[0]);
  const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);

- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
  dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  }
@@ -12339,7 +12379,7 @@ static void ggml_compute_forward_rope_back_f32(
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+ assert(ggml_nelements(src1) == 4);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
@@ -12982,12 +13022,13 @@ static void ggml_compute_forward_conv_1d(
  };
  }

- // ggml_compute_forward_conv_2d_sk_p0
+ // ggml_compute_forward_conv_2d

- static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+ static void ggml_compute_forward_conv_2d_f16_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
  struct ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13007,28 +13048,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
  // size of the convolution row - the kernel size unrolled across all channels
  const int ew0 = nk0*nk1*ne02;

+ const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+ const int32_t s1 = ((const int32_t*)(opt0->data))[1];
+ const int32_t p0 = ((const int32_t*)(opt0->data))[2];
+ const int32_t p1 = ((const int32_t*)(opt0->data))[3];
+ const int32_t d0 = ((const int32_t*)(opt0->data))[4];
+ const int32_t d1 = ((const int32_t*)(opt0->data))[5];
+
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
  GGML_ASSERT(nb10 == sizeof(float));

  if (params->type == GGML_TASK_INIT) {
- // TODO: fix this memset (wsize is overestimated)
  memset(params->wdata, 0, params->wsize);

  // prepare source data (src1)
  {
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;

- for (int i13 = 0; i13 < ne13; i13++) {
- for (int i12 = 0; i12 < ne12; i12++) {
- const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
- ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+ for (int i12 = 0; i12 < ne12; i12++) {
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
+ ggml_fp16_t * dst_data = wdata;
+
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
+ const int idx0 = i0*s0 + ik0*d0 - p0;
+ const int idx1 = i1*s1 + ik1*d1 - p1;

- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- for (int ik1 = 0; ik1 < nk1; ik1++) {
- for (int ik0 = 0; ik0 < nk0; ik0++) {
+ if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
  dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
- GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+ GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
  }
  }
  }
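
The new gather loop implements an im2col-style packing: each output position (i0, i1) reads kernel tap (ik0, ik1) from input coordinate idx = i*stride + ik*dilation - padding, and taps that fall outside the input are simply skipped (implicit zero padding, since wdata was memset to zero). A standalone sketch of just that index mapping, reduced to 1-D for brevity; all names and values here are illustrative:

    #include <stdio.h>

    int main(void) {
        const int ne10 = 8;  // input width
        const int nk0  = 3;  // kernel width
        const int s0   = 2;  // stride
        const int p0   = 1;  // padding
        const int d0   = 1;  // dilation

        // conventional output size: floor((in + 2p - d*(k-1) - 1)/s) + 1
        const int ne0 = (ne10 + 2*p0 - d0*(nk0 - 1) - 1)/s0 + 1;

        for (int i0 = 0; i0 < ne0; i0++) {
            printf("out %d reads:", i0);
            for (int ik0 = 0; ik0 < nk0; ik0++) {
                const int idx0 = i0*s0 + ik0*d0 - p0;
                if (idx0 < 0 || idx0 >= ne10) {
                    printf("  [pad]");   // out of range -> slot stays zero in wdata
                } else {
                    printf("  in[%d]", idx0);
                }
            }
            printf("\n");
        }
        return 0;
    }
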
@@ -13071,19 +13121,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
  }
  }

- static void ggml_compute_forward_conv_2d_sk_p0(
+ static void ggml_compute_forward_conv_2d(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst
+ ) {
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
  } break;
  case GGML_TYPE_F32:
  {
- //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+ //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
  GGML_ASSERT(false);
  } break;
  default:
@@ -13093,32 +13145,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
  }
  }

- // ggml_compute_forward_conv_2d
-
- static void ggml_compute_forward_conv_2d(
- const struct ggml_compute_params* params,
- const struct ggml_tensor* src0,
- const struct ggml_tensor* src1,
- const struct ggml_tensor* opt0,
- struct ggml_tensor* dst) {
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
- GGML_ASSERT(d0 == 1); // dilation not supported
- GGML_ASSERT(d1 == 1);
- GGML_ASSERT(p0 == 0); // padding not supported
- GGML_ASSERT(p1 == 0);
-
- if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
- ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
- } else {
- GGML_ASSERT(false); // only stride equal to kernel size is supported
- }
- }
-
  // ggml_compute_forward_pool_1d_sk_p0

  static void ggml_compute_forward_pool_1d_sk_p0(
@@ -15712,17 +15738,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  // necessary for llama
  if (src0->grad) {
  assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 4);
+ assert(ggml_nelements(src1) == 6);
  const int n_past = ((int32_t *) src1->data)[0];
  const int n_dims = ((int32_t *) src1->data)[1];
  const int mode = ((int32_t *) src1->data)[2];
+ const int n_ctx = ((int32_t *) src1->data)[3];
  src0->grad = ggml_add_impl(ctx,
  src0->grad,
  ggml_rope_back(ctx,
  tensor->grad,
  n_past,
  n_dims,
- mode),
+ mode,
+ n_ctx),
  inplace);
  }
  if (src1->grad) {
@@ -16293,8 +16321,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  if (GGML_OP_HAS_FINALIZE[node->op]) {
  params.nth = n_tasks_arr[node_n];
  ggml_compute_forward(&params, node);
- ggml_graph_compute_perf_stats_node(node, state->shared);
  }
+ ggml_graph_compute_perf_stats_node(node, state->shared);
  }

  // distribute new work or execute it direct if 1T
@@ -16324,8 +16352,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  if (GGML_OP_HAS_FINALIZE[node->op]) {
  params.type = GGML_TASK_FINALIZE;
  ggml_compute_forward(&params, node);
- ggml_graph_compute_perf_stats_node(node, state->shared);
  }
+
+ ggml_graph_compute_perf_stats_node(node, state->shared);
  } else {
  break;
  }
@@ -16575,19 +16604,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  const int64_t ne11 = node->src[1]->ne[1]; // H
  const int64_t ne12 = node->src[1]->ne[2]; // C

+ const int64_t ne0 = node->ne[0];
+ const int64_t ne1 = node->ne[1];
+ const int64_t ne2 = node->ne[2];
  const int64_t nk = ne00*ne01;
+ const int64_t ew0 = nk * ne02;

- UNUSED(ne02);
  UNUSED(ne03);
- UNUSED(nk);
+ UNUSED(ne2);

  size_t cur = 0;

  if (node->src[0]->type == GGML_TYPE_F16 &&
- node->src[1]->type == GGML_TYPE_F32) {
- cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+ node->src[1]->type == GGML_TYPE_F32) {
+ cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
  } else if (node->src[0]->type == GGML_TYPE_F32 &&
- node->src[1]->type == GGML_TYPE_F32) {
+ node->src[1]->type == GGML_TYPE_F32) {
  cur = sizeof(float)* (ne10*ne11*ne12);
  } else {
  GGML_ASSERT(false);
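
The planner change matches the new packing layout: the F16 work buffer now has to hold one unrolled kernel row (ew0 = nk0*nk1*ne02 half-floats) per output element, i.e. ne0*ne1*ew0 entries, instead of an F16 copy of the input sized ne10*ne11*ne12. A back-of-the-envelope sketch of the two sizes, with illustrative shapes and uint16_t standing in for ggml_fp16_t:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // illustrative kernel, input and output shapes
        const int64_t ne00 = 3,  ne01 = 3,  ne02 = 64;  // kernel W, H, channels
        const int64_t ne10 = 56, ne11 = 56, ne12 = 64;  // input  W, H, channels
        const int64_t ne0  = 56, ne1  = 56;             // output W, H (depends on stride/padding)

        const int64_t ew0 = ne00*ne01*ne02;             // unrolled kernel row per output element

        const size_t old_cur = sizeof(uint16_t)*(size_t)(ne10*ne11*ne12); // fp16 copy of the input
        const size_t new_cur = sizeof(uint16_t)*(size_t)(ne0*ne1*ew0);    // im2col-style buffer

        printf("old work size: %zu bytes\n", old_cur);
        printf("new work size: %zu bytes\n", new_cur);
        return 0;
    }
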
@@ -16864,9 +16896,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
  }

  void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
- //assert(cgraph->work == NULL);
- //assert(cgraph->work_size == 0);
-
  uint64_t size_eval = 0;

  // compute size of intermediate results
@@ -17305,9 +17334,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {

  GGML_PRINT("=== GRAPH ===\n");

- GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
- GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
  GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
  for (int i = 0; i < cgraph->n_nodes; i++) {
  struct ggml_tensor * node = cgraph->nodes[i];
@@ -1121,6 +1121,17 @@ extern "C" {
  int mode,
  int n_ctx);

+ // custom RoPE, in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale);
+
  // rotary position embedding backward, i.e compute dx from dy
  // a - dy
  GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1128,7 +1139,8 @@ extern "C" {
  struct ggml_tensor * a,
  int n_past,
  int n_dims,
- int mode);
+ int mode,
+ int n_ctx);

  // alibi position embedding
  // in-place, returns view(a)
@@ -15,6 +15,14 @@
  #define K_SCALE_SIZE 12
  #endif

+ #ifndef static_assert
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+ #define static_assert(cond, msg) _Static_assert(cond, msg)
+ #else
+ #define static_assert(cond, msg) struct global_scope_noop_trick
+ #endif
+ #endif
+
  //
  // Super-block quantization structures
  //