llama_cpp 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,11 +31,17 @@
 #include <unistd.h>
 #endif
 
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
+#endif
 
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
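Note on the fallback above: C11 defines `static_assert` as a macro in <assert.h>, so when it is already visible this block does nothing; when it is not (C99 builds, or no <assert.h>), the macro is mapped to the `_Static_assert` keyword where the language level allows it, and otherwise to a harmless struct declaration that compiles everywhere but checks nothing. A minimal standalone illustration (a hypothetical test file, not part of this package):

    /* static_assert_fallback.c - mirrors the macro logic from the hunk above */
    #include <stdint.h>

    #ifndef static_assert
    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
    #define static_assert(cond, msg) _Static_assert(cond, msg)
    #else
    /* pre-C11: expands to a no-op declaration, so the check silently vanishes */
    #define static_assert(cond, msg) struct global_scope_noop_trick
    #endif
    #endif

    /* fails the build under C11 if the condition is false; a no-op under C99 */
    static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes");

    int main(void) { return 0; }

This also explains the removal of the Haiku-specific definition in the next hunk: the platform-neutral fallback subsumes it.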
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
 #endif
 #endif
 
-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -4410,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;
 
-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
                 GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -6955,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6967,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
     ((int32_t *) b->data)[3] = n_ctx;
+    memcpy((int32_t *) b->data + 4, &freq_base,  sizeof(float));
+    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
 
     ggml_scratch_load(ctx);
 
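The two new hyperparameters ride along in the same I32 parameter tensor as the integer arguments: slots 0-3 hold the ints, and the float bit patterns are copied into slots 4 and 5 with memcpy, which avoids the undefined behavior of type-punning through a pointer cast (and assumes sizeof(float) == sizeof(int32_t), as ggml does throughout). A toy round-trip, with a plain local array standing in for b->data:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        int32_t params[6] = {0};      /* stand-in for the I32 tensor's data */
        float   freq_base = 10000.0f;

        /* write: bit-copy the float into slot 4, as ggml_rope_impl now does */
        memcpy(params + 4, &freq_base, sizeof(float));

        /* read: bit-copy it back out, as the rope compute kernels below do */
        float out;
        memcpy(&out, params + 4, sizeof(float));

        assert(out == 10000.0f);      /* exact: the bits are untouched */
        return 0;
    }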
@@ -6991,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -7001,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
 }
 
 // ggml_rope_back
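ggml_rope and ggml_rope_inplace now forward freq_base = 10000.0f and freq_scale = 1.0f, so existing callers behave exactly as before; ggml_rope_custom_inplace exposes both knobs, which is what RoPE context-extension schemes tune. A hypothetical call site under this era's graph API (tensor shape and values are made up, and graph execution is elided since the compute entry points vary between ggml revisions):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        /* toy "keys": 64-dim rows at 8 positions */
        struct ggml_tensor * k = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);

        struct ggml_tensor * kr = ggml_rope_custom_inplace(
            ctx, k,
            /*n_past     =*/ 0,
            /*n_dims     =*/ 64,
            /*mode       =*/ 0,        /* mode & 2: GPT-NeoX style, mode & 4: GLM */
            /*n_ctx      =*/ 0,        /* only consulted by the GLM path */
            /*freq_base  =*/ 10000.0f, /* the ggml_rope default */
            /*freq_scale =*/ 0.25f);   /* linear position scaling, e.g. 4x context */

        struct ggml_cgraph gf = ggml_build_forward(kr);
        (void) gf; /* building and running the graph is omitted in this sketch */

        ggml_free(ctx);
        return 0;
    }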
@@ -7011,7 +7029,8 @@ struct ggml_tensor * ggml_rope_back(
         struct ggml_tensor  * a,
         int                   n_past,
         int                   n_dims,
-        int                   mode) {
+        int                   mode,
+        int                   n_ctx) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
@@ -7025,12 +7044,13 @@ struct ggml_tensor * ggml_rope_back(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
     ggml_set_name(b, "n_past, n_dims, mode");
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;
 
     ggml_scratch_load(ctx);
 
@@ -10684,6 +10704,8 @@ static void ggml_compute_forward_mul_mat(
 
     const enum ggml_type type = src0->type;
 
+    const bool src1_cont = ggml_is_contiguous(src1);
+
     ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10769,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata = params->wdata;
+                    float * const wdata            = params->wdata;
                     ggml_to_float_t const to_float = type_traits[type].to_float;
 
                     size_t id = 0;
@@ -10805,7 +10827,7 @@ static void ggml_compute_forward_mul_mat(
     // src1 rows
     const int64_t nr1 = ne11*ne12*ne13;
 
-    void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
     for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10850,15 @@ static void ggml_compute_forward_mul_mat(
         const int64_t i3 = i13;
 
         const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-        const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
+
+        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+        //       the original src1 data pointer, so we should index using the indices directly
+        // TODO: this is a bit of a hack, we should probably have a better way to handle this
+        const char * src1_col = (const char *) wdata +
+            (src1_cont || src1->type != vec_dot_type
+                ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+                : (i11*nb11 + i12*nb12 + i13*nb13));
 
         float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
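The ternary picks between two addressing schemes: when src1 has been converted into params->wdata (or is contiguous to begin with), rows sit densely and the column is found by row number times row_size; only when the original, possibly strided src1 buffer is used directly do the byte strides nb11/nb12/nb13 apply. For a contiguous tensor the two formulas coincide, which a toy check with made-up sizes makes concrete:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    int main(void) {
        const int64_t ne11 = 8, ne12 = 4, ne13 = 2; /* rows, matrices, batches */
        const size_t  row_size = 128;               /* bytes per row */

        /* contiguous byte strides, as ggml lays them out with no views/permutes */
        const size_t nb11 = row_size;
        const size_t nb12 = nb11*ne11;
        const size_t nb13 = nb12*ne12;

        for (int64_t i13 = 0; i13 < ne13; i13++)
        for (int64_t i12 = 0; i12 < ne12; i12++)
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            const size_t by_index  = (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
            const size_t by_stride =  i11*nb11 + i12*nb12 + i13*nb13;
            assert(by_index == by_stride); /* equal only because strides are dense */
        }
        return 0;
    }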
@@ -12062,16 +12092,21 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
     assert(n_past >= 0);
 
@@ -12100,7 +12135,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
    int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
@@ -12112,7 +12147,7 @@ static void ggml_compute_forward_rope_f32(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;
 
-            float theta = (float)p;
+            float theta = freq_scale * (float)p;
 
             if (is_glm) {
                 theta = MIN(p, n_ctx - 2);
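Combined with the theta_scale change above (and its f16 twin below), the rotation angle applied to dimension pair i at position p is, in closed form (the surrounding loop, not shown in these hunks, multiplies theta by theta_scale once per pair):

    theta_i(p) = freq_scale * p * freq_base^(-2*i/n_dims)

With the defaults freq_base = 10000 and freq_scale = 1 this is exactly the original RoPE angle, so existing graphs are unaffected; freq_scale < 1 compresses positions (linear interpolation for longer contexts), while raising freq_base stretches the rotation periods (NTK-style scaling).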
@@ -12189,16 +12224,21 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
     assert(n_past >= 0);
 
@@ -12227,7 +12267,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
@@ -12239,7 +12279,7 @@ static void ggml_compute_forward_rope_f16(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;
 
-            float theta = (float)p;
+            float theta = freq_scale * (float)p;
 
             if (is_glm) {
                 theta = MIN(p, n_ctx - 2);
@@ -12300,7 +12340,7 @@ static void ggml_compute_forward_rope_f16(
                         const float x0 = GGML_FP16_TO_FP32(src[0]);
                         const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
 
-                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                         dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
                 }
@@ -12339,7 +12379,7 @@ static void ggml_compute_forward_rope_back_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+    assert(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12982,12 +13022,13 @@ static void ggml_compute_forward_conv_1d(
             };
     }
 }
 
-// ggml_compute_forward_conv_2d_sk_p0
+// ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+static void ggml_compute_forward_conv_2d_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13007,28 +13048,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     // size of the convolution row - the kernel size unrolled across all channels
     const int ew0 = nk0*nk1*ne02;
 
+    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
+    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
+    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
+    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
+    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
+
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
         memset(params->wdata, 0, params->wsize);
 
         // prepare source data (src1)
         {
             ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-            for (int i13 = 0; i13 < ne13; i13++) {
-                for (int i12 = 0; i12 < ne12; i12++) {
-                    const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
-                    ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                const int idx0 = i0*s0 + ik0*d0 - p0;
+                                const int idx1 = i1*s1 + ik1*d1 - p1;
 
-                    for (int i1 = 0; i1 < ne1; i1++) {
-                        for (int i0 = 0; i0 < ne0; i0++) {
-                            for (int ik1 = 0; ik1 < nk1; ik1++) {
-                                for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
                                     dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
-                                        GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+                                        GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
                                 }
                             }
                         }
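The rewritten gather is a general im2col: for each output pixel (i0, i1) and kernel tap (ik0, ik1), the source coordinate per axis is i*s + ik*d - p, and taps falling outside the input are skipped, leaving the zeros written by the memset, which is exactly zero padding. The output extents ne0/ne1 follow the standard convolution arithmetic (shown here for reference with made-up numbers; this helper is not part of the diff):

    #include <stdio.h>

    /* output size for input n, kernel k, stride s, padding p, dilation d */
    static int conv_out_size(int n, int k, int s, int p, int d) {
        return (n + 2*p - d*(k - 1) - 1)/s + 1;
    }

    int main(void) {
        /* 224-wide input, 3x3 kernel, stride 1, pad 1, dilation 1 -> 224 ("same") */
        printf("%d\n", conv_out_size(224, 3, 1, 1, 1));
        return 0;
    }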
@@ -13071,19 +13121,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     }
 }
 
-static void ggml_compute_forward_conv_2d_sk_p0(
+static void ggml_compute_forward_conv_2d(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst
+        ) {
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
+                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
             } break;
         case GGML_TYPE_F32:
             {
-                //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+                //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
                 GGML_ASSERT(false);
             } break;
         default:
@@ -13093,32 +13145,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
     }
 }
 
-// ggml_compute_forward_conv_2d
-
-static void ggml_compute_forward_conv_2d(
-    const struct ggml_compute_params* params,
-    const struct ggml_tensor* src0,
-    const struct ggml_tensor* src1,
-    const struct ggml_tensor* opt0,
-    struct ggml_tensor* dst) {
-    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
-    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
-    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
-    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
-    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
-    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-    GGML_ASSERT(d0 == 1); // dilation not supported
-    GGML_ASSERT(d1 == 1);
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(p1 == 0);
-
-    if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
-        ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-    } else {
-        GGML_ASSERT(false); // only stride equal to kernel size is supported
-    }
-}
-
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
@@ -15712,17 +15738,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // necessary for llama
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 4);
+                    assert(ggml_nelements(src1) == 6);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode   = ((int32_t *) src1->data)[2];
+                    const int n_ctx  = ((int32_t *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope_back(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode),
+                                mode,
+                                n_ctx),
                             inplace);
                 }
                 if (src1->grad) {
@@ -16293,8 +16321,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.nth = n_tasks_arr[node_n];
                     ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
 
             // distribute new work or execute it direct if 1T
@@ -16324,8 +16352,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.type = GGML_TASK_FINALIZE;
                     ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
+
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             } else {
                 break;
             }
@@ -16575,19 +16604,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     const int64_t ne11 = node->src[1]->ne[1]; // H
                     const int64_t ne12 = node->src[1]->ne[2]; // C
 
+                    const int64_t ne0 = node->ne[0];
+                    const int64_t ne1 = node->ne[1];
+                    const int64_t ne2 = node->ne[2];
                     const int64_t nk = ne00*ne01;
+                    const int64_t ew0 = nk * ne02;
 
-                    UNUSED(ne02);
                     UNUSED(ne03);
-                    UNUSED(nk);
+                    UNUSED(ne2);
 
                     size_t cur = 0;
 
                     if (node->src[0]->type == GGML_TYPE_F16 &&
-                            node->src[1]->type == GGML_TYPE_F32) {
-                        cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                     } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                                   node->src[1]->type == GGML_TYPE_F32) {
+                               node->src[1]->type == GGML_TYPE_F32) {
                         cur = sizeof(float)* (ne10*ne11*ne12);
                     } else {
                         GGML_ASSERT(false);
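The F16 scratch estimate now matches the im2col layout introduced above: one unrolled row of ew0 = nk*ne02 half-floats per output pixel, with ne0*ne1 pixels in total, replacing the old bound derived from the input image size. A back-of-the-envelope with made-up dimensions:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* hypothetical conv: 3x3 kernel, 64 input channels, 224x224 output */
        const int64_t nk  = 3*3;            /* ne00*ne01: kernel taps          */
        const int64_t ew0 = nk*64;          /* unrolled row: taps * channels   */
        const int64_t cur = 2*224*224*ew0;  /* sizeof(ggml_fp16_t)*ne0*ne1*ew0 */
        printf("conv_2d F16 scratch: %lld bytes\n", (long long) cur); /* ~57.8 MB */
        return 0;
    }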
@@ -16864,9 +16896,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }
 
 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work == NULL);
-    //assert(cgraph->work_size == 0);
-
     uint64_t size_eval = 0;
 
     // compute size of intermediate results
@@ -17305,9 +17334,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
     GGML_PRINT("=== GRAPH ===\n");
 
-    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
@@ -1121,6 +1121,17 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1128,7 +1139,8 @@ extern "C" {
             struct ggml_tensor * a,
             int                  n_past,
             int                  n_dims,
-            int                  mode);
+            int                  mode,
+            int                  n_ctx);
 
     // alibi position embedding
     // in-place, returns view(a)
@@ -15,6 +15,14 @@
 #define K_SCALE_SIZE 12
 #endif
 
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //