llama_cpp 0.15.3 → 0.15.4

@@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
+    ggml_vk_instance_init();
+
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif
@@ -60,6 +60,9 @@
 
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
 
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
@@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
 
 typedef HANDLE pthread_t;
 
@@ -1567,11 +1576,11 @@ do { \
 
 // F16 arithmetic is not supported by AVX, so we use F32 instead
 
-#define GGML_F32Cx8 __m256
+#define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
-static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
@@ -1580,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
 
     return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
     float arr[8];
 
     __lasx_xvst(y, arr, 0);
 
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
         x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1662,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4
 
-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1673,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
     return __lsx_vld(tmp, 0);
 }
 
-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     __lsx_vst(y, arr, 0);
@@ -2306,32 +2316,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
   const __m512 r = _mm512_set1_ps(0x1.8p23f);
   const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
   const __m512 n = _mm512_sub_ps(z, r);
-  const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
-                                    _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
-  const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
-  const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
-  const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
-  const __m512 u = _mm512_mul_ps(b, b);
-  const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-                                                                   _mm512_set1_ps(0x1.573e2ep-5f)), u,
-                                                   _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-                                                                   _mm512_set1_ps(0x1.fffdb6p-2f))),
-                                   u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
-  if (_mm512_kortestz(c, c))
-    return _mm512_fmadd_ps(j, k, k);
-  const __m512i g = _mm512_and_si512(
-      _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
-      _mm512_set1_epi32(0x82000000u));
-  const __m512 s1 =
-      _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
-  const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
   const __mmask16 d =
       _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-  return _mm512_mask_blend_ps(
-      d, _mm512_mask_blend_ps(
-             c, _mm512_fmadd_ps(k, j, k),
-             _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
-      _mm512_mul_ps(s1, s1));
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
 }
 
 // computes silu x/(1+exp(-x)) in single precision vector
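Note: the rewrite above drops the hand-built exponent bits (e, k, s1, s2) and the extra |n| > 126 mask in favour of _mm512_scalef_ps(), which applies the 2^n scaling directly. Below is a scalar sketch of the same range reduction, for illustration only (the |n| > 192 special-case blend handled by the d mask and INFINITY is elided here):

    #include <math.h>

    // exp(x) ~= 2^n * P(b), with n = round(x / ln 2) and b = x - n*ln 2
    // (ln 2 split into a high and a low part, as in the vector code above)
    static float expf_sketch(float x) {
        const float n = rintf(x * 0x1.715476p+0f);                // x / ln 2, rounded
        const float b = (x - n*0x1.62e4p-1f) - n*0x1.7f7d1cp-20f; // remainder
        // same degree-5 polynomial coefficients as above, in Horner form
        float j = 0x1.0e4020p-7f;
        j = j*b + 0x1.573e2ep-5f;
        j = j*b + 0x1.555e66p-3f;
        j = j*b + 0x1.fffdb6p-2f;
        j = j*b + 0x1.ffffecp-1f;
        j = j*b + 1.0f;
        return ldexpf(j, (int) n);                                // the scalef step: j * 2^n
    }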
@@ -2883,24 +2888,20 @@ struct ggml_state {
 
 // global state
 static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
 
 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
     }
 }
 
 // TODO: make this somehow automatically executed
 // some sort of "sentry" mechanism
 inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
+    atomic_flag_clear(&g_state_critical);
 }
 
 #if defined(__gnu_linux__)
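The hunk above swaps the counting barrier for a plain test-and-set spin lock. For reference, a minimal sketch of the equivalent portable C11 pattern (an illustration, not code from the diff); the Win32 shims added earlier give the same semantics because InterlockedExchange() returns the flag's previous value:

    #include <stdatomic.h>
    #include <sched.h>

    static atomic_flag critical = ATOMIC_FLAG_INIT;

    static void critical_start(void) {
        // test_and_set returns the previous value: false means the lock was free
        while (atomic_flag_test_and_set(&critical)) {
            sched_yield(); // spin
        }
    }

    static void critical_end(void) {
        atomic_flag_clear(&critical);
    }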
@@ -3216,7 +3217,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -3225,6 +3230,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
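Illustration (an assumed usage sketch, not part of the diff): the relaxed predicates accept views whose inner data stays packed even when an outer stride does not. For example, a column slice that keeps the parent's row stride fails ggml_is_contiguous() but still satisfies ggml_is_contiguous_1(), assuming ggml_view_2d() derives the outer strides from the explicit nb1 it is given:

    #include "ggml.h"

    // Assumes an initialized ggml_context * ctx (e.g. from ggml_init()).
    static void contiguity_example(struct ggml_context * ctx) {
        // keep only the first 4 floats of each row, but inherit the parent's
        // 8-float row stride
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
        struct ggml_tensor * v = ggml_view_2d(ctx, t, 4, 8, t->nb[1], 0);

        GGML_ASSERT(!ggml_is_contiguous  (v)); // rows are padded: nb[1] != 4*sizeof(float)
        GGML_ASSERT( ggml_is_contiguous_1(v)); // nb[0] is the element size, outer dims follow
    }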
@@ -4882,10 +4895,21 @@ struct ggml_tensor * ggml_repeat_back(
 // ggml_concat
 
 struct ggml_tensor * ggml_concat(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b) {
-    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int dim) {
+    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+
+    int64_t ne[GGML_MAX_DIMS];
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d == dim) {
+            ne[d] = a->ne[d] + b->ne[d];
+            continue;
+        }
+        GGML_ASSERT(a->ne[d] == b->ne[d]);
+        ne[d] = a->ne[d];
+    }
 
     bool is_node = false;
 
@@ -4893,7 +4917,9 @@ struct ggml_tensor * ggml_concat(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    ggml_set_op_params_i32(result, 0, dim);
 
     result->op = GGML_OP_CONCAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
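A hedged usage sketch of the new signature (illustrative, not taken from the diff): callers now pass the concatenation axis explicitly, and the old behaviour corresponds to dim == 2:

    #include "ggml.h"

    // Assumes an initialized ggml_context * ctx.
    static void concat_example(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 4);
        struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 12);

        // previously: ggml_concat(ctx, a, b);             // always stacked along dim 2
        struct ggml_tensor * c = ggml_concat(ctx, a, b, 2);  // ne = {64, 32, 16, 1}

        // any axis works now, as long as the other dims match
        struct ggml_tensor * d = ggml_concat(ctx, a, a, 0);  // ne = {128, 32, 4, 1}

        (void) c; (void) d;
    }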
@@ -5013,6 +5039,7 @@ struct ggml_tensor * ggml_leaky_relu(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
     ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
 
     result->op = GGML_OP_LEAKY_RELU;
@@ -6378,6 +6405,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     );
 }
 
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        float                 base,
+        bool                  down) {
+    return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
+}
+
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
@@ -10967,26 +11004,29 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb00 == sizeof(float));
     GGML_ASSERT(nb10 == sizeof(float));
 
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-theading
     for (int i3 = 0; i3 < ne3; i3++) {
         for (int i2 = ith; i2 < ne2; i2 += nth) {
-            if (i2 < ne02) { // src0
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
-                    }
-                }
-            } // src1
-            else {
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
                     }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
                 }
             }
         }
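The gather above reads as the following scalar helper (a restatement for clarity, not code from the diff): an output coordinate is served by src0 while it lies inside src0's extent, otherwise by src1 shifted back by src0's size along the concat dimension, which is the only non-zero entry of o[]:

    #include "ggml.h"

    // Scalar restatement of the loop body above (F32 case).
    static float concat_pick(const struct ggml_tensor * src0,
                             const struct ggml_tensor * src1,
                             const int64_t o[4],
                             int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        if (i0 < src0->ne[0] && i1 < src0->ne[1] && i2 < src0->ne[2] && i3 < src0->ne[3]) {
            return *(const float *) ((const char *) src0->data +
                i0*src0->nb[0] + i1*src0->nb[1] + i2*src0->nb[2] + i3*src0->nb[3]);
        }
        return *(const float *) ((const char *) src1->data +
            (i0 - o[0])*src1->nb[0] + (i1 - o[1])*src1->nb[1] +
            (i2 - o[2])*src1->nb[2] + (i3 - o[3])*src1->nb[3]);
    }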
@@ -10994,8 +11034,8 @@ static void ggml_compute_forward_concat_f32(
 }
 
 static void ggml_compute_forward_concat(
-    const struct ggml_compute_params* params,
-    struct ggml_tensor* dst) {
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
 
@@ -11388,8 +11428,8 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11451,8 +11491,8 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11514,8 +11554,8 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11626,9 +11666,9 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_are_same_shape(src0, grad));
 
@@ -14326,7 +14366,7 @@ static void ggml_compute_forward_rope_f32(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14375,7 +14415,7 @@ static void ggml_compute_forward_rope_f32(
                 const float cos_block_theta = cosf(block_theta);
                 const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                theta_base *= theta_scale;
+                theta_base  *= theta_scale;
                 block_theta *= theta_scale;
 
                 const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14410,29 +14450,22 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                 }
             } else {
-                // TODO: this might be wrong for ne0 != n_dims - need double check
-                // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                theta_base *= freq_scale;
+                // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                 for (int64_t ic = 0; ic < ne0; ic += 2) {
                     if (ic < n_dims) {
-                        const int64_t ib = 0;
+                        const int64_t i0 = ic/2;
 
-                        // simplified from `(ib * n_dims + ic) * inv_ndims`
-                        float cur_rot = inv_ndims * ic - ib;
-                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                        const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                            theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
-                        sin_theta *= sin_sign;
 
+                        sin_theta *= sin_sign;
                         theta_base *= theta_scale;
 
-                        const int64_t i0 = ib*n_dims + ic/2;
-
                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
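With inv_ndims and cur_rot gone, the angle passed to rope_yarn() for rotation pair ic is the usual geometric progression, and ic itself serves as the corrected-dimension argument. A scalar restatement, illustrative only and assuming p is the value theta_base starts from for the row:

    #include <math.h>
    #include <stdint.h>

    // theta for pair ic = 0, 2, ..., n_dims-2 at position p; the loop above
    // computes this incrementally via theta_base *= theta_scale, with
    // theta_scale = powf(freq_base, -2.0f/n_dims).
    static float rope_pair_theta(float p, int64_t ic, int n_dims,
                                 float freq_base, float freq_factor) {
        return p * powf(freq_base, -(float) ic / n_dims) / freq_factor;
    }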
@@ -14511,7 +14544,7 @@ static void ggml_compute_forward_rope_f16(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14560,7 +14593,7 @@ static void ggml_compute_forward_rope_f16(
                 const float cos_block_theta = cosf(block_theta);
                 const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                theta_base *= theta_scale;
+                theta_base  *= theta_scale;
                 block_theta *= theta_scale;
 
                 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14591,29 +14624,22 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                 }
             } else {
-                // TODO: this might be wrong for ne0 != n_dims - need double check
-                // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                theta_base *= freq_scale;
+                // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                 for (int64_t ic = 0; ic < ne0; ic += 2) {
                     if (ic < n_dims) {
-                        const int64_t ib = 0;
+                        const int64_t i0 = ic/2;
 
-                        // simplified from `(ib * n_dims + ic) * inv_ndims`
-                        float cur_rot = inv_ndims * ic - ib;
-                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                        const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                            theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
-                        sin_theta *= sin_sign;
 
+                        sin_theta *= sin_sign;
                         theta_base *= theta_scale;
 
-                        const int64_t i0 = ib*n_dims + ic/2;
-
                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -22742,6 +22768,16 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_sve(void) {
+#if defined(__ARM_FEATURE_SVE)
+    // TODO: Currently, SVE 256 bit is only supported.
+    GGML_ASSERT(svcntb() == QK8_0);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_arm_fma(void) {
 #if defined(__ARM_FEATURE_FMA)
     return 1;
@@ -22830,6 +22866,14 @@ int ggml_cpu_has_sycl(void) {
 #endif
 }
 
+int ggml_cpu_has_rpc(void) {
+#if defined(GGML_USE_RPC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
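Both new probes follow the existing ggml_cpu_has_* pattern; a trivial, hypothetical usage sketch (not from the diff):

    #include <stdio.h>
    #include "ggml.h"

    // e.g. extend a startup capability report with the new entries
    static void print_new_caps(void) {
        printf("SVE = %d | RPC = %d\n", ggml_cpu_has_sve(), ggml_cpu_has_rpc());
    }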
@@ -756,7 +756,6 @@ extern "C" {
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
     GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_empty    (const struct ggml_tensor * tensor);
     GGML_API           bool ggml_is_scalar   (const struct ggml_tensor * tensor);
@@ -765,6 +764,11 @@ extern "C" {
     GGML_API           bool ggml_is_3d       (const struct ggml_tensor * tensor);
     GGML_API           int  ggml_n_dims      (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1007,12 +1011,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // concat a and b on dim 2
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   dim);
 
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
@@ -1547,6 +1552,14 @@ extern "C" {
                         float                 beta_slow),
         "use ggml_rope_ext_inplace instead");
 
+    struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        float                 base,
+        bool                  down);
+
     // compute correction dims for YaRN RoPE scaling
     GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
@@ -2404,6 +2417,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma        (void);
     GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_sve        (void);
     GGML_API int ggml_cpu_has_arm_fma    (void);
     GGML_API int ggml_cpu_has_metal      (void);
     GGML_API int ggml_cpu_has_f16c       (void);
@@ -2418,6 +2432,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
     GGML_API int ggml_cpu_has_sycl       (void);
+    GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);