llama_cpp 0.15.3 → 0.15.4

This diff covers the publicly available content of the two package versions as published to their public registry, and is provided for informational purposes only.
@@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
+    ggml_vk_instance_init();
+
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif
@@ -60,6 +60,9 @@
 
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
 
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
@@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
 
 typedef HANDLE pthread_t;
 
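
The additions above back-fill the C11 atomic_flag primitives on the Win32 shim using InterlockedExchange. A minimal illustration of the intended semantics (not part of the diff; the variable name flag is made up): test-and-set returns the previous value, so a zero return means the caller has just acquired the flag, and clear releases it.

    static atomic_flag flag = ATOMIC_FLAG_INIT;

    // InterlockedExchange returns the old value: 0 on first acquisition,
    // non-zero while some other caller still holds the flag.
    if (!atomic_flag_test_and_set(&flag)) {
        // ... exclusive work ...
        atomic_flag_clear(&flag);   // release so the next caller can acquire it
    }
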
@@ -1567,11 +1576,11 @@ do { \
 
 // F16 arithmetic is not supported by AVX, so we use F32 instead
 
-#define GGML_F32Cx8 __m256
+#define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
-static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
@@ -1580,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
 
     return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
     float arr[8];
 
     __lasx_xvst(y, arr, 0);
 
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
         x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1662,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4
 
-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1673,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
     return __lsx_vld(tmp, 0);
 }
 
-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     __lsx_vst(y, arr, 0);
@@ -2306,32 +2316,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
   const __m512 r = _mm512_set1_ps(0x1.8p23f);
   const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
   const __m512 n = _mm512_sub_ps(z, r);
-  const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
-                                    _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
-  const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
-  const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
-  const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
-  const __m512 u = _mm512_mul_ps(b, b);
-  const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-                                                                   _mm512_set1_ps(0x1.573e2ep-5f)), u,
-                                                   _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-                                                                   _mm512_set1_ps(0x1.fffdb6p-2f))),
-                                   u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
-  if (_mm512_kortestz(c, c))
-    return _mm512_fmadd_ps(j, k, k);
-  const __m512i g = _mm512_and_si512(
-      _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
-      _mm512_set1_epi32(0x82000000u));
-  const __m512 s1 =
-      _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
-  const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
   const __mmask16 d =
       _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-  return _mm512_mask_blend_ps(
-      d, _mm512_mask_blend_ps(
-             c, _mm512_fmadd_ps(k, j, k),
-             _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
-      _mm512_mul_ps(s1, s1));
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
 }
 
 // computes silu x/(1+exp(-x)) in single precision vector
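
For context on the arithmetic above: 0x1.715476p+0 is log2(e) and 0x1.62e4p-1 / 0x1.7f7d1cp-20 are a two-part split of ln(2), so the code performs the usual range reduction

    exp(x) = 2^n * exp(b),  with n = round(x * log2(e)) and b = x - n * ln(2).

The polynomial j approximates exp(b), and _mm512_scalef_ps(j, n) applies the 2^n scaling in a single instruction, replacing the earlier manual exponent-bit construction; lanes with |n| > 192 are blended to 0 (n <= 0) or INFINITY (n > 0) to handle underflow and overflow.
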
@@ -2883,24 +2888,20 @@ struct ggml_state {
 
 // global state
 static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
 
 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
     }
 }
 
 // TODO: make this somehow automatically executed
 // some sort of "sentry" mechanism
 inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
+    atomic_flag_clear(&g_state_critical);
 }
 
 #if defined(__gnu_linux__)
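
Caller-side the pattern is unchanged; only the lock implementation moved from a counter-based barrier to a test-and-set flag. A minimal sketch of how the pair brackets access to the shared state (the wrapper function name is hypothetical; only ggml_critical_section_start/end come from the code above):

    static void example_update_shared_state(void) {  // hypothetical helper
        ggml_critical_section_start();  // spins until g_state_critical is acquired
        // ... read or modify g_state; exactly one thread runs this at a time ...
        ggml_critical_section_end();    // clears the flag so the next waiter can enter
    }
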
@@ -3216,7 +3217,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -3225,6 +3230,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
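
Reading the stride checks: the _N suffix means "contiguous for dims >= N". ggml_is_contiguous_0 is full contiguity; ggml_is_contiguous_1 leaves nb[1] unconstrained, so (to take a made-up example) an F32 tensor with ne = [64, 8, 1, 1] and a padded row stride nb[1] = 80*sizeof(float), with nb[2] and nb[3] derived from it as usual, fails ggml_is_contiguous() but passes ggml_is_contiguous_1(); ggml_is_contiguous_2 additionally leaves nb[2] unconstrained.
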
@@ -4882,10 +4895,21 @@ struct ggml_tensor * ggml_repeat_back(
 // ggml_concat
 
 struct ggml_tensor * ggml_concat(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b) {
-    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int dim) {
+    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+
+    int64_t ne[GGML_MAX_DIMS];
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d == dim) {
+            ne[d] = a->ne[d] + b->ne[d];
+            continue;
+        }
+        GGML_ASSERT(a->ne[d] == b->ne[d]);
+        ne[d] = a->ne[d];
+    }
 
     bool is_node = false;
 
@@ -4893,7 +4917,9 @@ struct ggml_tensor * ggml_concat(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    ggml_set_op_params_i32(result, 0, dim);
 
     result->op = GGML_OP_CONCAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
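
A usage sketch of the extended operator (illustrative only; the context size and tensor shapes are invented): every dimension except dim must match, and the result's extent along dim is the sum of the inputs'.

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64,  8);  // ne = [64,  8, 1, 1]
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 24);  // ne = [64, 24, 1, 1]

    struct ggml_tensor * c = ggml_concat(ctx, a, b, 1);                       // ne = [64, 32, 1, 1]

    ggml_free(ctx);
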
@@ -5013,6 +5039,7 @@ struct ggml_tensor * ggml_leaky_relu(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
     ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
 
     result->op = GGML_OP_LEAKY_RELU;
@@ -6378,6 +6405,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     );
 }
 
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int n_dims,
+        float base,
+        bool down) {
+    return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
+}
+
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
@@ -10967,26 +11004,29 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb00 == sizeof(float));
     GGML_ASSERT(nb10 == sizeof(float));
 
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-theading
     for (int i3 = 0; i3 < ne3; i3++) {
         for (int i2 = ith; i2 < ne2; i2 += nth) {
-            if (i2 < ne02) { // src0
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
-                    }
-                }
-            } // src1
-            else {
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
                     }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
                 }
             }
         }
@@ -10994,8 +11034,8 @@ static void ggml_compute_forward_concat_f32(
 }
 
 static void ggml_compute_forward_concat(
-        const struct ggml_compute_params* params,
-        struct ggml_tensor* dst) {
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
 
@@ -11388,8 +11428,8 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11451,8 +11491,8 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11514,8 +11554,8 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11626,9 +11666,9 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_are_same_shape(src0, grad));
 
@@ -14326,7 +14366,7 @@ static void ggml_compute_forward_rope_f32(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14375,7 +14415,7 @@ static void ggml_compute_forward_rope_f32(
                         const float cos_block_theta = cosf(block_theta);
                         const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                        theta_base *= theta_scale;
+                        theta_base *= theta_scale;
                         block_theta *= theta_scale;
 
                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14410,29 +14450,22 @@ static void ggml_compute_forward_rope_f32(
                         dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                     }
                 } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
+                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                     for (int64_t ic = 0; ic < ne0; ic += 2) {
                         if (ic < n_dims) {
-                            const int64_t ib = 0;
+                            const int64_t i0 = ic/2;
 
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-                            float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                             float cos_theta, sin_theta;
                             rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
-                            sin_theta *= sin_sign;
 
+                            sin_theta *= sin_sign;
                             theta_base *= theta_scale;
 
-                            const int64_t i0 = ib*n_dims + ic/2;
-
                             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                             float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -14511,7 +14544,7 @@ static void ggml_compute_forward_rope_f16(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14560,7 +14593,7 @@ static void ggml_compute_forward_rope_f16(
                         const float cos_block_theta = cosf(block_theta);
                         const float sin_block_theta = sinf(block_theta) * sin_sign;
 
-                        theta_base *= theta_scale;
+                        theta_base *= theta_scale;
                         block_theta *= theta_scale;
 
                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -14591,29 +14624,22 @@ static void ggml_compute_forward_rope_f16(
                         dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
                 } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
+                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                     for (int64_t ic = 0; ic < ne0; ic += 2) {
                         if (ic < n_dims) {
-                            const int64_t ib = 0;
+                            const int64_t i0 = ic/2;
 
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-                            float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                             float cos_theta, sin_theta;
                             rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
-                            sin_theta *= sin_sign;
 
+                            sin_theta *= sin_sign;
                             theta_base *= theta_scale;
 
-                            const int64_t i0 = ib*n_dims + ic/2;
-
                             const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                             ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -22742,6 +22768,16 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_sve(void) {
+#if defined(__ARM_FEATURE_SVE)
+    // TODO: Currently, SVE 256 bit is only supported.
+    GGML_ASSERT(svcntb() == QK8_0);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_arm_fma(void) {
 #if defined(__ARM_FEATURE_FMA)
     return 1;
@@ -22830,6 +22866,14 @@ int ggml_cpu_has_sycl(void) {
 #endif
 }
 
+int ggml_cpu_has_rpc(void) {
+#if defined(GGML_USE_RPC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
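
The new capability queries report like the existing ones; a trivial caller-side illustration (the printf formatting is invented, not taken from the library):

    printf("SVE = %d | RPC = %d | NEON = %d | FMA = %d\n",
           ggml_cpu_has_sve(), ggml_cpu_has_rpc(), ggml_cpu_has_neon(), ggml_cpu_has_fma());
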
@@ -756,7 +756,6 @@ extern "C" {
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
     GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_empty     (const struct ggml_tensor * tensor);
     GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
@@ -765,6 +764,11 @@ extern "C" {
     GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1007,12 +1011,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // concat a and b on dim 2
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   dim);
 
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
@@ -1547,6 +1552,14 @@ extern "C" {
                         float                 beta_slow),
         "use ggml_rope_ext_inplace instead");
 
+    struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
     // compute correction dims for YaRN RoPE scaling
     GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
@@ -2404,6 +2417,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma        (void);
     GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_sve        (void);
     GGML_API int ggml_cpu_has_arm_fma    (void);
     GGML_API int ggml_cpu_has_metal      (void);
     GGML_API int ggml_cpu_has_f16c       (void);
@@ -2418,6 +2432,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
     GGML_API int ggml_cpu_has_sycl       (void);
+    GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);