@fugood/llama.node 1.3.2 → 1.3.4

Files changed (37)
  1. package/CMakeLists.txt +8 -3
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +5 -5
  4. package/src/LlamaCompletionWorker.cpp +33 -33
  5. package/src/LlamaContext.cpp +17 -16
  6. package/src/llama.cpp/CMakeLists.txt +4 -0
  7. package/src/llama.cpp/common/CMakeLists.txt +6 -37
  8. package/src/llama.cpp/common/common.cpp +1 -5
  9. package/src/llama.cpp/common/download.cpp +47 -29
  10. package/src/llama.cpp/common/log.cpp +6 -0
  11. package/src/llama.cpp/common/log.h +2 -0
  12. package/src/llama.cpp/ggml/include/ggml.h +71 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
  22. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
  26. package/src/llama.cpp/src/CMakeLists.txt +6 -0
  27. package/src/llama.cpp/src/llama-arch.cpp +32 -0
  28. package/src/llama.cpp/src/llama-arch.h +2 -0
  29. package/src/llama.cpp/src/llama-graph.cpp +2 -1
  30. package/src/llama.cpp/src/llama-model.cpp +102 -0
  31. package/src/llama.cpp/src/llama-model.h +2 -0
  32. package/src/llama.cpp/src/llama-sampling.cpp +10 -5
  33. package/src/llama.cpp/src/llama-vocab.cpp +16 -1
  34. package/src/llama.cpp/src/llama-vocab.h +1 -0
  35. package/src/llama.cpp/src/models/afmoe.cpp +187 -0
  36. package/src/llama.cpp/src/models/models.h +4 -0
  37. package/src/llama.cpp/src/unicode.cpp +77 -0
@@ -7,8 +7,10 @@
  #include "unary-ops.h"
  #include "vec.h"
 
- #include <float.h>
+ #include <cfloat>
  #include <algorithm>
+ #include <cmath>
+ #include <functional>
 
  // ggml_compute_forward_dup
 
@@ -1394,6 +1396,56 @@ void ggml_compute_forward_sum(
      }
  }
 
+ // ggml_compute_forward_cumsum
+
+ static void ggml_compute_forward_cumsum_f32(
+         const ggml_compute_params * params,
+         ggml_tensor * dst) {
+
+     const ggml_tensor * src0 = dst->src[0];
+
+     GGML_ASSERT(src0->nb[0] == sizeof(float));
+     GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+     GGML_TENSOR_UNARY_OP_LOCALS
+
+     GGML_ASSERT(ne0 == ne00);
+     GGML_ASSERT(ne1 == ne01);
+     GGML_ASSERT(ne2 == ne02);
+     GGML_ASSERT(ne3 == ne03);
+
+     const auto [ir0, ir1] = get_thread_range(params, src0);
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne02*ne01);
+         const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+         const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+         float * src_row = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+         float * dst_row = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+         ggml_vec_cumsum_f32(ne00, dst_row, src_row);
+     }
+ }
+
+ void ggml_compute_forward_cumsum(
+         const ggml_compute_params * params,
+         ggml_tensor * dst) {
+
+     const ggml_tensor * src0 = dst->src[0];
+
+     switch (src0->type) {
+         case GGML_TYPE_F32:
+             {
+                 ggml_compute_forward_cumsum_f32(params, dst);
+             } break;
+         default:
+             {
+                 GGML_ABORT("fatal error");
+             }
+     }
+ }
+
  // ggml_compute_forward_sum_rows
 
  static void ggml_compute_forward_sum_rows_f32(
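Note on this hunk: it adds the CPU forward pass for the new cumulative-sum op. Rows are split across threads via get_thread_range, and each row is delegated to ggml_vec_cumsum_f32 (from the vec.h changes in this release). A hypothetical scalar reference for what that per-row kernel computes, an inclusive prefix sum; the real version may be vectorized:

    // Hypothetical scalar reference for ggml_vec_cumsum_f32:
    // dst[i] = src[0] + src[1] + ... + src[i] (inclusive prefix sum).
    static void vec_cumsum_f32_ref(const int n, float * dst, const float * src) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += src[i]; // src[i] is read before dst[i] is written, so in-place use is safe
            dst[i] = sum;
        }
    }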
@@ -2140,6 +2192,83 @@ static void ggml_compute_forward_gelu(
      }
  }
 
+ // ggml_compute_fill
+
+ static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+     const float c = ggml_get_op_params_f32(dst, 0);
+
+     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+     GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
+
+     const auto [ir0, ir1] = get_thread_range(params, dst);
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne2*ne1);
+         const int64_t i02 = (ir - i03*ne2*ne1)/ne1;
+         const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1);
+
+         float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
+
+         ggml_vec_set_f32(ne0, dst_ptr, c);
+     }
+ }
+
+ void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) {
+     ggml_compute_forward_fill_f32(params, dst);
+ }
+
+ // ggml_compute_tri
+
+ static void ggml_compute_forward_tri_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+
+     const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
+
+     GGML_ASSERT(ggml_is_contiguous(src0));
+
+     GGML_TENSOR_UNARY_OP_LOCALS
+
+     const auto [ir0, ir1] = get_thread_range(params, src0);
+
+     bool (*bipred)(int, int);
+
+     switch (ttype) {
+         case GGML_TRI_TYPE_LOWER:      bipred = [](int i, int r) { return i <  r; }; break;
+         case GGML_TRI_TYPE_LOWER_DIAG: bipred = [](int i, int r) { return i <= r; }; break;
+         case GGML_TRI_TYPE_UPPER:      bipred = [](int i, int r) { return i >  r; }; break;
+         case GGML_TRI_TYPE_UPPER_DIAG: bipred = [](int i, int r) { return i >= r; }; break;
+         default: GGML_ABORT("invalid tri type");
+     }
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne02*ne01);
+         const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+         const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+         const float * src_ptr = (const float *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+         float       * dst_ptr = (      float *) ((      char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1);
+
+         for (int i0 = 0; i0 < ne0; ++i0) {
+             dst_ptr[i0] = bipred(i0, i01) ? src_ptr[i0] : 0.0f;
+         }
+     }
+ }
+
+ void ggml_compute_forward_tri(const ggml_compute_params * params, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+
+     switch (src0->type) {
+         case GGML_TYPE_F32:
+             {
+                 ggml_compute_forward_tri_f32(params, dst);
+             } break;
+         default:
+             {
+                 GGML_ABORT("fatal error");
+             }
+     }
+ }
+
  // ggml_compute_forward_gelu_erf
 
  static void ggml_compute_forward_gelu_erf_f32(
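Note on this hunk: fill writes one constant into every row via ggml_vec_set_f32, and tri masks a tensor against a triangular region, selecting the comparison predicate once per tensor rather than per element. The four ggml_tri_type variants differ only in how column index i0 compares with row index i01. A standalone illustration of the GGML_TRI_TYPE_LOWER_DIAG predicate (i <= r) applied to a 3x3 all-ones input:

    #include <cstdio>

    // Prints the mask LOWER_DIAG keeps for a 3x3 input:
    //   1 0 0
    //   1 1 0
    //   1 1 1
    int main() {
        const int n = 3;
        for (int r = 0; r < n; ++r) {
            for (int i = 0; i < n; ++i) {
                std::printf("%d ", i <= r ? 1 : 0);
            }
            std::printf("\n");
        }
        return 0;
    }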
@@ -5503,194 +5632,28 @@ static void ggml_mrope_cache_init(
      }
  }
 
- static void ggml_compute_forward_rope_f32(
-         const ggml_compute_params * params,
-         ggml_tensor * dst,
-         const bool forward) {
-
-     const ggml_tensor * src0 = dst->src[0];
-     const ggml_tensor * src1 = dst->src[1];
-     const ggml_tensor * src2 = dst->src[2];
-
-     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-     int sections[4];
-
-     //const int n_past = ((int32_t *) dst->op_params)[0];
-     const int n_dims = ((int32_t *) dst->op_params)[1];
-     const int mode = ((int32_t *) dst->op_params)[2];
-     //const int n_ctx = ((int32_t *) dst->op_params)[3];
-     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-
-     memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
-     memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
-     memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
-     memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
-     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
-     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
-     memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
-
-     GGML_TENSOR_UNARY_OP_LOCALS
-
-     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-     GGML_ASSERT(nb00 == sizeof(float));
-
-     const int ith = params->ith;
-     const int nth = params->nth;
-
-     const int nr = ggml_nrows(dst);
-
-     GGML_ASSERT(n_dims <= ne0);
-     GGML_ASSERT(n_dims % 2 == 0);
-
-     // rows per thread
-     const int dr = (nr + nth - 1)/nth;
-
-     // row range for this thread
-     const int ir0 = dr*ith;
-     const int ir1 = MIN(ir0 + dr, nr);
-
-     // row index used to determine which thread to use
-     int ir = 0;
-
-     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-     float corr_dims[2];
-     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
-     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
-     const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
-     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
-
-     if (is_mrope) {
-         GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
-     }
-
-     if (is_vision) {
-         GGML_ASSERT(n_dims == ne0/2);
-     }
-
-     const float * freq_factors = NULL;
-     if (src2 != NULL) {
-         GGML_ASSERT(src2->type == GGML_TYPE_F32);
-         GGML_ASSERT(src2->ne[0] >= n_dims / 2);
-         freq_factors = (const float *) src2->data;
-     }
-
-     // backward process uses inverse rotation by cos and sin.
-     // cos and sin build a rotation matrix, where the inverse is the transpose.
-     // this essentially just switches the sign of sin.
-     const float sin_sign = forward ? 1.0f : -1.0f;
-
-     const int32_t * pos = (const int32_t *) src1->data;
-
-     for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
-         for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
-
-             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-             if (!is_mrope) {
-                 const int64_t p = pos[i2];
-                 ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-             }
-             else {
-                 const int64_t p_t = pos[i2];
-                 const int64_t p_h = pos[i2 + ne2];
-                 const int64_t p_w = pos[i2 + ne2 * 2];
-                 const int64_t p_e = pos[i2 + ne2 * 3];
-                 ggml_mrope_cache_init(
-                     p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
-                     freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-             }
-
-             for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
-                 if (ir++ < ir0) continue;
-                 if (ir > ir1) break;
-
-                 if (is_neox || is_mrope) {
-                     if (is_vision){
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = src[0];
-                             const float x1 = src[n_dims];
-
-                             dst_data[0] = x0*cos_theta - x1*sin_theta;
-                             dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
-                         }
-                     } else {
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = src[0];
-                             const float x1 = src[n_dims/2];
-
-                             dst_data[0] = x0*cos_theta - x1*sin_theta;
-                             dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                         }
-                     }
-                 } else {
-                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
-
-                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                         const float x0 = src[0];
-                         const float x1 = src[1];
 
-                         dst_data[0] = x0*cos_theta - x1*sin_theta;
-                         dst_data[1] = x0*sin_theta + x1*cos_theta;
-                     }
-                 }
-
-                 if (is_vision) {
-                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const int64_t ic = i0/2;
-
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
+ template<typename T>
+ static void rotate_pairs(const int64_t n, const int64_t n_offset, const float * cache, const T * src_data, T * dst_data, const int scale = 2) {
+     for (int64_t i0 = 0; i0 < n; i0 += 2) {
+         const int64_t ic = i0/scale; // hack for GGML_ROPE_TYPE_NORMAL, where we need ic = i0; for all other cases, ic = i0/2
 
-                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+         const float cos_theta = cache[i0 + 0];
+         const float sin_theta = cache[i0 + 1];
 
-                         const float x0 = src[0];
-                         const float x1 = src[n_dims];
+         const T * const src = src_data + ic;
+         T       *       dst = dst_data + ic;
 
-                         dst_data[0] = x0*cos_theta - x1*sin_theta;
-                         dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
-                     }
-                 } else {
-                     // fill the remain channels with data from src tensor
-                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+         const float x0 = type_conversion_table<T>::to_f32(src[0]);
+         const float x1 = type_conversion_table<T>::to_f32(src[n_offset]);
 
-                         dst_data[0] = src[0];
-                         dst_data[1] = src[1];
-                     }
-                 }
-             }
-         }
-     }
+         dst[0]        = type_conversion_table<T>::from_f32(x0*cos_theta - x1*sin_theta);
+         dst[n_offset] = type_conversion_table<T>::from_f32(x0*sin_theta + x1*cos_theta);
+     }
  }
 
- // TODO: deduplicate f16/f32 code
- static void ggml_compute_forward_rope_f16(
+ template<typename T> // float or ggml_fp16_t
+ static void ggml_compute_forward_rope_flt(
          const ggml_compute_params * params,
          ggml_tensor * dst,
          const bool forward) {
@@ -5699,6 +5662,9 @@ static void ggml_compute_forward_rope_f16(
      const ggml_tensor * src1 = dst->src[1];
      const ggml_tensor * src2 = dst->src[2];
 
+     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+     GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
      float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
      int sections[4];
 
@@ -5707,6 +5673,7 @@
      const int mode = ((int32_t *) dst->op_params)[2];
      //const int n_ctx = ((int32_t *) dst->op_params)[3];
      const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
      memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
      memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
      memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@@ -5715,13 +5682,13 @@
      memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
      memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
 
-
      GGML_TENSOR_UNARY_OP_LOCALS
 
      //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
      //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
 
-     GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
+     GGML_ASSERT(nb0 == nb00);
+     GGML_ASSERT(nb0 == sizeof(T));
 
      const int ith = params->ith;
      const int nth = params->nth;
@@ -5746,12 +5713,11 @@
      float corr_dims[2];
      ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
-     const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
+     const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
+     const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope
      const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
-     if (is_mrope) {
+     if (mrope_used) {
          GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
      }
 
@@ -5773,11 +5739,11 @@
 
      const int32_t * pos = (const int32_t *) src1->data;
 
-     for (int64_t i3 = 0; i3 < ne3; i3++) {
-         for (int64_t i2 = 0; i2 < ne2; i2++) {
+     for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
+         for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
 
              float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-             if (!is_mrope) {
+             if (!mrope_used) {
                  const int64_t p = pos[i2];
                  ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
              }
@@ -5791,86 +5757,40 @@
                      freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
              }
 
-             for (int64_t i1 = 0; i1 < ne1; i1++) {
+             for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
                  if (ir++ < ir0) continue;
                  if (ir > ir1) break;
 
-                 if (is_neox || is_mrope) {
-                     if (is_vision) {
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                             const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
-
-                             dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                             dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                         }
-                     } else {
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                             const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]);
-
-                             dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                             dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                         }
-                     }
-                 } else {
-                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
-
-                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                         const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                         const float x1 = GGML_CPU_FP16_TO_FP32(src[1]);
-
-                         dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                         dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                     }
+                 T * src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                 T * dst_data = (T *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+
+                 switch (mode) {
+                     case GGML_ROPE_TYPE_NORMAL:
+                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
+                         break;
+                     case GGML_ROPE_TYPE_NEOX:
+                     case GGML_ROPE_TYPE_MROPE:
+                     case GGML_ROPE_TYPE_IMROPE:
+                         rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
+                         break;
+                     case GGML_ROPE_TYPE_VISION:
+                         rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
+                         break;
+                     default:
+                         GGML_ABORT("rope type not supported");
                  }
 
-                 if (is_vision) {
-                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const int64_t ic = i0/2;
-
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
-
-                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                         const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                         const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
-
-                         dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                         dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                     }
-                 } else {
+                 if (!is_vision) {
+                     // fill the remain channels with data from src tensor
                      for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                         const T * const src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                         T * dst_data = (T *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                          dst_data[0] = src[0];
                          dst_data[1] = src[1];
                      }
                  }
-             }
+             } //attn-heads
          }
      }
  }
@@ -5884,11 +5804,11 @@ void ggml_compute_forward_rope(
      switch (src0->type) {
          case GGML_TYPE_F16:
              {
-                 ggml_compute_forward_rope_f16(params, dst, true);
+                 ggml_compute_forward_rope_flt<ggml_fp16_t>(params, dst, true);
              } break;
          case GGML_TYPE_F32:
              {
-                 ggml_compute_forward_rope_f32(params, dst, true);
+                 ggml_compute_forward_rope_flt<float>(params, dst, true);
              } break;
          default:
              {
@@ -5908,11 +5828,11 @@ void ggml_compute_forward_rope_back(
      switch (src0->type) {
          case GGML_TYPE_F16:
              {
-                 ggml_compute_forward_rope_f16(params, dst, false);
+                 ggml_compute_forward_rope_flt<ggml_fp16_t>(params, dst, false);
             } break;
          case GGML_TYPE_F32:
              {
-                 ggml_compute_forward_rope_f32(params, dst, false);
+                 ggml_compute_forward_rope_flt<float>(params, dst, false);
              } break;
          default:
              {
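Note on the RoPE hunks above: the duplicated f32/f16 implementations merge into a single ggml_compute_forward_rope_flt<T>, with the inner rotation factored into rotate_pairs<T>(n, n_offset, ...). Every mode rotates pairs (x0, x1) that sit n_offset elements apart using the precomputed cos/sin cache: NORMAL pairs adjacent elements (n_offset = 1, scale = 1 so ic = i0), NEOX/MROPE/IMROPE pair element i with i + n_dims/2, and VISION pairs i with i + n_dims across the full ne0. A hedged scalar sketch of that rotation (hypothetical in-place f32 variant, illustration only):

    // Rotate (x0, x1) pairs by the cached angles; x1 sits n_offset floats after x0.
    static void rotate_pairs_f32_ref(int64_t n, int64_t n_offset,
                                     const float * cache, float * x, int scale) {
        for (int64_t i0 = 0; i0 < n; i0 += 2) {
            const int64_t ic = i0 / scale; // scale == 1 for NORMAL mode, 2 otherwise
            const float c = cache[i0 + 0]; // cos(theta) for this pair
            const float s = cache[i0 + 1]; // sin(theta) for this pair
            const float x0 = x[ic];
            const float x1 = x[ic + n_offset];
            x[ic]            = x0 * c - x1 * s;
            x[ic + n_offset] = x0 * s + x1 * c;
        }
    }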
@@ -7873,6 +7793,18 @@ void ggml_compute_forward_timestep_embedding(
 
  // ggml_compute_forward_argsort
 
+ template<enum ggml_sort_order order>
+ struct argsort_cmp {
+     const float * data;
+     bool operator()(int32_t a, int32_t b) const {
+         if constexpr (order == GGML_SORT_ORDER_ASC) {
+             return data[a] < data[b];
+         } else {
+             return data[a] > data[b];
+         }
+     }
+ };
+
  static void ggml_compute_forward_argsort_f32(
          const ggml_compute_params * params,
          ggml_tensor * dst) {
@@ -7891,23 +7823,25 @@
      ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
 
      for (int64_t i = ith; i < nr; i += nth) {
-         int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
          const float * src_data = (float *)((char *) src0->data + i*nb01);
 
+         int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+
          for (int64_t j = 0; j < ne0; j++) {
              dst_data[j] = j;
          }
 
-         // C doesn't have a functional sort, so we do a bubble sort instead
-         for (int64_t j = 0; j < ne0; j++) {
-             for (int64_t k = j + 1; k < ne0; k++) {
-                 if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-                     (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-                     int32_t tmp = dst_data[j];
-                     dst_data[j] = dst_data[k];
-                     dst_data[k] = tmp;
-                 }
-             }
+         switch (order) {
+             case GGML_SORT_ORDER_ASC:
+                 std::sort(dst_data, dst_data + ne0, argsort_cmp<GGML_SORT_ORDER_ASC>{src_data});
+                 break;
+
+             case GGML_SORT_ORDER_DESC:
+                 std::sort(dst_data, dst_data + ne0, argsort_cmp<GGML_SORT_ORDER_DESC>{src_data});
+                 break;
+
+             default:
+                 GGML_ABORT("invalid sort order");
          }
      }
  }
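Note on this hunk: the O(n^2) bubble sort is replaced by std::sort (O(n log n)) with a comparator templated on the sort order, so the ASC/DESC branch is resolved at compile time by if constexpr rather than on every comparison. The same argsort pattern in a self-contained form (illustration, not the kernel above):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const std::vector<float> data = {0.3f, -1.0f, 2.5f, 0.0f};
        std::vector<int32_t> idx(data.size());
        std::iota(idx.begin(), idx.end(), 0); // idx = {0, 1, 2, 3}
        // sort indices so that data[idx[0]] <= data[idx[1]] <= ...
        std::sort(idx.begin(), idx.end(),
                  [&](int32_t a, int32_t b) { return data[a] < data[b]; });
        for (int32_t i : idx) std::printf("%d ", i); // prints: 1 3 0 2
        std::printf("\n");
        return 0;
    }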
@@ -8730,7 +8664,7 @@ static void ggml_compute_forward_ssm_scan_f32(
          // n_head
          for (int h = ih0; h < ih1; ++h) {
              // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-             const float dt_soft_plus = ggml_softplus(dt[h]);
+             const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
              const float dA = expf(dt_soft_plus * A[h]);
              const int g = h / (nh / ng); // repeat_interleave
 
@@ -8827,7 +8761,7 @@ static void ggml_compute_forward_ssm_scan_f32(
          // n_head
          for (int h = ih0; h < ih1; ++h) {
              // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-             const float dt_soft_plus = ggml_softplus(dt[h]);
+             const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
              const int g = h / (nh / ng); // repeat_interleave
 
              // dim
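Note on the two ssm_scan hunks: only the helper name changes, ggml_softplus becomes ggml_compute_softplus_f32, matching the new SOFTPLUS unary op below; softplus(x) = log(1 + e^x). The referenced Mamba kernel computes it with a cutoff for large x, where e^x would overflow and softplus(x) is ~x anyway. A hedged scalar reference (the renamed helper may differ in details):

    #include <cmath>

    // softplus(x) = log1p(exp(x)); for large x exp(x) overflows while softplus(x) ~ x,
    // so fall back to the identity (the Mamba Triton kernel uses cutoff 20).
    static inline float softplus_f32_ref(float x) {
        return (x <= 20.0f) ? std::log1p(std::exp(x)) : x;
    }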
@@ -9110,6 +9044,14 @@ void ggml_compute_forward_unary(
              {
                  ggml_compute_forward_xielu(params, dst);
              } break;
+         case GGML_UNARY_OP_EXPM1:
+             {
+                 ggml_compute_forward_expm1(params, dst);
+             } break;
+         case GGML_UNARY_OP_SOFTPLUS:
+             {
+                 ggml_compute_forward_softplus(params, dst);
+             } break;
          default:
              {
                  GGML_ABORT("fatal error");
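Note on this hunk: the unary dispatcher gains EXPM1 and SOFTPLUS cases (the kernels themselves land in unary-ops.cpp, per the file list). A dedicated expm1 op matters numerically: for small x, expf(x) - 1.0f cancels to zero in float precision while expm1f keeps the leading term:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float x = 1e-8f;
        // expm1 preserves ~1e-8; expf(1e-8f) rounds to 1.0f, so subtracting 1 gives 0.
        std::printf("expm1: %g, exp-1: %g\n", std::expm1(x), std::exp(x) - 1.0f);
        return 0;
    }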
@@ -9706,6 +9648,76 @@ void ggml_compute_forward_gla(
      }
  }
 
+ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
+     const struct ggml_tensor * src0 = dst->src[0]; // A (lower triangular)
+     const struct ggml_tensor * src1 = dst->src[1]; // B (RHS)
+
+     GGML_TENSOR_BINARY_OP_LOCALS;
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(src1->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     GGML_ASSERT(ne00 == ne01); // A must be square
+     GGML_ASSERT(ne0 == ne10); // solution cols == B cols
+     GGML_ASSERT(ne1 == ne11); // solution rows == B rows
+
+     GGML_ASSERT(ne02 == ne12 && ne12 == ne2);
+     GGML_ASSERT(ne03 == ne13 && ne13 == ne3);
+
+     const int ith = params->ith;
+     const int nth = params->nth;
+
+     const int64_t k = ne10; // number of RHS columns
+     const int64_t n = ne11; // A is n×n
+     const int64_t nr = ne02 * ne03 * k; // we're parallelizing on columns here, so seq x token x column will be the unit
+
+     // chunks per thread
+     const int64_t dr = (nr + nth - 1)/nth;
+
+     // chunk range for this thread
+     const int64_t ir0 = dr*ith;
+     const int64_t ir1 = MIN(ir0 + dr, nr);
+
+     const float * A = (const float *) src0->data; // [n, n, B1, B2]
+     const float * B = (const float *) src1->data; // [n, k, B1, B2]
+     float       * X = (      float *) dst->data;  // [n, k, B1, B2]
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne02*k);
+         const int64_t i02 = (ir - i03*ne02*k)/k;
+         const int64_t i01 = (ir - i03*ne02*k - i02*k);
+
+         const float * A_batch = A + i02 * nb02 / sizeof(float) + i03 * nb03 / sizeof(float);
+         const float * B_batch = B + i02 * nb12 / sizeof(float) + i03 * nb13 / sizeof(float);
+
+         float * X_batch = X + i02 * nb2 / sizeof(float) + i03 * nb3 / sizeof(float);
+
+         for (int64_t i00 = 0; i00 < n; ++i00) {
+             float sum = 0.0f;
+             for (int64_t t = 0; t < i00; ++t) {
+                 sum += A_batch[i00 * n + t] * X_batch[i01 * n + t];
+             }
+
+             const float diag = A_batch[i00 * n + i00];
+             GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
+
+             X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag;
+         }
+     }
+ }
+
+ void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const ggml_tensor * src1 = dst->src[1];
+
+     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+         ggml_compute_forward_solve_tri_f32(params, dst);
+     } else {
+         GGML_ABORT("fatal error");
+     }
+ }
+
  // ggml_compute_forward_rwkv_wkv7
 
  static void ggml_compute_forward_rwkv_wkv7_f32(
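Note on this hunk: solve_tri solves A·X = B for lower-triangular A by forward substitution, parallelized per RHS column (batch × column work units). Row i of a column is solved as x_i = (b_i - sum over t < i of a_it * x_t) / a_ii, which is why a zero diagonal is asserted. A self-contained single-column sketch (hypothetical helper, not the batched kernel above):

    #include <cassert>

    // Solve L * x = b for one column, with L lower triangular, row-major n x n.
    static void solve_lower_tri(const float * L, const float * b, float * x, int n) {
        for (int i = 0; i < n; ++i) {
            float sum = 0.0f;
            for (int t = 0; t < i; ++t) {
                sum += L[i * n + t] * x[t]; // uses already-solved entries x[0..i-1]
            }
            assert(L[i * n + i] != 0.0f && "singular triangular matrix");
            x[i] = (b[i] - sum) / L[i * n + i];
        }
    }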