@fugood/llama.node 1.3.1 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CMakeLists.txt +4 -3
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +6 -6
  4. package/src/llama.cpp/CMakeLists.txt +4 -0
  5. package/src/llama.cpp/common/CMakeLists.txt +6 -37
  6. package/src/llama.cpp/common/arg.cpp +7 -0
  7. package/src/llama.cpp/common/common.cpp +1 -5
  8. package/src/llama.cpp/common/common.h +2 -1
  9. package/src/llama.cpp/common/download.cpp +47 -29
  10. package/src/llama.cpp/common/log.cpp +6 -0
  11. package/src/llama.cpp/common/log.h +2 -0
  12. package/src/llama.cpp/ggml/include/ggml.h +71 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +34 -11
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +50 -16
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -317
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -4
  22. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
  23. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
  27. package/src/llama.cpp/src/CMakeLists.txt +6 -0
  28. package/src/llama.cpp/src/llama-arch.cpp +32 -0
  29. package/src/llama.cpp/src/llama-arch.h +2 -0
  30. package/src/llama.cpp/src/llama-graph.cpp +2 -1
  31. package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
  32. package/src/llama.cpp/src/llama-model.cpp +102 -0
  33. package/src/llama.cpp/src/llama-model.h +2 -0
  34. package/src/llama.cpp/src/llama-sampling.cpp +10 -5
  35. package/src/llama.cpp/src/llama-vocab.cpp +16 -1
  36. package/src/llama.cpp/src/llama-vocab.h +1 -0
  37. package/src/llama.cpp/src/models/afmoe.cpp +187 -0
  38. package/src/llama.cpp/src/models/ernie4-5.cpp +4 -5
  39. package/src/llama.cpp/src/models/models.h +4 -0
  40. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +2 -1
  41. package/src/llama.cpp/src/unicode.cpp +77 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -7,8 +7,10 @@
  #include "unary-ops.h"
  #include "vec.h"
 
- #include <float.h>
+ #include <cfloat>
  #include <algorithm>
+ #include <cmath>
+ #include <functional>
 
  // ggml_compute_forward_dup
 
@@ -1394,6 +1396,56 @@ void ggml_compute_forward_sum(
      }
  }
 
+ // ggml_compute_forward_cumsum
+
+ static void ggml_compute_forward_cumsum_f32(
+         const ggml_compute_params * params,
+         ggml_tensor * dst) {
+
+     const ggml_tensor * src0 = dst->src[0];
+
+     GGML_ASSERT(src0->nb[0] == sizeof(float));
+     GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+     GGML_TENSOR_UNARY_OP_LOCALS
+
+     GGML_ASSERT(ne0 == ne00);
+     GGML_ASSERT(ne1 == ne01);
+     GGML_ASSERT(ne2 == ne02);
+     GGML_ASSERT(ne3 == ne03);
+
+     const auto [ir0, ir1] = get_thread_range(params, src0);
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne02*ne01);
+         const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+         const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+         float * src_row = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+         float * dst_row = (float *) ((char *) dst->data  + i01*nb1  + i02*nb2  + i03*nb3);
+
+         ggml_vec_cumsum_f32(ne00, dst_row, src_row);
+     }
+ }
+
+ void ggml_compute_forward_cumsum(
+         const ggml_compute_params * params,
+         ggml_tensor * dst) {
+
+     const ggml_tensor * src0 = dst->src[0];
+
+     switch (src0->type) {
+         case GGML_TYPE_F32:
+             {
+                 ggml_compute_forward_cumsum_f32(params, dst);
+             } break;
+         default:
+             {
+                 GGML_ABORT("fatal error");
+             }
+     }
+ }
+
  // ggml_compute_forward_sum_rows
 
  static void ggml_compute_forward_sum_rows_f32(
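
Context for the hunk above: the new cumsum op flattens the three outer dimensions into a row index ir, splits [ir0, ir1) across threads via get_thread_range, and hands each contiguous row to ggml_vec_cumsum_f32 (added to vec.h/vec.cpp elsewhere in this release, not shown in this excerpt). A minimal scalar sketch of what that row kernel computes — hypothetical name, and the real routine may be vectorized:

    // Scalar reference: y[i] = x[0] + x[1] + ... + x[i] for one row of n floats.
    static void vec_cumsum_f32_ref(const int n, float * y, const float * x) {
        float acc = 0.0f;
        for (int i = 0; i < n; ++i) {
            acc += x[i]; // running sum over the row
            y[i] = acc;
        }
    }
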
@@ -2140,6 +2192,83 @@ static void ggml_compute_forward_gelu(
      }
  }
 
+ // ggml_compute_fill
+
+ static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+     const float c = ggml_get_op_params_f32(dst, 0);
+
+     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+     GGML_TENSOR_LOCALS(size_t,  nb, dst, nb);
+
+     const auto [ir0, ir1] = get_thread_range(params, dst);
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne2*ne1);
+         const int64_t i02 = (ir - i03*ne2*ne1)/ne1;
+         const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1);
+
+         float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
+
+         ggml_vec_set_f32(ne0, dst_ptr, c);
+     }
+ }
+
+ void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) {
+     ggml_compute_forward_fill_f32(params, dst);
+ }
+
+ // ggml_compute_tri
+
+ static void ggml_compute_forward_tri_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+
+     const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
+
+     GGML_ASSERT(ggml_is_contiguous(src0));
+
+     GGML_TENSOR_UNARY_OP_LOCALS
+
+     const auto [ir0, ir1] = get_thread_range(params, src0);
+
+     bool (*bipred)(int, int);
+
+     switch (ttype) {
+         case GGML_TRI_TYPE_LOWER:      bipred = [](int i, int r) { return i <  r; }; break;
+         case GGML_TRI_TYPE_LOWER_DIAG: bipred = [](int i, int r) { return i <= r; }; break;
+         case GGML_TRI_TYPE_UPPER:      bipred = [](int i, int r) { return i >  r; }; break;
+         case GGML_TRI_TYPE_UPPER_DIAG: bipred = [](int i, int r) { return i >= r; }; break;
+         default: GGML_ABORT("invalid tri type");
+     }
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne02*ne01);
+         const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+         const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+         const float * src_ptr = (const float *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+         float       * dst_ptr = (      float *) ((      char *)  dst->data + i03*nb3  + i02*nb2  + i01*nb1);
+
+         for (int i0 = 0; i0 < ne0; ++i0) {
+             dst_ptr[i0] = bipred(i0, i01) ? src_ptr[i0] : 0.0f;
+         }
+     }
+ }
+
+ void ggml_compute_forward_tri(const ggml_compute_params * params, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+
+     switch (src0->type) {
+         case GGML_TYPE_F32:
+             {
+                 ggml_compute_forward_tri_f32(params, dst);
+             } break;
+         default:
+             {
+                 GGML_ABORT("fatal error");
+             }
+     }
+ }
+
  // ggml_compute_forward_gelu_erf
 
  static void ggml_compute_forward_gelu_erf_f32(
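
The tri op above zeroes each element unless a per-type predicate on (column i0, row i01) holds: LOWER keeps i < r, LOWER_DIAG keeps i <= r, UPPER keeps i > r, UPPER_DIAG keeps i >= r. A self-contained sketch of the LOWER_DIAG predicate applied to an all-ones 4x4 input (all names local to the example):

    #include <cstdio>

    // The per-element rule used by ggml_compute_forward_tri_f32, shown standalone.
    // For LOWER_DIAG on an all-ones input, the output is unit lower-triangular.
    int main() {
        const int n = 4;
        for (int r = 0; r < n; ++r) {
            for (int i = 0; i < n; ++i) {
                const float v = (i <= r) ? 1.0f : 0.0f; // bipred for GGML_TRI_TYPE_LOWER_DIAG
                printf("%.0f ", v);
            }
            printf("\n"); // prints rows: 1 0 0 0 / 1 1 0 0 / 1 1 1 0 / 1 1 1 1
        }
        return 0;
    }
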
@@ -4455,46 +4584,6 @@ void ggml_compute_forward_cont(
      ggml_compute_forward_dup(params, dst);
  }
 
- // ggml_compute_forward_reshape
-
- void ggml_compute_forward_reshape(
-         const ggml_compute_params * params,
-         ggml_tensor * dst) {
-     // NOP
-     GGML_UNUSED(params);
-     GGML_UNUSED(dst);
- }
-
- // ggml_compute_forward_view
-
- void ggml_compute_forward_view(
-         const ggml_compute_params * params,
-         ggml_tensor * dst) {
-     // NOP
-     GGML_UNUSED(params);
-     GGML_UNUSED(dst);
- }
-
- // ggml_compute_forward_permute
-
- void ggml_compute_forward_permute(
-         const ggml_compute_params * params,
-         ggml_tensor * dst) {
-     // NOP
-     GGML_UNUSED(params);
-     GGML_UNUSED(dst);
- }
-
- // ggml_compute_forward_transpose
-
- void ggml_compute_forward_transpose(
-         const ggml_compute_params * params,
-         ggml_tensor * dst) {
-     // NOP
-     GGML_UNUSED(params);
-     GGML_UNUSED(dst);
- }
-
  // ggml_compute_forward_get_rows
 
  static void ggml_compute_forward_get_rows_q(
@@ -5543,194 +5632,28 @@ static void ggml_mrope_cache_init(
      }
  }
 
- static void ggml_compute_forward_rope_f32(
-         const ggml_compute_params * params,
-         ggml_tensor * dst,
-         const bool forward) {
-
-     const ggml_tensor * src0 = dst->src[0];
-     const ggml_tensor * src1 = dst->src[1];
-     const ggml_tensor * src2 = dst->src[2];
-
-     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-     int sections[4];
-
-     //const int n_past     = ((int32_t *) dst->op_params)[0];
-     const int n_dims     = ((int32_t *) dst->op_params)[1];
-     const int mode       = ((int32_t *) dst->op_params)[2];
-     //const int n_ctx     = ((int32_t *) dst->op_params)[3];
-     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-
-     memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-     memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-     memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-     memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-     memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
-
-     GGML_TENSOR_UNARY_OP_LOCALS
-
-     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-     GGML_ASSERT(nb00 == sizeof(float));
-
-     const int ith = params->ith;
-     const int nth = params->nth;
-
-     const int nr = ggml_nrows(dst);
-
-     GGML_ASSERT(n_dims <= ne0);
-     GGML_ASSERT(n_dims % 2 == 0);
-
-     // rows per thread
-     const int dr = (nr + nth - 1)/nth;
-
-     // row range for this thread
-     const int ir0 = dr*ith;
-     const int ir1 = MIN(ir0 + dr, nr);
-
-     // row index used to determine which thread to use
-     int ir = 0;
-
-     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-     float corr_dims[2];
-     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
-     const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
-     const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
-     const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
-     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
-
-     if (is_mrope) {
-         GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
-     }
-
-     if (is_vision) {
-         GGML_ASSERT(n_dims == ne0/2);
-     }
-
-     const float * freq_factors = NULL;
-     if (src2 != NULL) {
-         GGML_ASSERT(src2->type == GGML_TYPE_F32);
-         GGML_ASSERT(src2->ne[0] >= n_dims / 2);
-         freq_factors = (const float *) src2->data;
-     }
-
-     // backward process uses inverse rotation by cos and sin.
-     // cos and sin build a rotation matrix, where the inverse is the transpose.
-     // this essentially just switches the sign of sin.
-     const float sin_sign = forward ? 1.0f : -1.0f;
-
-     const int32_t * pos = (const int32_t *) src1->data;
-
-     for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
-         for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
-
-             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-             if (!is_mrope) {
-                 const int64_t p = pos[i2];
-                 ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-             }
-             else {
-                 const int64_t p_t = pos[i2];
-                 const int64_t p_h = pos[i2 + ne2];
-                 const int64_t p_w = pos[i2 + ne2 * 2];
-                 const int64_t p_e = pos[i2 + ne2 * 3];
-                 ggml_mrope_cache_init(
-                     p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
-                     freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-             }
-
-             for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
-                 if (ir++ < ir0) continue;
-                 if (ir   > ir1) break;
-
-                 if (is_neox || is_mrope) {
-                     if (is_vision){
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = src[0];
-                             const float x1 = src[n_dims];
-
-                             dst_data[0]      = x0*cos_theta - x1*sin_theta;
-                             dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
-                         }
-                     } else {
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = src[0];
-                             const float x1 = src[n_dims/2];
-
-                             dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                             dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                         }
-                     }
-                 } else {
-                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
-
-                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                         const float x0 = src[0];
-                         const float x1 = src[1];
-
-                         dst_data[0] = x0*cos_theta - x1*sin_theta;
-                         dst_data[1] = x0*sin_theta + x1*cos_theta;
-                     }
-                 }
 
-                 if (is_vision) {
-                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const int64_t ic = i0/2;
+ template<typename T>
+ static void rotate_pairs(const int64_t n, const int64_t n_offset, const float * cache, const T * src_data, T * dst_data, const int scale = 2) {
+     for (int64_t i0 = 0; i0 < n; i0 += 2) {
+         const int64_t ic = i0/scale; // hack for GGML_ROPE_TYPE_NORMAL, where we need ic = i0; for all other cases, ic = i0/2
 
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
+         const float cos_theta = cache[i0 + 0];
+         const float sin_theta = cache[i0 + 1];
 
-                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+         const T * const src = src_data + ic;
+         T * dst = dst_data + ic;
 
-                         const float x0 = src[0];
-                         const float x1 = src[n_dims];
+         const float x0 = type_conversion_table<T>::to_f32(src[0]);
+         const float x1 = type_conversion_table<T>::to_f32(src[n_offset]);
 
-                         dst_data[0]      = x0*cos_theta - x1*sin_theta;
-                         dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
-                     }
-                 } else {
-                     // fill the remain channels with data from src tensor
-                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                         dst_data[0] = src[0];
-                         dst_data[1] = src[1];
-                     }
-                 }
-             }
-         }
-     }
+         dst[0]        = type_conversion_table<T>::from_f32(x0*cos_theta - x1*sin_theta);
+         dst[n_offset] = type_conversion_table<T>::from_f32(x0*sin_theta + x1*cos_theta);
+     }
  }
 
- // TODO: deduplicate f16/f32 code
- static void ggml_compute_forward_rope_f16(
+ template<typename T> // float or ggml_fp16_t
+ static void ggml_compute_forward_rope_flt(
          const ggml_compute_params * params,
          ggml_tensor * dst,
          const bool forward) {
@@ -5739,6 +5662,9 @@ static void ggml_compute_forward_rope_f16(
      const ggml_tensor * src1 = dst->src[1];
      const ggml_tensor * src2 = dst->src[2];
 
+     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+     GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
      float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
      int sections[4];
 
@@ -5747,6 +5673,7 @@
      const int mode       = ((int32_t *) dst->op_params)[2];
      //const int n_ctx    = ((int32_t *) dst->op_params)[3];
      const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
      memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
      memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
      memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
@@ -5755,13 +5682,13 @@
      memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
      memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
 
-
      GGML_TENSOR_UNARY_OP_LOCALS
 
      //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
      //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
 
-     GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
+     GGML_ASSERT(nb0 == nb00);
+     GGML_ASSERT(nb0 == sizeof(T));
 
      const int ith = params->ith;
      const int nth = params->nth;
@@ -5786,12 +5713,11 @@
      float corr_dims[2];
      ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-     const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
-     const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;
-     const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
+     const bool is_imrope  = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
+     const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;   // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope
      const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
-     if (is_mrope) {
+     if (mrope_used) {
          GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
      }
 
@@ -5813,11 +5739,11 @@
 
      const int32_t * pos = (const int32_t *) src1->data;
 
-     for (int64_t i3 = 0; i3 < ne3; i3++) {
-         for (int64_t i2 = 0; i2 < ne2; i2++) {
+     for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
+         for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
 
              float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-             if (!is_mrope) {
+             if (!mrope_used) {
                  const int64_t p = pos[i2];
                  ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
              }
@@ -5831,86 +5757,40 @@
                      freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
              }
 
-             for (int64_t i1 = 0; i1 < ne1; i1++) {
+             for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
                  if (ir++ < ir0) continue;
                  if (ir   > ir1) break;
 
-                 if (is_neox || is_mrope) {
-                     if (is_vision) {
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                             const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
-
-                             dst_data[0]      = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                             dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                         }
-                     } else {
-                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                             const int64_t ic = i0/2;
-
-                             const float cos_theta = cache[i0 + 0];
-                             const float sin_theta = cache[i0 + 1];
-
-                             const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                             ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                             const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                             const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]);
-
-                             dst_data[0]        = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                             dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                         }
-                     }
-                 } else {
-                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
-
-                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                         const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                         const float x1 = GGML_CPU_FP16_TO_FP32(src[1]);
-
-                         dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                         dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                     }
+                 T * src      = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                 T * dst_data = (T *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1);
+
+                 switch (mode) {
+                     case GGML_ROPE_TYPE_NORMAL:
+                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
+                         break;
+                     case GGML_ROPE_TYPE_NEOX:
+                     case GGML_ROPE_TYPE_MROPE:
+                     case GGML_ROPE_TYPE_IMROPE:
+                         rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
+                         break;
+                     case GGML_ROPE_TYPE_VISION:
+                         rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
+                         break;
+                     default:
+                         GGML_ABORT("rope type not supported");
                  }
 
-                 if (is_vision) {
-                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const int64_t ic = i0/2;
-
-                         const float cos_theta = cache[i0 + 0];
-                         const float sin_theta = cache[i0 + 1];
-
-                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
-
-                         const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
-                         const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
-
-                         dst_data[0]      = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                         dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                     }
-                 } else {
+                 if (!is_vision) {
+                     // fill the remain channels with data from src tensor
                      for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                         const T * const src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                         T * dst_data = (T *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                          dst_data[0] = src[0];
                          dst_data[1] = src[1];
                      }
                  }
-             }
+             } //attn-heads
          }
      }
  }
@@ -5924,11 +5804,11 @@ void ggml_compute_forward_rope(
      switch (src0->type) {
          case GGML_TYPE_F16:
              {
-                 ggml_compute_forward_rope_f16(params, dst, true);
+                 ggml_compute_forward_rope_flt<ggml_fp16_t>(params, dst, true);
              } break;
          case GGML_TYPE_F32:
              {
-                 ggml_compute_forward_rope_f32(params, dst, true);
+                 ggml_compute_forward_rope_flt<float>(params, dst, true);
              } break;
          default:
              {
@@ -5948,11 +5828,11 @@ void ggml_compute_forward_rope_back(
      switch (src0->type) {
          case GGML_TYPE_F16:
              {
-                 ggml_compute_forward_rope_f16(params, dst, false);
+                 ggml_compute_forward_rope_flt<ggml_fp16_t>(params, dst, false);
             } break;
          case GGML_TYPE_F32:
              {
-                 ggml_compute_forward_rope_f32(params, dst, false);
+                 ggml_compute_forward_rope_flt<float>(params, dst, false);
              } break;
          default:
              {
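
The refactor above collapses the duplicated f32/f16 RoPE bodies into one template, with rotate_pairs encoding the layout difference between modes: NORMAL rotates adjacent pairs (x[i], x[i+1]) (offset 1, scale 1), NEOX/MROPE/IMROPE rotate split halves (x[i], x[i + n_dims/2]), and VISION spans the full row with offset n_dims. A self-contained sketch of the underlying rotation, assuming cos/sin are precomputed as in the cache array:

    // The 2-D rotation applied to every selected pair, as in rotate_pairs:
    //   x0' = x0*cos(theta) - x1*sin(theta)
    //   x1' = x0*sin(theta) + x1*cos(theta)
    // For NEOX-style layouts the pair is (x[i], x[i + n_dims/2]).
    static void rotate_pair(float & x0, float & x1, float cos_theta, float sin_theta) {
        const float t0 = x0*cos_theta - x1*sin_theta;
        const float t1 = x0*sin_theta + x1*cos_theta;
        x0 = t0;
        x1 = t1;
    }
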
@@ -7913,6 +7793,18 @@ void ggml_compute_forward_timestep_embedding(
 
  // ggml_compute_forward_argsort
 
+ template<enum ggml_sort_order order>
+ struct argsort_cmp {
+     const float * data;
+     bool operator()(int32_t a, int32_t b) const {
+         if constexpr (order == GGML_SORT_ORDER_ASC) {
+             return data[a] < data[b];
+         } else {
+             return data[a] > data[b];
+         }
+     }
+ };
+
  static void ggml_compute_forward_argsort_f32(
          const ggml_compute_params * params,
          ggml_tensor * dst) {
@@ -7931,23 +7823,25 @@ static void ggml_compute_forward_argsort_f32(
      ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
 
      for (int64_t i = ith; i < nr; i += nth) {
-         int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
          const float * src_data = (float *)((char *) src0->data + i*nb01);
 
+         int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+
          for (int64_t j = 0; j < ne0; j++) {
              dst_data[j] = j;
          }
 
-         // C doesn't have a functional sort, so we do a bubble sort instead
-         for (int64_t j = 0; j < ne0; j++) {
-             for (int64_t k = j + 1; k < ne0; k++) {
-                 if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-                     (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-                     int32_t tmp = dst_data[j];
-                     dst_data[j] = dst_data[k];
-                     dst_data[k] = tmp;
-                 }
-             }
+         switch (order) {
+             case GGML_SORT_ORDER_ASC:
+                 std::sort(dst_data, dst_data + ne0, argsort_cmp<GGML_SORT_ORDER_ASC>{src_data});
+                 break;
+
+             case GGML_SORT_ORDER_DESC:
+                 std::sort(dst_data, dst_data + ne0, argsort_cmp<GGML_SORT_ORDER_DESC>{src_data});
+                 break;
+
+             default:
+                 GGML_ABORT("invalid sort order");
          }
      }
  }
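
Replacing the O(n^2) bubble sort with std::sort brings argsort down to O(n log n), and since argsort_cmp resolves the sort order with if constexpr, the branch costs nothing per comparison. This is the standard indirect-sort idiom; a minimal standalone sketch:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Indirect sort: order indices by the values they point at,
    // mirroring argsort_cmp + std::sort in the patched ops.cpp.
    std::vector<int32_t> argsort_asc(const std::vector<float> & data) {
        std::vector<int32_t> idx(data.size());
        std::iota(idx.begin(), idx.end(), 0); // idx = 0, 1, 2, ...
        std::sort(idx.begin(), idx.end(),
                  [&](int32_t a, int32_t b) { return data[a] < data[b]; });
        return idx;
    }
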
@@ -8770,7 +8664,7 @@ static void ggml_compute_forward_ssm_scan_f32(
          // n_head
          for (int h = ih0; h < ih1; ++h) {
              // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-             const float dt_soft_plus = ggml_softplus(dt[h]);
+             const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
              const float dA = expf(dt_soft_plus * A[h]);
              const int g = h / (nh / ng); // repeat_interleave
 
@@ -8867,7 +8761,7 @@ static void ggml_compute_forward_ssm_scan_f32(
          // n_head
          for (int h = ih0; h < ih1; ++h) {
              // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-             const float dt_soft_plus = ggml_softplus(dt[h]);
+             const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
              const int g = h / (nh / ng); // repeat_interleave
 
              // dim
@@ -9150,6 +9044,14 @@ void ggml_compute_forward_unary(
              {
                  ggml_compute_forward_xielu(params, dst);
              } break;
+         case GGML_UNARY_OP_EXPM1:
+             {
+                 ggml_compute_forward_expm1(params, dst);
+             } break;
+         case GGML_UNARY_OP_SOFTPLUS:
+             {
+                 ggml_compute_forward_softplus(params, dst);
+             } break;
          default:
              {
                  GGML_ABORT("fatal error");
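
The EXPM1 and SOFTPLUS unary ops dispatched above get their element kernels in unary-ops.cpp/unary-ops.h (changed in this release, not shown here). Mathematically, expm1(x) = e^x - 1 and softplus(x) = log(1 + e^x). A hedged sketch of numerically careful scalar forms — the thresholds and exact formulas in the real kernels may differ:

    #include <math.h>

    // expm1f avoids the catastrophic cancellation of expf(x) - 1.0f near x = 0.
    static inline float op_expm1_ref(float x) { return expm1f(x); }

    // Softplus with the usual overflow guard: for large x, log(1 + e^x) ~= x.
    static inline float op_softplus_ref(float x) {
        return x > 20.0f ? x : log1pf(expf(x));
    }
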
@@ -9746,6 +9648,76 @@ void ggml_compute_forward_gla(
      }
  }
 
+ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
+     const struct ggml_tensor * src0 = dst->src[0]; // A (lower triangular)
+     const struct ggml_tensor * src1 = dst->src[1]; // B (RHS)
+
+     GGML_TENSOR_BINARY_OP_LOCALS;
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(src1->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     GGML_ASSERT(ne00 == ne01); // A must be square
+     GGML_ASSERT(ne0  == ne10); // solution cols == B cols
+     GGML_ASSERT(ne1  == ne11); // solution rows == B rows
+
+     GGML_ASSERT(ne02 == ne12 && ne12 == ne2);
+     GGML_ASSERT(ne03 == ne13 && ne13 == ne3);
+
+     const int ith = params->ith;
+     const int nth = params->nth;
+
+     const int64_t k  = ne10;            // number of RHS columns
+     const int64_t n  = ne11;            // A is n×n
+     const int64_t nr = ne02 * ne03 * k; // we're parallelizing on columns here, so seq x token x column will be the unit
+
+     // chunks per thread
+     const int64_t dr = (nr + nth - 1)/nth;
+
+     // chunk range for this thread
+     const int64_t ir0 = dr*ith;
+     const int64_t ir1 = MIN(ir0 + dr, nr);
+
+     const float * A = (const float *) src0->data; // [n, n, B1, B2]
+     const float * B = (const float *) src1->data; // [n, k, B1, B2]
+     float       * X = (      float *)  dst->data; // [n, k, B1, B2]
+
+     for (int64_t ir = ir0; ir < ir1; ++ir) {
+         const int64_t i03 = ir/(ne02*k);
+         const int64_t i02 = (ir - i03*ne02*k)/k;
+         const int64_t i01 = (ir - i03*ne02*k - i02*k);
+
+         const float * A_batch = A + i02 * nb02 / sizeof(float) + i03 * nb03 / sizeof(float);
+         const float * B_batch = B + i02 * nb12 / sizeof(float) + i03 * nb13 / sizeof(float);
+
+         float * X_batch = X + i02 * nb2 / sizeof(float) + i03 * nb3 / sizeof(float);
+
+         for (int64_t i00 = 0; i00 < n; ++i00) {
+             float sum = 0.0f;
+             for (int64_t t = 0; t < i00; ++t) {
+                 sum += A_batch[i00 * n + t] * X_batch[i01 * n + t];
+             }
+
+             const float diag = A_batch[i00 * n + i00];
+             GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
+
+             X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag;
+         }
+     }
+ }
+
+ void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const ggml_tensor * src1 = dst->src[1];
+
+     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+         ggml_compute_forward_solve_tri_f32(params, dst);
+     } else {
+         GGML_ABORT("fatal error");
+     }
+ }
+
  // ggml_compute_forward_rwkv_wkv7
 
  static void ggml_compute_forward_rwkv_wkv7_f32(
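
For context, solve_tri implements forward substitution for a lower-triangular system A·X = B, one RHS column per work unit: row i of the solution depends only on rows 0..i-1, so x_i = (b_i - sum over t < i of a_it * x_t) / a_ii. A self-contained single-column sketch of the same recurrence:

    // Forward substitution for lower-triangular A (n x n, row-major), one RHS column b.
    // Solves A x = b in O(n^2); the kernel above runs this per (batch, column) unit.
    static void solve_lower_tri(const int n, const float * A, const float * b, float * x) {
        for (int i = 0; i < n; ++i) {
            float sum = 0.0f;
            for (int t = 0; t < i; ++t) {
                sum += A[i*n + t] * x[t]; // contributions of already-solved rows
            }
            x[i] = (b[i] - sum) / A[i*n + i]; // diagonal assumed non-zero
        }
    }
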