@fugood/llama.node 1.3.2 → 1.3.3

This diff reflects the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
Files changed (35)
  1. package/CMakeLists.txt +4 -3
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +5 -5
  4. package/src/llama.cpp/CMakeLists.txt +4 -0
  5. package/src/llama.cpp/common/CMakeLists.txt +6 -37
  6. package/src/llama.cpp/common/common.cpp +1 -5
  7. package/src/llama.cpp/common/download.cpp +47 -29
  8. package/src/llama.cpp/common/log.cpp +6 -0
  9. package/src/llama.cpp/common/log.h +2 -0
  10. package/src/llama.cpp/ggml/include/ggml.h +71 -0
  11. package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
  13. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
  20. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
  24. package/src/llama.cpp/src/CMakeLists.txt +6 -0
  25. package/src/llama.cpp/src/llama-arch.cpp +32 -0
  26. package/src/llama.cpp/src/llama-arch.h +2 -0
  27. package/src/llama.cpp/src/llama-graph.cpp +2 -1
  28. package/src/llama.cpp/src/llama-model.cpp +102 -0
  29. package/src/llama.cpp/src/llama-model.h +2 -0
  30. package/src/llama.cpp/src/llama-sampling.cpp +10 -5
  31. package/src/llama.cpp/src/llama-vocab.cpp +16 -1
  32. package/src/llama.cpp/src/llama-vocab.h +1 -0
  33. package/src/llama.cpp/src/models/afmoe.cpp +187 -0
  34. package/src/llama.cpp/src/models/models.h +4 -0
  35. package/src/llama.cpp/src/unicode.cpp +77 -0
@@ -34,6 +34,7 @@ void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct
  void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_cumsum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -81,6 +82,8 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru
  void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_fill(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_flash_attn_back(
  const struct ggml_compute_params * params,
@@ -96,6 +99,7 @@ void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params,
  void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -1600,29 +1600,52 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
  return false;
  }

- void forward_mul_mat_one_chunk(ggml_compute_params * params, ggml_tensor * op, int64_t src0_start, int64_t src0_end) {
+ void forward_mul_mat_one_chunk(ggml_compute_params * params,
+ ggml_tensor * op,
+ int64_t src0_start,
+ int64_t src0_end,
+ int64_t src1_start,
+ int64_t src1_end) {
  const ggml_tensor * src0 = op->src[0];
  const ggml_tensor * src1 = op->src[1];
  ggml_tensor * dst = op;

  GGML_TENSOR_BINARY_OP_LOCALS

- const void * src1_wdata = params->wdata;
  const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);

+ GGML_ASSERT(ne03 == 1 && ne13 == 1);
+ GGML_ASSERT(ne12 % ne02 == 0);
+ const int64_t r2 = ne12 / ne02;
+
+ const int64_t i12 = src1_start / ne1;
+ const int64_t i11 = src1_start - i12 * ne1;
+
+ // Determine batch index
+ const int64_t i02 = i12 / r2;
+
+ const int64_t i1 = i11;
+ const int64_t i2 = i12;
+
+ const char * src0_ptr = (const char *) src0->data + i02 * nb02;
+ const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
+ char * dst_ptr = ((char *) dst->data + (i1 * nb1 + i2 * nb2));
+
+ const int64_t nrows = src1_end - src1_start;
+ const int64_t ncols = src0_end - src0_start;
+
+ GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
+
  // If there are more than three rows in src1, use gemm; otherwise, use gemv.
- if (ne11 > 3) {
- gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
- (float *) ((char *) dst->data) + src0_start, ne01,
- (const char *) src0->data + src0_start * nb01,
- (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+ if (nrows > 3) {
+ gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
+ src0_ptr + src0_start * nb01, src1_ptr,
+ nrows - (nrows % 4), ncols);
  }
- for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
- (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
- (const char *) src0->data + src0_start * nb01,
- (const char *) src1_wdata + (src1_col_stride * iter), 1,
- src0_end - src0_start);
+ for (int iter = nrows - (nrows % 4); iter < nrows; iter++) {
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
+ ne01, src0_ptr + src0_start * nb01,
+ src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols);
  }
  }

@@ -1647,6 +1670,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
  GGML_ASSERT(nb1 <= nb2);
  GGML_ASSERT(nb2 <= nb3);

+ // TODO: General batched mul mat for 4D tensors
+ // Currently only supports 3D tensors
+ GGML_ASSERT(ne03 == 1);
+ GGML_ASSERT(ne13 == 1);
+ GGML_ASSERT(ne3 == 1);
+
  GGML_ASSERT(src1->type == GGML_TYPE_F32);

  GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
@@ -1654,47 +1683,64 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR

  char * wdata = static_cast<char *>(params->wdata);
  const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
+ const size_t nbw2 = nbw1 * ne11;

- assert(params->wsize >= nbw1 * ne11);
+ assert(params->wsize >= nbw2 * ne12);

  const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

- int64_t i11_processed = 0;
- for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
- ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
- }
+ // INFO: Quantization is done in planes to avoid extra complexity in chunking.
+ // Flattening dimensions not multiple of INTER_SIZE would require extra handling depending on how
+ // the planes are broadcast.
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ char * data_ptr = (char *) src1->data + i12 * nb12;
+ char * wdata_ptr = wdata + i12 * nbw2;

- i11_processed = ne11 - ne11 % 4;
- for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
- from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
+ for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+ ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
+ (void *) (wdata_ptr + i11 * nbw1), 4, ne10);
+ }
+
+ const int64_t i11_processed = ne11 - ne11 % 4;
+ for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+ from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
+ }
  }

  // disable for NUMA
  const bool disable_chunking = ggml_is_numa();

  // 4x chunks per thread
- int64_t nr = ggml_nrows(op->src[0]);
- int nth_scaled = nth * 4;
- int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
- int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+ const int64_t nr0 = ggml_nrows(op->src[0]);
+
+ int nth_scaled = nth * 4;
+ int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
+ int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0;
+
+ // src1 is chunked only by full planes.
+ // When we flatten we need to address dimensions not multiple of the q8 INTER_SIZE
+ // to route them thorugh GEMV.
+ // nchunk1 = ne12 also avoids messing the chunking for models with no 3d tensors
+ // to avoid affecting their performance
+ int64_t nchunk1 = ne12;

  // Ensure minimum chunk size to avoid alignment issues with high thread counts
  // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
  const int64_t min_chunk_size = NB_COLS;
- if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
- nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
+ if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
+ nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
  }

- if (nth == 1 || nchunk < nth || disable_chunking) {
- nchunk = nth;
+ if (nth == 1 || nchunk0 < nth || disable_chunking) {
+ nchunk0 = nth;
  }

+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+
  // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
  // This prevents creating too many tiny chunks that could overlap after alignment
- const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
- if (nchunk > max_nchunk) {
- nchunk = max_nchunk;
- }
+ const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
+ nchunk0 = MIN(nchunk0, max_nchunk);

  if (ith == 0) {
  // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
@@ -1706,23 +1752,30 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
  // The first chunk comes from our thread_id, the rest will get auto-assigned.
  int current_chunk = ith;

- while (current_chunk < nchunk) {
- int64_t src0_start = (current_chunk * ne01) / nchunk;
- int64_t src0_end = ((current_chunk + 1) * ne01) / nchunk;
+ while (current_chunk < nchunk0 * nchunk1) {
+ const int64_t ith0 = current_chunk % nchunk0;
+ const int64_t ith1 = current_chunk / nchunk0;
+
+ int64_t src0_start = dr0 * ith0;
+ int64_t src0_end = MIN(src0_start + dr0, nr0);
+
+ // full-plane range for src1
+ int64_t src1_start = ith1 * ne11;
+ int64_t src1_end = (ith1 + 1) * ne11;

  // Align boundaries to NB_COLS - round up to ensure all data is included
  // The chunk size limiting above ensures chunks are large enough to prevent overlaps
  src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
- src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
- if (src0_end > ne01) {
- src0_end = ne01;
- }
+ src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+ src0_end = MIN(src0_end, ne01);

+ // Make sure current plane is the last one before exiting
  if (src0_start >= src0_end) {
- break;
+ current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+ continue;
  }

- forward_mul_mat_one_chunk(params, dst, src0_start, src0_end);
+ forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);

  current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
  }
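For reference on the chunking rework above: work is now split along two axes, row chunks of src0 and full planes of src1, and a single atomic counter walks the flattened grid. Below is a minimal standalone C++ sketch of that index decomposition only (not part of the diff; sizes are made up and the NB_COLS alignment step is omitted):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical sizes, for illustration only.
        const int64_t nr0     = 100;                        // rows of src0
        const int64_t nchunk0 = 8;                          // row chunks of src0
        const int64_t nchunk1 = 3;                          // src1 planes (ne12)
        const int64_t ne11    = 4;                          // src1 rows per plane
        const int64_t dr0     = (nr0 + nchunk0 - 1) / nchunk0;

        for (int64_t chunk = 0; chunk < nchunk0 * nchunk1; ++chunk) {
            const int64_t ith0 = chunk % nchunk0;           // which row chunk of src0
            const int64_t ith1 = chunk / nchunk0;           // which plane of src1

            const int64_t src0_start = dr0 * ith0;
            const int64_t src0_end   = std::min(src0_start + dr0, nr0);
            const int64_t src1_start = ith1 * ne11;         // full-plane range for src1
            const int64_t src1_end   = (ith1 + 1) * ne11;

            std::printf("chunk %2lld -> src0 [%lld,%lld) src1 [%lld,%lld)\n",
                        (long long) chunk, (long long) src0_start, (long long) src0_end,
                        (long long) src1_start, (long long) src1_end);
        }
        return 0;
    }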
@@ -73,6 +73,14 @@ static inline float op_log(float x) {
  return logf(x);
  }

+ static inline float op_expm1(float x) {
+ return expf(x) - 1.0f;
+ }
+
+ static inline float op_softplus(float x) {
+ return (x > 20.0f) ? x : logf(1.0f + expf(x));
+ }
+
  static inline float op_floor(float x) {
  return floorf(x);
  }
@@ -290,6 +298,14 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
  unary_op<op_log>(params, dst);
  }

+ void ggml_compute_forward_expm1(const ggml_compute_params * params, ggml_tensor * dst) {
+ unary_op<op_expm1>(params, dst);
+ }
+
+ void ggml_compute_forward_softplus(const ggml_compute_params * params, ggml_tensor * dst) {
+ unary_op<op_softplus>(params, dst);
+ }
+
  void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
  unary_op<op_floor>(params, dst);
  }
@@ -22,6 +22,8 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
  void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_expm1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_softplus(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
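A note on the two new unary ops: `op_softplus` switches to the identity for x > 20 because log(1 + e^x) is indistinguishable from x there in single precision, and the cutoff keeps `expf` from overflowing; `op_expm1` computes `expf(x) - 1.0f` directly, while the standard library's `expm1f`/`std::expm1` is the numerically stable variant for very small |x|. A small self-contained C++ sketch (not from the diff) illustrating both points:

    #include <cmath>
    #include <cstdio>

    // Same formulas as op_softplus / op_expm1 added above.
    static float softplus_cutoff(float x) {
        return (x > 20.0f) ? x : std::log(1.0f + std::exp(x));
    }

    static float expm1_naive(float x) {
        return std::exp(x) - 1.0f; // loses precision when |x| is tiny
    }

    int main() {
        const float xs[] = { -5.0f, 1e-7f, 1.0f, 25.0f };
        for (float x : xs) {
            std::printf("x=%-8g softplus=%-10g expm1_naive=%-14g std::expm1=%g\n",
                        x, softplus_cutoff(x), expm1_naive(x), std::expm1(x));
        }
        return 0;
    }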
@@ -360,6 +360,13 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
  for (; i + 3 < n; i += 4) {
  vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
  }
+ #elif defined(__riscv_v_intrinsic)
+ for (int vl; i < n; i += vl) {
+ vl = __riscv_vsetvl_e32m2(n - i);
+ vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+ vfloat32m2_t vy = ggml_v_silu_m2(vx, vl);
+ __riscv_vse32_v_f32m2(&y[i], vy, vl);
+ }
  #endif
  for (; i < n; ++i) {
  y[i] = ggml_silu_f32(x[i]);
@@ -460,6 +467,16 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
  val = vec_mul(val, val);
  sum += (ggml_float)vec_hsum_f32x4(val);
  }
+ #elif defined(__riscv_v_intrinsic)
+ vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+ for (int vl; i < n; i += vl) {
+ vl = __riscv_vsetvl_e32m2(n - i);
+ vfloat32m2_t val = __riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], vl), mean, vl);
+ __riscv_vse32_v_f32m2(&y[i], val, vl);
+ val = __riscv_vfmul_vv_f32m2(val, val, vl);
+ vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, vl);
+ }
+ sum = (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
  #endif
  for (; i < n; ++i) {
  float val = x[i] - mean;
@@ -1416,6 +1416,16 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
  #endif
  }

+ inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
+ for (int i = 0; i < n; ++i) {
+ if (i == 0) {
+ y[i] = x[i];
+ } else {
+ y[i] = y[i - 1] + x[i];
+ }
+ }
+ }
+
  inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
  ggml_float sum = 0.0;
  for (int i = 0; i < n; ++i) {
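The new `ggml_vec_cumsum_f32` is a plain running prefix sum over one row. A tiny standalone check of that semantics (the helper below mirrors the loop above; everything else is illustrative):

    #include <cstdio>

    // Mirrors the loop body of ggml_vec_cumsum_f32 above.
    static void cumsum_f32(int n, float * y, const float * x) {
        for (int i = 0; i < n; ++i) {
            y[i] = (i == 0) ? x[i] : y[i - 1] + x[i];
        }
    }

    int main() {
        const float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        float y[4];
        cumsum_f32(4, y, x);
        std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // prints: 1 3 6 10
        return 0;
    }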
@@ -35,6 +35,7 @@ add_library(llama
  unicode-data.cpp
  unicode.cpp
  unicode.h
+ models/afmoe.cpp
  models/apertus.cpp
  models/arcee.cpp
  models/arctic.cpp
@@ -132,6 +133,11 @@ add_library(llama
  models/graph-context-mamba.cpp
  )

+ set_target_properties(llama PROPERTIES
+ VERSION ${LLAMA_INSTALL_VERSION}
+ SOVERSION 0
+ )
+
  target_include_directories(llama PRIVATE .)
  target_include_directories(llama PUBLIC ../include)
  target_compile_features (llama PRIVATE cxx_std_17) # don't bump
@@ -90,6 +90,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
  { LLM_ARCH_DOTS1, "dots1" },
  { LLM_ARCH_ARCEE, "arcee" },
+ { LLM_ARCH_AFMOE, "afmoe" },
  { LLM_ARCH_ERNIE4_5, "ernie4_5" },
  { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
  { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
@@ -333,6 +334,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_AFMOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ },
+ },
  {
  LLM_ARCH_LLAMA4,
  {
@@ -2444,6 +2475,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -94,6 +94,7 @@ enum llm_arch {
  LLM_ARCH_BAILINGMOE2,
  LLM_ARCH_DOTS1,
  LLM_ARCH_ARCEE,
+ LLM_ARCH_AFMOE,
  LLM_ARCH_ERNIE4_5,
  LLM_ARCH_ERNIE4_5_MOE,
  LLM_ARCH_HUNYUAN_MOE,
@@ -312,6 +313,7 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_POST_NORM,
  LLM_TENSOR_ATTN_ROT_EMBD,
  LLM_TENSOR_ATTN_SINKS,
+ LLM_TENSOR_ATTN_GATE,
  LLM_TENSOR_FFN_GATE_INP,
  LLM_TENSOR_FFN_GATE_INP_SHEXP,
  LLM_TENSOR_FFN_NORM,
@@ -1592,9 +1592,10 @@ ggml_tensor * llm_graph_context::build_attn(
  int il) const {
  // these nodes are added to the graph together so that they are not reordered
  // by doing so, the number of splits in the graph is reduced
+ // expand k later to enable rope fusion which directly writes into k-v cache
  ggml_build_forward_expand(gf, q_cur);
- ggml_build_forward_expand(gf, k_cur);
  ggml_build_forward_expand(gf, v_cur);
+ ggml_build_forward_expand(gf, k_cur);

  const auto * mctx_cur = inp->mctx;

@@ -84,6 +84,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_15B: return "15B";
  case LLM_TYPE_16B: return "16B";
  case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_26B: return "26B";
  case LLM_TYPE_27B: return "27B";
  case LLM_TYPE_30B: return "30B";
  case LLM_TYPE_32B: return "32B";
@@ -695,6 +696,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_AFMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ // Set up interleaved sliding window attention (ISWA)
+ // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
+ if (hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ // Default to sigmoid if not set
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ switch (hparams.n_layer) {
+ case 56: type = LLM_TYPE_6B; break;
+ case 32: type = LLM_TYPE_26B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_DECI:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
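On the ISWA setup above: with set_swa_pattern(4), three out of every four layers use a sliding window and one uses full attention, matching the "3 sliding - 1 full" comment. A small illustrative C++ sketch of that layout; the assumption (not stated in the diff) is that the full-attention layer is the last one in each group of four:

    #include <cstdio>

    int main() {
        const int n_layer   = 8;
        const int n_pattern = 4; // "3 sliding - 1 full"
        for (int il = 0; il < n_layer; ++il) {
            // Assumption: the full-attention layer closes each group of n_pattern layers.
            const bool is_swa = (il % n_pattern) < (n_pattern - 1);
            std::printf("layer %d: %s\n", il, is_swa ? "sliding window" : "full attention");
        }
        return 0;
    }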
@@ -5749,6 +5781,71 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  }
  } break;
+ case LLM_ARCH_AFMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // dual attention normalization
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ // attention projections
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ // Q/K normalization
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ // attention gating
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+
+ // dual ffn normalization
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
+ // MoE layers
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+
+ // grouped expert weights
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // shared expert
+ if (n_expert_shared > 0) {
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ }
+ } else {
+ // Dense layers
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
  case LLM_ARCH_ERNIE4_5:
  case LLM_ARCH_ERNIE4_5_MOE:
  {
@@ -7243,6 +7340,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_arcee>(*this, params);
  } break;
+ case LLM_ARCH_AFMOE:
+ {
+ llm = std::make_unique<llm_build_afmoe>(*this, params);
+ } break;
  case LLM_ARCH_ERNIE4_5:
  {
  llm = std::make_unique<llm_build_ernie4_5>(*this, params);
@@ -7528,6 +7629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_MINIMAX_M2:
  case LLM_ARCH_COGVLM:
  case LLM_ARCH_PANGU_EMBED:
+ case LLM_ARCH_AFMOE:
  return LLAMA_ROPE_TYPE_NEOX;

  case LLM_ARCH_QWEN2VL:
@@ -76,6 +76,7 @@ enum llm_type {
  LLM_TYPE_15B,
  LLM_TYPE_16B,
  LLM_TYPE_20B,
+ LLM_TYPE_26B,
  LLM_TYPE_27B,
  LLM_TYPE_30B,
  LLM_TYPE_32B,
@@ -234,6 +235,7 @@ struct llama_layer {
  struct ggml_tensor * wk_enc = nullptr;
  struct ggml_tensor * wv_enc = nullptr;
  struct ggml_tensor * wo_enc = nullptr;
+ struct ggml_tensor * wqkv_gate = nullptr;

  // attention bias
  struct ggml_tensor * bq = nullptr;
@@ -4,6 +4,7 @@
  #include "llama-vocab.h"
  #include "llama-grammar.h"

+ #include <array>
  #include <algorithm>
  #include <cassert>
  #include <cfloat>
@@ -1625,10 +1626,12 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
  auto * ctx = new llama_sampler_grammar;

  if (grammar_str != nullptr && grammar_str[0] != '\0') {
+ std::string trigger_pattern;
+ llama_grammar * grammar = nullptr;
  // TODO: remove trigger_words support.
  if (trigger_words != nullptr && num_trigger_words > 0) {
  GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
- std::string trigger_pattern("[\\s\\S]*?(");
+ trigger_pattern = "[\\s\\S]*?(";
  for (size_t i = 0; i < num_trigger_words; ++i) {
  static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
  if (i > 0) {
@@ -1637,15 +1640,17 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
  trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
  }
  trigger_pattern += ")[\\s\\S]*";
- const auto * trigger_pattern_c = trigger_pattern.c_str();
- trigger_patterns = &trigger_pattern_c;
- num_trigger_patterns = 1;
+
+ std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
+ grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
+ } else {
+ grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
  }
  *ctx = {
  /* .vocab = */ vocab,
  /* .grammar_str = */ grammar_str,
  /* .grammar_root = */ grammar_root,
- /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
+ /* .grammar = */ grammar,
  };
  if (!ctx->grammar) {
  delete ctx;
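On the sampling change above: the restructuring keeps the built trigger_pattern string alive in the enclosing scope and calls llama_grammar_init_impl while the pointer taken from c_str() is still valid, rather than stashing that pointer and using it later when the string could already be gone. A generic standalone C++ sketch of the safe pattern (illustrative only, not library code):

    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <string>

    // Stand-in for a C-style API that consumes an array of C strings.
    static void consume(const char * const * patterns, std::size_t n) {
        for (std::size_t i = 0; i < n; ++i) {
            std::printf("pattern: %s\n", patterns[i]);
        }
    }

    int main() {
        // Keep the std::string alive in the scope that makes the call, so the
        // pointer obtained from c_str() cannot outlive its owner.
        std::string pattern = "[\\s\\S]*?(foo|bar)[\\s\\S]*";
        std::array<const char *, 1> patterns = { pattern.c_str() };
        consume(patterns.data(), patterns.size());
        return 0;
    }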