@fugood/llama.node 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/CMakeLists.txt +1 -1
  2. package/lib/binding.js +3 -0
  3. package/lib/binding.ts +10 -0
  4. package/lib/index.js +9 -0
  5. package/lib/index.ts +10 -0
  6. package/package.json +15 -15
  7. package/scripts/llama.cpp.patch +25 -11
  8. package/src/LlamaContext.cpp +24 -0
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/CMakeLists.txt +21 -6
  11. package/src/llama.cpp/common/CMakeLists.txt +6 -0
  12. package/src/llama.cpp/common/arg.cpp +83 -22
  13. package/src/llama.cpp/common/chat-parser.cpp +40 -0
  14. package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
  15. package/src/llama.cpp/common/chat-peg-parser.h +105 -0
  16. package/src/llama.cpp/common/chat.cpp +40 -29
  17. package/src/llama.cpp/common/chat.h +10 -1
  18. package/src/llama.cpp/common/common.cpp +70 -7
  19. package/src/llama.cpp/common/common.h +23 -5
  20. package/src/llama.cpp/common/download.cpp +18 -8
  21. package/src/llama.cpp/common/download.h +3 -1
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  23. package/src/llama.cpp/common/log.cpp +18 -27
  24. package/src/llama.cpp/common/log.h +19 -12
  25. package/src/llama.cpp/common/peg-parser.cpp +1712 -0
  26. package/src/llama.cpp/common/peg-parser.h +459 -0
  27. package/src/llama.cpp/common/unicode.cpp +64 -0
  28. package/src/llama.cpp/common/unicode.h +22 -0
  29. package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
  30. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
  31. package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +29 -2
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
  37. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
  39. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-arch.cpp +30 -1
  43. package/src/llama.cpp/src/llama-arch.h +3 -0
  44. package/src/llama.cpp/src/llama-graph.cpp +3 -6
  45. package/src/llama.cpp/src/llama-hparams.h +2 -2
  46. package/src/llama.cpp/src/llama-impl.h +1 -1
  47. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  48. package/src/llama.cpp/src/llama-model.cpp +54 -6
  49. package/src/llama.cpp/src/llama-quant.cpp +0 -29
  50. package/src/llama.cpp/src/llama-vocab.cpp +1 -2
  51. package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
  52. package/src/llama.cpp/src/models/mistral3.cpp +160 -0
  53. package/src/llama.cpp/src/models/models.h +4 -0
  54. package/src/llama.cpp/src/unicode.cpp +2 -2

package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp

@@ -117,8 +117,7 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
 #endif
 
 #if defined(__MMA__)
-typedef vector unsigned char vec_t;
-typedef __vector_quad acc_t;
+#include "sgemm-ppc.h"
 #endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED FUSED MULTIPLY ADD
@@ -1573,95 +1572,35 @@ class tinyBLAS_BF16_PPC {
     const int nth;
 };
 
-template <typename TA>
-class tinyBLAS_Q0_PPC {
-  public:
-    tinyBLAS_Q0_PPC(int64_t k,
-                    const TA *A, int64_t lda,
-                    const block_q8_0 *B, int64_t ldb,
-                    float *C, int64_t ldc,
-                    int ith, int nth)
+template <typename TA>
+tinyBLAS_Q0_PPC<TA>::tinyBLAS_Q0_PPC(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
+                    int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    kc = 64;
     }
 
-    void matmul(int64_t m, int64_t n) {
-        mnpack(0, m, 0, n);
-    }
-
-  private:
-
-    inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
-        for (int I = 0; I < RM; I++) {
-            for (int J = 0; J < RN; J++) {
-                *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
-            }
-        }
-    }
-
-    template<int size>
-    inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
-        vector signed int vec_C[4];
-        vector float CA[4] = {0};
-        vector float res[4] = {0};
-        __builtin_mma_disassemble_acc(vec_C, ACC);
-        for (int i = 0; i < 4; i++) {
-            CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
-            res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
-            fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
-        }
-    }
-    /* This function processes quantized data from block_q4_0 elements.
-     * First the we try to extract the two int4 values stored in single int8_t into two signed int8.
-     * And then we subtract each of the resultant element with 8, to convert signed int8 to unsigned int8.
-     * Also compute the rowsum which is required to compensate the above conversion. */
-    inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
-        const vector signed char lowMask = vec_splats((signed char)0xF);
-        const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-        const vector signed char v8 = vec_splats((signed char)0x8);
-        vector signed int vsum = {0};
-        vector signed int vsum2 = {0};
-        c[0] = vec_and(c[1], lowMask);
-        c[1] = vec_sr(c[1], v4);
-        c[0] = vec_sub(c[0], v8);
-        c[1] = vec_sub(c[1], v8);
-        vsum = vec_sum4s(c[0], vsum);
-        vsum2 = vec_sum4s(c[1], vsum2);
-        vsum = vec_add(vsum, vsum2);
-        *(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
-    }
-
-    template <typename V1, typename V2>
-    inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
-        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
-        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
-        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
-        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
-        V2 t1, t2, t3, t4, t5, t6, t7, t8;
-        vector unsigned char xor_vector;
-        uint8_t flip_vec = 0x80;
-        xor_vector = vec_splats(flip_vec);
-        t1 = vec_perm(s1, s2, swiz1);
-        t2 = vec_perm(s1, s2, swiz2);
-        t3 = vec_perm(s3, s4, swiz1);
-        t4 = vec_perm(s3, s4, swiz2);
-        t5 = vec_perm(t1, t3, swiz3);
-        t6 = vec_perm(t1, t3, swiz4);
-        t7 = vec_perm(t2, t4, swiz3);
-        t8 = vec_perm(t2, t4, swiz4);
-        if (flip == true) {
-            t5 = vec_xor(t5, xor_vector);
-            t6 = vec_xor(t6, xor_vector);
-            t7 = vec_xor(t7, xor_vector);
-            t8 = vec_xor(t8, xor_vector);
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::matmul(int64_t m, int64_t n) {
+    int mc = 64; int nc = 64;
+    if (n % 8 == 0 && n < nc) {
+        nc = n;
+        mc = 32 ;
+        kc = 32;
+    }
+    const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0);
+    if (is_aligned) {
+        this->matmul_tiled_q0(m, n, mc, nc, kc);
+    } else {
+        mnpack(0, m, 0, n);
     }
-        vec_xst(t5, 0, vecOffset);
-        vec_xst(t6, 0, vecOffset+16);
-        vec_xst(t7, 0, vecOffset+32);
-        vec_xst(t8, 0, vecOffset+48);
 }
 
-    template<int size>
-    void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
+template<typename TA>
+template<int size>
+void tinyBLAS_Q0_PPC<TA>::packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
     int64_t i, j;
     TA *aoffset = NULL;
    int8_t *vecOffset = NULL;
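
For readers following the tinyBLAS changes: process_q4_elements (moved out of the class body by this refactor) splits each q4_0 byte into two 4-bit values, recenters them around zero, and accumulates a row sum that later compensates the dot product for that recentering. A minimal scalar sketch of the same transform, with a hypothetical helper name rather than code from the diff:

    #include <cstdint>

    // Each q4_0 byte packs two 4-bit values; subtracting 8 maps the unsigned
    // nibble range [0, 15] to the signed range [-8, 7]. The accumulated row
    // sum is what compute() later folds back in through its comparray argument.
    static void unpack_q4_pair(uint8_t byte, int8_t &lo, int8_t &hi, int &rowsum) {
        lo = (int8_t)(byte & 0x0F) - 8; // low nibble
        hi = (int8_t)(byte >> 4)   - 8; // high nibble
        rowsum += lo + hi;              // compensation term for the -8 offset
    }

This is the same idea as the vec_and/vec_sr/vec_sub/vec_sum4s sequence above, just one byte at a time.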
@@ -1781,8 +1720,10 @@ class tinyBLAS_Q0_PPC {
            }
        }
    }
+
+template<typename TA>
 template<typename VA, typename VB>
-    void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+void tinyBLAS_Q0_PPC<TA>::packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
    int64_t i, j;
    block_q8_0 *aoffset = NULL;
    VA *vecOffset = NULL;
@@ -1822,7 +1763,6 @@ class tinyBLAS_Q0_PPC {
            j--;
        } while(j > 0);
    }
-
    if (rows & 4) {
        aoffsets[0] = aoffset;
        for (int it = 1; it < 4; it++ )
@@ -1878,7 +1818,8 @@ class tinyBLAS_Q0_PPC {
        }
    }
 
-    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
    int m_rem = MIN(m - m0, 16);
    int n_rem = MIN(n - n0, 16);
 
@@ -1915,7 +1856,8 @@ class tinyBLAS_Q0_PPC {
    }
 
 
-    void KERNEL_4x8(int64_t ii, int64_t jj) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_4x8(int64_t ii, int64_t jj) {
    vec_t vec_A[8], vec_B[16] = {0};
    acc_t acc_0, acc_1;
    std::array<int, 4> comparray {};
@@ -1953,14 +1895,15 @@ class tinyBLAS_Q0_PPC {
            aoffset += lda;
        }
    }
-        compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
-        compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 0, 4, comparray, vs, fin_res);
    }
    save_res(ii, jj, 0, fin_res);
    save_res(ii, jj+4, 4, fin_res);
 }
 
-    void KERNEL_8x4(int64_t ii, int64_t jj) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x4(int64_t ii, int64_t jj) {
    vec_t vec_A[16], vec_B[8] = {0};
    acc_t acc_0, acc_1;
    std::array<int, 8> comparray {};
@@ -1997,16 +1940,18 @@ class tinyBLAS_Q0_PPC {
            aoffset += lda;
        }
    }
-        compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
-        compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 4, 4, comparray, vs, fin_res);
    }
    save_res(ii, jj, 0, fin_res);
    save_res(ii+4, jj, 4, fin_res);
 }
 
-    void KERNEL_8x8(int64_t ii, int64_t jj) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x8(int64_t ii, int64_t jj) {
    vec_t vec_A[16], vec_B[16] = {0};
    acc_t acc_0, acc_1, acc_2, acc_3;
+    acc_t acc_4, acc_5, acc_6, acc_7;
    std::array<int, 8> comparray {};
    vector float fin_res[16] = {0};
    vector float vs[16] = {0};
@@ -2046,10 +1991,10 @@ class tinyBLAS_Q0_PPC {
            aoffset += lda;
        }
    }
-        compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
-        compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
-        compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
-        compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 4, 4, comparray, vs, fin_res);
+        compute(&acc_2, 0, 8, comparray, vs, fin_res);
+        compute(&acc_3, 4, 12, comparray, vs, fin_res);
    }
    save_res(ii, jj, 0, fin_res);
    save_res(ii+4, jj, 4, fin_res);
@@ -2057,7 +2002,8 @@ class tinyBLAS_Q0_PPC {
    save_res(ii+4, jj+4, 12, fin_res);
 }
 
-    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
    int64_t ytiles = (m - m0) / RM;
    int64_t xtiles = (n - n0) / RN;
    int64_t tiles = xtiles * ytiles;
@@ -2125,21 +2071,9 @@ class tinyBLAS_Q0_PPC {
        }
    }
 
-    template<int RM, int RN>
-    inline void kernel(int64_t ii, int64_t jj) {
-        if constexpr(RM == 4 && RN == 8) {
-            KERNEL_4x8(ii,jj);
-        } else if constexpr(RM == 8 && RN == 4) {
-            KERNEL_8x4(ii,jj);
-        } else if constexpr(RM == 8 && RN == 8) {
-            KERNEL_8x8(ii,jj);
-        } else {
-            assert(false && "RN/RM values not supported");
-        }
-    }
-
+template<typename TA>
 template <int RM, int RN>
-    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+NOINLINE void tinyBLAS_Q0_PPC<TA>::gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
    int64_t ytiles = (m - m0) / RM;
    int64_t xtiles = (n - n0) / RN;
    int64_t tiles = xtiles * ytiles;
@@ -2151,20 +2085,12 @@ class tinyBLAS_Q0_PPC {
    for (int64_t job = start; job < end; ++job) {
        int64_t ii = m0 + job / xtiles * RM;
        int64_t jj = n0 + job % xtiles * RN;
-        kernel<RM, RN>(ii, jj);
+        this->kernel<RM, RN>(ii, jj);
    }
 }
 
-    const TA *const A;
-    const block_q8_0 *const B;
-    float *C;
-    const int64_t k;
-    const int64_t lda;
-    const int64_t ldb;
-    const int64_t ldc;
-    const int ith;
-    const int nth;
-};
+template class tinyBLAS_Q0_PPC<block_q4_0>;
+template class tinyBLAS_Q0_PPC<block_q8_0>;
 
 class tinyBLAS_PPC {
   public:
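
Taken together, the sgemm.cpp hunks are a mechanical C++ refactor rather than an algorithm change: tinyBLAS_Q0_PPC's members move from in-class definitions to out-of-line definitions (the class itself now comes in via sgemm-ppc.h), and the two explicit instantiations at the end keep the block_q4_0 and block_q8_0 specializations emitted in this translation unit. A minimal sketch of the pattern, with illustrative names only:

    #include <cstdint>

    // In the header: declaration only, member bodies live elsewhere.
    template <typename T>
    class Gemm {
    public:
        void matmul(int64_t m, int64_t n);
    };

    // In the .cpp: out-of-line definition of a class-template member.
    template <typename T>
    void Gemm<T>::matmul(int64_t m, int64_t n) {
        (void)m; (void)n; // ... kernel dispatch would go here ...
    }

    // Explicit instantiation: forces code generation for these types here,
    // so callers elsewhere can link without seeing the member bodies.
    template class Gemm<float>;
    template class Gemm<double>;

Without the `template class` lines, code in other translation units would hit link errors, since the member definitions are no longer visible at the point of use.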

package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h

@@ -6,6 +6,12 @@
 #include <vecintrin.h>
 #endif
 
+#ifdef _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE __attribute__((__noinline__))
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif

package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -6383,7 +6383,7 @@ static void ggml_compute_forward_im2col_3d_f16(
                const int64_t iih = ioh*s1 + ikh*d1 - p1;
                const int64_t iid = iod*s2 + ikd*d2 - p2;
 
-                if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
                    dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
                } else {
                    const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
@@ -6554,8 +6554,13 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
    ggml_compute_forward_mul_mat(params, &dst);
 }
 
+static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
+    return (coord + size) % size; // adding size avoids negative number weirdness
+}
+
 // ggml_compute_forward_conv_2d
 
+
 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
                                              const ggml_tensor * kernel, // [KW, KH, IC, OC]
                                              const ggml_tensor * src,    // [W, H, C, N]
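
A note on the new ggml_wrap_around helper: C++'s % operator keeps the sign of the dividend, so -1 % 8 is -1, not 7. Adding size before taking the remainder keeps the result in [0, size) for any coordinate down to -size, which is what the circular padding further below relies on. A quick self-contained check:

    #include <cassert>
    #include <cstdint>

    // Same definition as the helper added above.
    static inline int64_t wrap_around(int64_t coord, int64_t size) {
        return (coord + size) % size;
    }

    int main() {
        assert(-1 % 8 == -1);            // plain % keeps the dividend's sign
        assert(wrap_around(-1, 8) == 7); // one step left of 0 wraps to the end
        assert(wrap_around( 8, 8) == 0); // one step past the end wraps to 0
        return 0;
    }

The helper is only correct for coordinates no smaller than -size, which appears sufficient here since an index is offset by at most one padding width.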
@@ -7420,6 +7425,65 @@ static void ggml_compute_forward_upscale_f32(
                }
            }
        }
+    } else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) {
+        // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
+        // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+        auto triangle_filter = [](float x) -> float {
+            return std::max(1.0f - fabsf(x), 0.0f);
+        };
+
+        // support and invscale, minimum 1 pixel for bilinear
+        const float support1 = std::max(1.0f, 1.0f / sf1);
+        const float invscale1 = 1.0f / support1;
+        const float support0 = std::max(1.0f, 1.0f / sf0);
+        const float invscale0 = 1.0f / support0;
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float) i1 + pixel_offset) / sf1;
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float) i0 + pixel_offset) / sf0;
+
+                        // the range of source pixels that contribute
+                        const int64_t x_min = std::max<int64_t>(x - support0 + pixel_offset, 0);
+                        const int64_t x_max = std::min<int64_t>(x + support0 + pixel_offset, ne00);
+                        const int64_t y_min = std::max<int64_t>(y - support1 + pixel_offset, 0);
+                        const int64_t y_max = std::min<int64_t>(y + support1 + pixel_offset, ne01);
+
+                        // bilinear filter with antialiasing
+                        float val = 0.0f;
+                        float total_weight = 0.0f;
+
+                        for (int64_t sy = y_min; sy < y_max; sy++) {
+                            const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
+
+                            for (int64_t sx = x_min; sx < x_max; sx++) {
+                                const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
+                                const float weight = weight_x * weight_y;
+
+                                if (weight <= 0.0f) {
+                                    continue;
+                                }
+
+                                const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03);
+                                val += pixel * weight;
+                                total_weight += weight;
+                            }
+                        }
+
+                        if (total_weight > 0.0f) {
+                            val /= total_weight;
+                        }
+
+                        float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                        *dst_ptr = val;
+                    }
+                }
+            }
+        }
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        for (int64_t i3 = 0; i3 < ne3; i3++) {
            const int64_t i03 = i3 / sf3;
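
The antialias branch is easier to see in one dimension: when downscaling (scale factor below 1), each output sample averages every source pixel within support = 1/scale of its center, weighted by a triangle (tent) function and normalized by the total weight; when upscaling, support clamps to 1 and the filter degenerates to ordinary bilinear interpolation. A standalone 1D sketch of the same scheme, not code from the package:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Tent-filter resample mirroring the antialiased bilinear path above.
    static std::vector<float> resample_1d(const std::vector<float> & src, float sf) {
        const float pixel_offset = 0.5f;                  // sample at pixel centers
        const float support  = std::max(1.0f, 1.0f / sf); // filter radius, in source pixels
        const float invscale = 1.0f / support;
        const int64_t n_dst  = (int64_t)(src.size() * sf);
        std::vector<float> dst(n_dst);
        for (int64_t i = 0; i < n_dst; i++) {
            const float x = ((float) i + pixel_offset) / sf; // center in source coords
            const int64_t x_min = std::max<int64_t>((int64_t)(x - support + pixel_offset), 0);
            const int64_t x_max = std::min<int64_t>((int64_t)(x + support + pixel_offset), (int64_t)src.size());
            float val = 0.0f, total_weight = 0.0f;
            for (int64_t sx = x_min; sx < x_max; sx++) {
                const float w = std::max(1.0f - std::fabs((sx - x + pixel_offset) * invscale), 0.0f);
                val          += src[sx] * w;
                total_weight += w;
            }
            dst[i] = total_weight > 0.0f ? val / total_weight : 0.0f;
        }
        return dst;
    }

At sf = 0.5, for instance, each output pixel blends a few neighboring source pixels with weights peaking at its center, which is what suppresses the aliasing a plain 2-tap bilinear sample would produce.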
@@ -7532,6 +7596,7 @@ void ggml_compute_forward_upscale(
 
 // ggml_compute_forward_pad
 
+template<bool circular_t>
 static void ggml_compute_forward_pad_f32(
    const ggml_compute_params * params,
          ggml_tensor * dst) {
@@ -7556,23 +7621,40 @@ static void ggml_compute_forward_pad_f32(
    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
 
-
    // TODO: optimize
 
    for (int64_t i2 = 0; i2 < ne2; ++i2) {
        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
-                        && (i1 >= lp1 && i1 < ne1 - rp1) \
-                        && (i2 >= lp2 && i2 < ne2 - rp2) \
-                        && (i3 >= lp3 && i3 < ne3 - rp3)) {
-                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                    // circular means wrap around on a torus, so x and y loop around
+                    if constexpr (circular_t) {
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        const int64_t src_i0 = ggml_wrap_around(i0 - lp0, ne00);
+                        const int64_t src_i1 = ggml_wrap_around(i1 - lp1, ne01);
+                        const int64_t src_i2 = ggml_wrap_around(i2 - lp2, ne02);
+                        const int64_t src_i3 = ggml_wrap_around(i3 - lp3, ne03);
+
+                        const int64_t src_idx =
+                            src_i3*nb03 +
+                            src_i2*nb02 +
+                            src_i1*nb01 +
+                            src_i0*nb00;
+
                        const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                        dst_ptr[dst_idx] = *src_ptr;
                    } else {
-                        dst_ptr[dst_idx] = 0;
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                            && (i1 >= lp1 && i1 < ne1 - rp1) \
+                            && (i2 >= lp2 && i2 < ne2 - rp2) \
+                            && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                            const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                            const float * src_ptr = (const float *)((char *) src0->data + src_idx);
+                            dst_ptr[dst_idx] = *src_ptr;
+                        } else {
+                            dst_ptr[dst_idx] = 0;
+                        }
                    }
                }
            }
@@ -7580,16 +7662,20 @@ static void ggml_compute_forward_pad_f32(
    }
 }
 
+
 void ggml_compute_forward_pad(
    const ggml_compute_params * params,
          ggml_tensor * dst) {
-
    const ggml_tensor * src0 = dst->src[0];
-
+    const bool circular = (bool) ggml_get_op_params_i32(dst, 8);
    switch (src0->type) {
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_pad_f32(params, dst);
+                if (circular) {
+                    ggml_compute_forward_pad_f32<true>(params, dst);
+                } else {
+                    ggml_compute_forward_pad_f32<false>(params, dst);
+                }
            } break;
        default:
            {
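
To make the circular branch concrete, here is what it computes for a small 1D case (values illustrative):

    // Circular pad of src = [a, b, c] (ne00 = 3) with lp0 = 2, rp0 = 2 (ne0 = 7):
    //
    //   dst index i0:                 0  1  2  3  4  5  6
    //   src_i0 = wrap_around(i0 - 2): 1  2  0  1  2  0  1
    //   dst:                          b  c  a  b  c  a  b
    //
    // The padding on each side continues from the opposite edge, as if the
    // tensor were laid out on a torus.

The non-circular branch is unchanged behavior: interior elements copy through and the pad region is zero-filled.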

package/src/llama.cpp/src/CMakeLists.txt

@@ -132,6 +132,7 @@ add_library(llama
            models/t5-enc.cpp
            models/wavtokenizer-dec.cpp
            models/xverse.cpp
+            models/mistral3.cpp
            models/graph-context-mamba.cpp
            )
 

package/src/llama.cpp/src/llama-arch.cpp

@@ -111,6 +111,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_COGVLM, "cogvlm" },
    { LLM_ARCH_RND1, "rnd1" },
    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
+    { LLM_ARCH_MISTRAL3, "mistral3" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
    { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+    { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
@@ -853,7 +855,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
    { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
    { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
    { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
-    { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+    { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
    { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
    { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
    { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
@@ -2512,6 +2514,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
        },
    },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
    {
        LLM_ARCH_UNKNOWN,
        {
@@ -2611,6 +2639,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
    {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
    {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
    {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},

package/src/llama.cpp/src/llama-arch.h

@@ -115,6 +115,7 @@ enum llm_arch {
    LLM_ARCH_COGVLM,
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
+    LLM_ARCH_MISTRAL3,
    LLM_ARCH_UNKNOWN,
 };
 
@@ -208,6 +209,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -377,6 +379,7 @@ enum llm_tensor {
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_DT_NORM,
    LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
    LLM_TENSOR_SSM_B_NORM,
    LLM_TENSOR_SSM_C_NORM,
    LLM_TENSOR_SSM_D,

package/src/llama.cpp/src/llama-graph.cpp

@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
    if (ubatch->pos && attn_scale) {
        const int64_t n_tokens = ubatch->n_tokens;
 
+        GGML_ASSERT(f_attn_temp_scale != 0.0f);
+        GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
        std::vector<float> attn_scale_data(n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            const float pos = ubatch->pos[i];
@@ -810,9 +813,6 @@ ggml_tensor * llm_graph_context::build_ffn(
            GGML_ABORT("fatal error");
    }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
    if (gate && type_gate == LLM_FFN_PAR) {
        cur = ggml_mul(ctx0, cur, tmp);
        cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1093,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
            GGML_ABORT("fatal error");
    }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
    cb(experts, "ffn_moe_down", il);
 

package/src/llama.cpp/src/llama-hparams.h

@@ -162,8 +162,8 @@ struct llama_hparams {
    // llama4 smallthinker
    uint32_t n_moe_layer_step = 0;
    uint32_t n_no_rope_layer_step = 4;
-    uint32_t n_attn_temp_floor_scale = 8192;
-    float f_attn_temp_scale = 0.1;
+    uint32_t n_attn_temp_floor_scale = 0;
+    float f_attn_temp_scale = 0.0f;
 
    // gemma3n altup
    uint32_t n_altup = 4; // altup_num_inputs

package/src/llama.cpp/src/llama-impl.h

@@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 template <typename T>
 struct no_init {
    T value;
-    no_init() { /* do nothing */ }
+    no_init() = default;
 };
 
 struct time_meas {
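
The one-line no_init change is subtler than it looks. Both versions leave value uninitialized on default-initialization, but `no_init() { }` is a user-provided constructor, which makes the type non-trivial; `= default` keeps it trivial (for trivial T), which is presumably the point, since trivial types permit cheaper buffer handling. A sketch of the distinction under that assumption:

    #include <type_traits>

    template <typename T>
    struct no_init {
        T value;
        no_init() = default; // defaulted, not user-provided
    };

    // With `= default` the wrapper stays trivial for trivial T; the old
    // empty-body constructor `{ /* do nothing */ }` would fail both checks.
    static_assert(std::is_trivial_v<no_init<float>>);
    static_assert(std::is_trivially_default_constructible_v<no_init<float>>);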

package/src/llama.cpp/src/llama-mmap.cpp

@@ -485,7 +485,7 @@ struct llama_mlock::impl {
        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
            suggest = false;
        }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
            suggest = false;
        }
 #endif
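
Finally, the llama-mmap.cpp change hardens a comparison whose operand types are platform-defined: the width and signedness of rlim_t vary across systems, while size is a size_t, so the unwidened `lock_limit.rlim_max > lock_limit.rlim_cur + size` went through whatever implicit conversions the platform's types implied. Casting both sides to uint64_t pins the addition and the comparison to one well-defined unsigned type. An illustrative reduction, with stand-in types rather than the real rlimit definitions:

    #include <cassert>
    #include <cstdint>

    int main() {
        // Suppose rlim_t were a signed 32-bit type on some platform:
        int32_t  rlim_cur = INT32_MAX;
        int32_t  rlim_max = INT32_MAX;
        uint64_t size     = 1;

        // Widening both operands first, as the patched condition does,
        // makes the intended 64-bit unsigned comparison explicit:
        assert(((uint64_t)rlim_max > (uint64_t)rlim_cur + size) == false);
        return 0;
    }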