@fugood/llama.node 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -1
- package/lib/binding.js +3 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +9 -0
- package/lib/index.ts +10 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -11
- package/src/LlamaContext.cpp +24 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/CMakeLists.txt +21 -6
- package/src/llama.cpp/common/CMakeLists.txt +6 -0
- package/src/llama.cpp/common/arg.cpp +83 -22
- package/src/llama.cpp/common/chat-parser.cpp +40 -0
- package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
- package/src/llama.cpp/common/chat-peg-parser.h +105 -0
- package/src/llama.cpp/common/chat.cpp +40 -29
- package/src/llama.cpp/common/chat.h +10 -1
- package/src/llama.cpp/common/common.cpp +70 -7
- package/src/llama.cpp/common/common.h +23 -5
- package/src/llama.cpp/common/download.cpp +18 -8
- package/src/llama.cpp/common/download.h +3 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +18 -27
- package/src/llama.cpp/common/log.h +19 -12
- package/src/llama.cpp/common/peg-parser.cpp +1712 -0
- package/src/llama.cpp/common/peg-parser.h +459 -0
- package/src/llama.cpp/common/unicode.cpp +64 -0
- package/src/llama.cpp/common/unicode.h +22 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +30 -1
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -6
- package/src/llama.cpp/src/llama-hparams.h +2 -2
- package/src/llama.cpp/src/llama-impl.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +54 -6
- package/src/llama.cpp/src/llama-quant.cpp +0 -29
- package/src/llama.cpp/src/llama-vocab.cpp +1 -2
- package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
- package/src/llama.cpp/src/models/mistral3.cpp +160 -0
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/unicode.cpp +2 -2
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -117,8 +117,7 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
 #endif
 
 #if defined(__MMA__)
-
-typedef __vector_quad acc_t;
+#include "sgemm-ppc.h"
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED FUSED MULTIPLY ADD
@@ -1573,95 +1572,35 @@ class tinyBLAS_BF16_PPC {
     const int nth;
 };
 
-template <typename TA>
-
-
-
-
-
-                    float *C, int64_t ldc,
-                    int ith, int nth)
+template <typename TA>
+tinyBLAS_Q0_PPC<TA>::tinyBLAS_Q0_PPC(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
+                    int ith, int nth)
     : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    kc = 64;
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    template<int size>
-    inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
-        vector signed int vec_C[4];
-        vector float CA[4] = {0};
-        vector float res[4] = {0};
-        __builtin_mma_disassemble_acc(vec_C, ACC);
-        for (int i = 0; i < 4; i++) {
-            CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
-            res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
-            fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
-        }
-    }
-    /* This function processes quantized data from block_q4_0 elements.
-     * First the we try to extract the two int4 values stored in single int8_t into two signed int8.
-     * And then we subtract each of the resultant element with 8, to convert signed int8 to unsigned int8.
-     * Also compute the rowsum which is required to compensate the above conversion. */
-    inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
-        const vector signed char lowMask = vec_splats((signed char)0xF);
-        const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-        const vector signed char v8 = vec_splats((signed char)0x8);
-        vector signed int vsum = {0};
-        vector signed int vsum2 = {0};
-        c[0] = vec_and(c[1], lowMask);
-        c[1] = vec_sr(c[1], v4);
-        c[0] = vec_sub(c[0], v8);
-        c[1] = vec_sub(c[1], v8);
-        vsum = vec_sum4s(c[0], vsum);
-        vsum2 = vec_sum4s(c[1], vsum2);
-        vsum = vec_add(vsum, vsum2);
-        *(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
-    }
-
-    template <typename V1, typename V2>
-    inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
-        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
-        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
-        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
-        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
-        V2 t1, t2, t3, t4, t5, t6, t7, t8;
-        vector unsigned char xor_vector;
-        uint8_t flip_vec = 0x80;
-        xor_vector = vec_splats(flip_vec);
-        t1 = vec_perm(s1, s2, swiz1);
-        t2 = vec_perm(s1, s2, swiz2);
-        t3 = vec_perm(s3, s4, swiz1);
-        t4 = vec_perm(s3, s4, swiz2);
-        t5 = vec_perm(t1, t3, swiz3);
-        t6 = vec_perm(t1, t3, swiz4);
-        t7 = vec_perm(t2, t4, swiz3);
-        t8 = vec_perm(t2, t4, swiz4);
-        if (flip == true) {
-            t5 = vec_xor(t5, xor_vector);
-            t6 = vec_xor(t6, xor_vector);
-            t7 = vec_xor(t7, xor_vector);
-            t8 = vec_xor(t8, xor_vector);
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::matmul(int64_t m, int64_t n) {
+    int mc = 64; int nc = 64;
+    if (n % 8 == 0 && n < nc) {
+        nc = n;
+        mc = 32 ;
+        kc = 32;
+    }
+    const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0);
+    if (is_aligned) {
+        this->matmul_tiled_q0(m, n, mc, nc, kc);
+    } else {
+        mnpack(0, m, 0, n);
     }
-        vec_xst(t5, 0, vecOffset);
-        vec_xst(t6, 0, vecOffset+16);
-        vec_xst(t7, 0, vecOffset+32);
-        vec_xst(t8, 0, vecOffset+48);
 }
 
-
-
+template<typename TA>
+template<int size>
+void tinyBLAS_Q0_PPC<TA>::packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
     int64_t i, j;
     TA *aoffset = NULL;
     int8_t *vecOffset = NULL;
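A note on the hunk above: tinyBLAS_Q0_PPC's member functions move out of the class body, and the new `matmul` entry point only takes the blocked `matmul_tiled_q0` path when every dimension is an exact multiple of its tile size, tested with a power-of-two mask. A minimal standalone sketch of that check (the helper name `is_tileable` is ours, not the diff's):

```cpp
#include <cstdint>
#include <cstdio>

// Since the block sizes are always powers of two (64 or 32 in the diff),
// (x & (block - 1)) == 0 is equivalent to x % block == 0.
static bool is_tileable(int64_t m, int64_t n, int64_t k,
                        int64_t mc, int64_t nc, int64_t kc) {
    // All three dimensions must divide their block size evenly for the
    // fast tiled path; otherwise the code falls back to mnpack.
    return ((m & (mc - 1)) == 0) && ((n & (nc - 1)) == 0) && ((k & (kc - 1)) == 0);
}

int main() {
    printf("%d\n", is_tileable(128, 128, 256, 64, 64, 64)); // 1: tiled path
    printf("%d\n", is_tileable(100, 128, 256, 64, 64, 64)); // 0: fallback path
}
```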
@@ -1781,8 +1720,10 @@ class tinyBLAS_Q0_PPC {
         }
     }
 }
+
+template<typename TA>
 template<typename VA, typename VB>
-void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+void tinyBLAS_Q0_PPC<TA>::packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
     int64_t i, j;
     block_q8_0 *aoffset = NULL;
     VA *vecOffset = NULL;
@@ -1822,7 +1763,6 @@ class tinyBLAS_Q0_PPC {
             j--;
         } while(j > 0);
     }
-
     if (rows & 4) {
         aoffsets[0] = aoffset;
         for (int it = 1; it < 4; it++ )
@@ -1878,7 +1818,8 @@ class tinyBLAS_Q0_PPC {
         }
     }
 
-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
     int m_rem = MIN(m - m0, 16);
     int n_rem = MIN(n - n0, 16);
 
@@ -1915,7 +1856,8 @@ class tinyBLAS_Q0_PPC {
     }
 
 
-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_4x8(int64_t ii, int64_t jj) {
     vec_t vec_A[8], vec_B[16] = {0};
     acc_t acc_0, acc_1;
     std::array<int, 4> comparray {};
@@ -1953,14 +1895,15 @@ class tinyBLAS_Q0_PPC {
             aoffset += lda;
         }
     }
-        compute
-        compute
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 0, 4, comparray, vs, fin_res);
     }
     save_res(ii, jj, 0, fin_res);
     save_res(ii, jj+4, 4, fin_res);
 }
 
-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x4(int64_t ii, int64_t jj) {
     vec_t vec_A[16], vec_B[8] = {0};
     acc_t acc_0, acc_1;
     std::array<int, 8> comparray {};
@@ -1997,16 +1940,18 @@ class tinyBLAS_Q0_PPC {
             aoffset += lda;
         }
     }
-        compute
-        compute
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 4, 4, comparray, vs, fin_res);
     }
     save_res(ii, jj, 0, fin_res);
     save_res(ii+4, jj, 4, fin_res);
 }
 
-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x8(int64_t ii, int64_t jj) {
     vec_t vec_A[16], vec_B[16] = {0};
     acc_t acc_0, acc_1, acc_2, acc_3;
+    acc_t acc_4, acc_5, acc_6, acc_7;
     std::array<int, 8> comparray {};
     vector float fin_res[16] = {0};
     vector float vs[16] = {0};
@@ -2046,10 +1991,10 @@ class tinyBLAS_Q0_PPC {
             aoffset += lda;
         }
     }
-        compute
-        compute
-        compute
-        compute
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 4, 4, comparray, vs, fin_res);
+        compute(&acc_2, 0, 8, comparray, vs, fin_res);
+        compute(&acc_3, 4, 12, comparray, vs, fin_res);
     }
     save_res(ii, jj, 0, fin_res);
     save_res(ii+4, jj, 4, fin_res);
@@ -2057,7 +2002,8 @@ class tinyBLAS_Q0_PPC {
     save_res(ii+4, jj+4, 12, fin_res);
 }
 
-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
     int64_t ytiles = (m - m0) / RM;
     int64_t xtiles = (n - n0) / RN;
     int64_t tiles = xtiles * ytiles;
@@ -2125,21 +2071,9 @@ class tinyBLAS_Q0_PPC {
         }
     }
 
-    template<
-    inline void kernel(int64_t ii, int64_t jj) {
-        if constexpr(RM == 4 && RN == 8) {
-            KERNEL_4x8(ii,jj);
-        } else if constexpr(RM == 8 && RN == 4) {
-            KERNEL_8x4(ii,jj);
-        } else if constexpr(RM == 8 && RN == 8) {
-            KERNEL_8x8(ii,jj);
-        } else {
-            assert(false && "RN/RM values not supported");
-        }
-    }
-
+template<typename TA>
 template <int RM, int RN>
-NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+NOINLINE void tinyBLAS_Q0_PPC<TA>::gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
     int64_t ytiles = (m - m0) / RM;
     int64_t xtiles = (n - n0) / RN;
     int64_t tiles = xtiles * ytiles;
@@ -2151,20 +2085,12 @@ class tinyBLAS_Q0_PPC {
     for (int64_t job = start; job < end; ++job) {
         int64_t ii = m0 + job / xtiles * RM;
         int64_t jj = n0 + job % xtiles * RN;
-        kernel<RM, RN>(ii, jj);
+        this->kernel<RM, RN>(ii, jj);
     }
 }
 
-
-
-    float *C;
-    const int64_t k;
-    const int64_t lda;
-    const int64_t ldb;
-    const int64_t ldc;
-    const int ith;
-    const int nth;
-};
+template class tinyBLAS_Q0_PPC<block_q4_0>;
+template class tinyBLAS_Q0_PPC<block_q8_0>;
 
 class tinyBLAS_PPC {
   public:
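The two `template class` lines closing the hunk above are explicit instantiations: once the member definitions live outside the class body in a .cpp file, the compiler has to be told which concrete types to emit code for. A self-contained sketch of the same pattern, with illustrative names only:

```cpp
#include <cstdio>

template <typename T>
struct Blas {
    void run(); // declared with the class, defined out of line below
};

template <typename T>
void Blas<T>::run() { // out-of-line definition, as the diff now does
    printf("element size: %zu\n", sizeof(T));
}

// Without these, callers in other translation units would hit unresolved
// symbols, because nothing in this file would otherwise instantiate run().
template struct Blas<float>;
template struct Blas<double>;

int main() {
    Blas<float>{}.run();
    Blas<double>{}.run();
}
```

This mirrors why the diff instantiates exactly `block_q4_0` and `block_q8_0`: those are the only element types the surrounding GEMM code dispatches to.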
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -6383,7 +6383,7 @@ static void ggml_compute_forward_im2col_3d_f16(
     const int64_t iih = ioh*s1 + ikh*d1 - p1;
     const int64_t iid = iod*s2 + ikd*d2 - p2;
 
-    if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW
+    if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
     } else {
         const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
@@ -6554,8 +6554,13 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
     ggml_compute_forward_mul_mat(params, &dst);
 }
 
+static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
+    return (coord + size) % size; // adding size avoids negative number weirdness
+}
+
 // ggml_compute_forward_conv_2d
 
+
 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
                                               const ggml_tensor * kernel, // [KW, KH, IC, OC]
                                               const ggml_tensor * src,    // [W, H, C, N]
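The helper added above compensates for C++'s truncating `%`, which keeps the sign of the dividend: `-1 % 5` is `-1`, not `4`. Adding `size` before taking the remainder yields a proper wrap-around index for any `coord >= -size`, which is the range the circular-pad code below feeds it. A standalone sketch:

```cpp
#include <cstdint>
#include <cstdio>

// Same idea as ggml_wrap_around in the hunk above.
static int64_t wrap_around(int64_t coord, int64_t size) {
    return (coord + size) % size;
}

int main() {
    printf("%lld\n", (long long)(-1 % 5));           // -1: raw C++ modulo keeps the sign
    printf("%lld\n", (long long)wrap_around(-1, 5)); //  4: wrapped to the far edge
    printf("%lld\n", (long long)wrap_around(5, 5));  //  0: wraps at the top as well
}
```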
@@ -7420,6 +7425,65 @@ static void ggml_compute_forward_upscale_f32(
                 }
             }
         }
+    } else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) {
+        // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
+        // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+        auto triangle_filter = [](float x) -> float {
+            return std::max(1.0f - fabsf(x), 0.0f);
+        };
+
+        // support and invscale, minimum 1 pixel for bilinear
+        const float support1 = std::max(1.0f, 1.0f / sf1);
+        const float invscale1 = 1.0f / support1;
+        const float support0 = std::max(1.0f, 1.0f / sf0);
+        const float invscale0 = 1.0f / support0;
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float) i1 + pixel_offset) / sf1;
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float) i0 + pixel_offset) / sf0;
+
+                        // the range of source pixels that contribute
+                        const int64_t x_min = std::max<int64_t>(x - support0 + pixel_offset, 0);
+                        const int64_t x_max = std::min<int64_t>(x + support0 + pixel_offset, ne00);
+                        const int64_t y_min = std::max<int64_t>(y - support1 + pixel_offset, 0);
+                        const int64_t y_max = std::min<int64_t>(y + support1 + pixel_offset, ne01);
+
+                        // bilinear filter with antialiasing
+                        float val = 0.0f;
+                        float total_weight = 0.0f;
+
+                        for (int64_t sy = y_min; sy < y_max; sy++) {
+                            const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
+
+                            for (int64_t sx = x_min; sx < x_max; sx++) {
+                                const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
+                                const float weight = weight_x * weight_y;
+
+                                if (weight <= 0.0f) {
+                                    continue;
+                                }
+
+                                const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03);
+                                val += pixel * weight;
+                                total_weight += weight;
+                            }
+                        }
+
+                        if (total_weight > 0.0f) {
+                            val /= total_weight;
+                        }
+
+                        float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                        *dst_ptr = val;
+                    }
+                }
+            }
+        }
     } else if (mode == GGML_SCALE_MODE_BILINEAR) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
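For orientation, the block above is a triangle-filter (tent) resampler: every source pixel within `support` of the mapped coordinate contributes, and the accumulated value is renormalized by the total weight, which is what suppresses aliasing when downscaling. A 1-D sketch of the same scheme with made-up sizes (8 samples down to 4):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    const int64_t n_src = 8, n_dst = 4;
    const float sf = (float) n_dst / n_src;          // scale factor < 1: downscale
    const float support = std::max(1.0f, 1.0f / sf); // filter widens when shrinking
    const float invscale = 1.0f / support;

    for (int64_t i = 0; i < n_dst; i++) {
        const float x = ((float) i + 0.5f) / sf;     // 0.5f = half-pixel offset
        const int64_t x_min = std::max<int64_t>((int64_t)(x - support + 0.5f), 0);
        const int64_t x_max = std::min<int64_t>((int64_t)(x + support + 0.5f), n_src);
        float val = 0.0f, total = 0.0f;
        for (int64_t s = x_min; s < x_max; s++) {
            // tent weight, 1 at the center, linearly falling to 0 at +/- support
            const float w = std::max(1.0f - fabsf((s - x + 0.5f) * invscale), 0.0f);
            val   += src[s] * w;
            total += w;
        }
        printf("dst[%lld] = %f\n", (long long) i, total > 0.0f ? val / total : 0.0f);
    }
}
```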
@@ -7532,6 +7596,7 @@ void ggml_compute_forward_upscale(
 
 // ggml_compute_forward_pad
 
+template<bool circular_t>
 static void ggml_compute_forward_pad_f32(
     const ggml_compute_params * params,
           ggml_tensor * dst) {
@@ -7556,23 +7621,40 @@ static void ggml_compute_forward_pad_f32(
     const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
     const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
 
-
     // TODO: optimize
 
     for (int64_t i2 = 0; i2 < ne2; ++i2) {
         for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
             for (int64_t i0 = 0; i0 < ne0; ++i0) {
                 for (int64_t i3 = 0; i3 < ne3; ++i3) {
-
-                    if (
-
-
-
-                    const int64_t
+                    // circular means wrap around on a torus, so x and y loop around
+                    if constexpr (circular_t) {
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        const int64_t src_i0 = ggml_wrap_around(i0 - lp0, ne00);
+                        const int64_t src_i1 = ggml_wrap_around(i1 - lp1, ne01);
+                        const int64_t src_i2 = ggml_wrap_around(i2 - lp2, ne02);
+                        const int64_t src_i3 = ggml_wrap_around(i3 - lp3, ne03);
+
+                        const int64_t src_idx =
+                            src_i3*nb03 +
+                            src_i2*nb02 +
+                            src_i1*nb01 +
+                            src_i0*nb00;
+
                         const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                         dst_ptr[dst_idx] = *src_ptr;
                     } else {
-
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                            && (i1 >= lp1 && i1 < ne1 - rp1) \
+                            && (i2 >= lp2 && i2 < ne2 - rp2) \
+                            && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                            const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                            const float * src_ptr = (const float *)((char *) src0->data + src_idx);
+                            dst_ptr[dst_idx] = *src_ptr;
+                        } else {
+                            dst_ptr[dst_idx] = 0;
+                        }
                     }
                 }
             }
@@ -7580,16 +7662,20 @@ static void ggml_compute_forward_pad_f32(
     }
 }
 
+
 void ggml_compute_forward_pad(
     const ggml_compute_params * params,
           ggml_tensor * dst) {
-
     const ggml_tensor * src0 = dst->src[0];
-
+    const bool circular = (bool) ggml_get_op_params_i32(dst, 8);
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-
+                if (circular) {
+                    ggml_compute_forward_pad_f32<true>(params, dst);
+                } else {
+                    ggml_compute_forward_pad_f32<false>(params, dst);
+                }
             } break;
         default:
             {
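The `circular` flag is read once from the op params and dispatched to a `<true>`/`<false>` specialization, so the wrap-around-vs-zero-pad decision in `ggml_compute_forward_pad_f32` is resolved by `if constexpr` at compile time rather than branching per element. A minimal sketch of that pattern in one dimension (illustrative names, not the ggml API):

```cpp
#include <cstdio>

template <bool circular>
static void pad_row(const int* src, int n_src, int* dst, int n_dst, int lp) {
    for (int i = 0; i < n_dst; i++) {
        if constexpr (circular) {
            // wrap around: out-of-range indices loop back into the source
            dst[i] = src[((i - lp) % n_src + n_src) % n_src];
        } else {
            // zero padding: out-of-range indices produce 0
            const int j = i - lp;
            dst[i] = (j >= 0 && j < n_src) ? src[j] : 0;
        }
    }
}

int main() {
    const int src[3] = {1, 2, 3};
    int dst[5];
    pad_row<false>(src, 3, dst, 5, 1);          // left pad of 1, zero fill
    for (int v : dst) printf("%d ", v);          // 0 1 2 3 0
    printf("\n");
    pad_row<true>(src, 3, dst, 5, 1);           // left pad of 1, circular fill
    for (int v : dst) printf("%d ", v);          // 3 1 2 3 1
    printf("\n");
}
```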
--- a/package/src/llama.cpp/src/llama-arch.cpp
+++ b/package/src/llama.cpp/src/llama-arch.cpp
@@ -111,6 +111,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COGVLM, "cogvlm" },
     { LLM_ARCH_RND1, "rnd1" },
     { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
+    { LLM_ARCH_MISTRAL3, "mistral3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+    { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
@@ -853,7 +855,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
     { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
     { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
-    {
+    { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
     { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
     { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
     { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
@@ -2512,6 +2514,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
     },
 },
+{
+    LLM_ARCH_MISTRAL3,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+        { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+        { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+    },
+},
 {
     LLM_ARCH_UNKNOWN,
     {
@@ -2611,6 +2639,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
     {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
     {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
--- a/package/src/llama.cpp/src/llama-arch.h
+++ b/package/src/llama.cpp/src/llama-arch.h
@@ -115,6 +115,7 @@ enum llm_arch {
     LLM_ARCH_COGVLM,
     LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
+    LLM_ARCH_MISTRAL3,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -208,6 +209,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -377,6 +379,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_DT_NORM,
     LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
     LLM_TENSOR_SSM_B_NORM,
     LLM_TENSOR_SSM_C_NORM,
     LLM_TENSOR_SSM_D,
--- a/package/src/llama.cpp/src/llama-graph.cpp
+++ b/package/src/llama.cpp/src/llama-graph.cpp
@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && attn_scale) {
         const int64_t n_tokens = ubatch->n_tokens;
 
+        GGML_ASSERT(f_attn_temp_scale != 0.0f);
+        GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
         std::vector<float> attn_scale_data(n_tokens, 0.0f);
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
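The two new asserts guard divisors: `n_attn_temp_floor_scale` divides the token position when deriving the per-token attention temperature, so a zero value left over from an unset hparam would be fatal here. A standalone sketch of llama4-style temperature scaling in that spirit (the constants, and the exact formula, are illustrative rather than lifted from this diff):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float    f_attn_temp_scale       = 0.1f;  // must be non-zero per the assert
    const unsigned n_attn_temp_floor_scale = 8192;  // must be non-zero: used as a divisor

    const unsigned positions[] = {0, 8192, 65536};
    for (unsigned pos : positions) {
        // temperature grows logarithmically with the floored position bucket
        const float scale =
            logf(floorf((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0f)
            * f_attn_temp_scale + 1.0f;
        printf("pos=%6u scale=%f\n", pos, scale);
    }
}
```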
@@ -810,9 +813,6 @@ ggml_tensor * llm_graph_context::build_ffn(
         GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1093,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
--- a/package/src/llama.cpp/src/llama-hparams.h
+++ b/package/src/llama.cpp/src/llama-hparams.h
@@ -162,8 +162,8 @@ struct llama_hparams {
     // llama4 smallthinker
     uint32_t n_moe_layer_step = 0;
     uint32_t n_no_rope_layer_step = 4;
-    uint32_t n_attn_temp_floor_scale =
-    float f_attn_temp_scale = 0.
+    uint32_t n_attn_temp_floor_scale = 0;
+    float f_attn_temp_scale = 0.0f;
 
     // gemma3n altup
     uint32_t n_altup = 4; // altup_num_inputs
--- a/package/src/llama.cpp/src/llama-mmap.cpp
+++ b/package/src/llama.cpp/src/llama-mmap.cpp
@@ -485,7 +485,7 @@ struct llama_mlock::impl {
     if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
         suggest = false;
    }
-    if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+    if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
        suggest = false;
    }
 #endif
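Context for the one-line fix above: `rlim_t` is platform-defined and may be signed, while `size` is unsigned, so the old comparison mixed signed and unsigned operands under the usual arithmetic conversions, where a negative signed value silently becomes a huge unsigned one. A minimal reproduction of that trap:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    int64_t  rlim_like = -1; // a negative value in a signed rlim_t
    uint64_t size      = 1;

    // -1 is converted to 2^64 - 1 before comparing, so this prints 1
    // even though -1 < 1 mathematically (and it trips -Wsign-compare).
    printf("%d\n", rlim_like > size);

    // The diff's explicit casts keep both sides in uint64_t on purpose,
    // making the unsigned comparison visible to readers and to the compiler.
    printf("%d\n", (uint64_t)rlim_like > (uint64_t)0 + size);
}
```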