@fugood/llama.node 1.3.2 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +8 -3
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +5 -5
- package/src/LlamaCompletionWorker.cpp +33 -33
- package/src/LlamaContext.cpp +17 -16
- package/src/llama.cpp/CMakeLists.txt +4 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -37
- package/src/llama.cpp/common/common.cpp +1 -5
- package/src/llama.cpp/common/download.cpp +47 -29
- package/src/llama.cpp/common/log.cpp +6 -0
- package/src/llama.cpp/common/log.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +71 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
- package/src/llama.cpp/src/CMakeLists.txt +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +32 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +2 -1
- package/src/llama.cpp/src/llama-model.cpp +102 -0
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-sampling.cpp +10 -5
- package/src/llama.cpp/src/llama-vocab.cpp +16 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/afmoe.cpp +187 -0
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/unicode.cpp +77 -0
@@ -34,6 +34,7 @@ void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cumsum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -81,6 +82,8 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_fill(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_flash_attn_back(
         const struct ggml_compute_params * params,
@@ -96,6 +99,7 @@ void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params,
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -1600,29 +1600,52 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         return false;
     }

-    void forward_mul_mat_one_chunk(ggml_compute_params * params,
+    void forward_mul_mat_one_chunk(ggml_compute_params * params,
+                                   ggml_tensor * op,
+                                   int64_t src0_start,
+                                   int64_t src0_end,
+                                   int64_t src1_start,
+                                   int64_t src1_end) {
         const ggml_tensor * src0 = op->src[0];
         const ggml_tensor * src1 = op->src[1];
         ggml_tensor * dst = op;

         GGML_TENSOR_BINARY_OP_LOCALS

-        const void * src1_wdata = params->wdata;
         const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);

+        GGML_ASSERT(ne03 == 1 && ne13 == 1);
+        GGML_ASSERT(ne12 % ne02 == 0);
+        const int64_t r2 = ne12 / ne02;
+
+        const int64_t i12 = src1_start / ne1;
+        const int64_t i11 = src1_start - i12 * ne1;
+
+        // Determine batch index
+        const int64_t i02 = i12 / r2;
+
+        const int64_t i1 = i11;
+        const int64_t i2 = i12;
+
+        const char * src0_ptr = (const char *) src0->data + i02 * nb02;
+        const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
+        char * dst_ptr = ((char *) dst->data + (i1 * nb1 + i2 * nb2));
+
+        const int64_t nrows = src1_end - src1_start;
+        const int64_t ncols = src0_end - src0_start;
+
+        GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
+
         // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-        if (
-            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-
-
-                                                             (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        if (nrows > 3) {
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
+                                                             src0_ptr + src0_start * nb01, src1_ptr,
+                                                             nrows - (nrows % 4), ncols);
         }
-        for (int iter =
-            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-
-
-                                                             (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                                                             src0_end - src0_start);
+        for (int iter = nrows - (nrows % 4); iter < nrows; iter++) {
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
+                                                             ne01, src0_ptr + src0_start * nb01,
+                                                             src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols);
         }
     }

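Note: the reworked per-chunk kernel above splits the src1 rows of a chunk between GEMM and GEMV. A minimal standalone sketch (ours, not part of the package) of that split: rows are handled in groups of 4 by the GEMM path, and any remainder one row at a time by the GEMV path.

#include <cstdint>
#include <cstdio>

// Illustrative helper mirroring the nrows split used in forward_mul_mat_one_chunk.
static void split_rows(int64_t nrows, int64_t * gemm_rows, int64_t * gemv_rows) {
    *gemm_rows = nrows > 3 ? nrows - (nrows % 4) : 0; // GEMM only when more than 3 rows
    *gemv_rows = nrows - *gemm_rows;                  // tail rows go through GEMV
}

int main() {
    for (int64_t nrows : {1, 3, 4, 7, 10}) {
        int64_t gemm_rows = 0, gemv_rows = 0;
        split_rows(nrows, &gemm_rows, &gemv_rows);
        std::printf("nrows=%lld -> gemm=%lld gemv=%lld\n",
                    (long long) nrows, (long long) gemm_rows, (long long) gemv_rows);
    }
    return 0;
}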
@@ -1647,6 +1670,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         GGML_ASSERT(nb1 <= nb2);
         GGML_ASSERT(nb2 <= nb3);

+        // TODO: General batched mul mat for 4D tensors
+        // Currently only supports 3D tensors
+        GGML_ASSERT(ne03 == 1);
+        GGML_ASSERT(ne13 == 1);
+        GGML_ASSERT(ne3 == 1);
+
         GGML_ASSERT(src1->type == GGML_TYPE_F32);

         GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
@@ -1654,47 +1683,64 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR

         char * wdata = static_cast<char *>(params->wdata);
         const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
+        const size_t nbw2 = nbw1 * ne11;

-        assert(params->wsize >=
+        assert(params->wsize >= nbw2 * ne12);

         const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

-
-
-
-
+        // INFO: Quantization is done in planes to avoid extra complexity in chunking.
+        // Flattening dimensions not multiple of INTER_SIZE would require extra handling depending on how
+        // the planes are broadcast.
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            char * data_ptr = (char *) src1->data + i12 * nb12;
+            char * wdata_ptr = wdata + i12 * nbw2;

-
-
-
+            for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+                ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
+                                                            (void *) (wdata_ptr + i11 * nbw1), 4, ne10);
+            }
+
+            const int64_t i11_processed = ne11 - ne11 % 4;
+            for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+                from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
+            }
         }

         // disable for NUMA
         const bool disable_chunking = ggml_is_numa();

         // 4x chunks per thread
-        int64_t
-
-
-        int64_t
+        const int64_t nr0 = ggml_nrows(op->src[0]);
+
+        int nth_scaled = nth * 4;
+        int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
+        int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0;
+
+        // src1 is chunked only by full planes.
+        // When we flatten we need to address dimensions not multiple of the q8 INTER_SIZE
+        // to route them thorugh GEMV.
+        // nchunk1 = ne12 also avoids messing the chunking for models with no 3d tensors
+        // to avoid affecting their performance
+        int64_t nchunk1 = ne12;

         // Ensure minimum chunk size to avoid alignment issues with high thread counts
         // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
         const int64_t min_chunk_size = NB_COLS;
-        if (
-
+        if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
+            nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
         }

-        if (nth == 1 ||
-
+        if (nth == 1 || nchunk0 < nth || disable_chunking) {
+            nchunk0 = nth;
         }

+        const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+
         // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
         // This prevents creating too many tiny chunks that could overlap after alignment
-        const int64_t max_nchunk = (
-
-            nchunk = max_nchunk;
-        }
+        const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
+        nchunk0 = MIN(nchunk0, max_nchunk);

         if (ith == 0) {
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
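Note: the chunk-count arithmetic above targets roughly four chunks per thread along the src0 rows and then clamps so a chunk never drops below NB_COLS rows. A standalone sketch (ours, not the library's API) of that arithmetic:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Sketch of the nchunk0 computation: aim for ~4 chunks per thread over nr0 rows,
// keep each chunk at least min_chunk_size rows, and fall back to one chunk per
// thread when chunking is disabled or would not help.
static int64_t compute_nchunk0(int64_t nr0, int nth, int64_t min_chunk_size, bool disable_chunking) {
    const int64_t nth_scaled  = (int64_t) nth * 4;
    const int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;   // ceil-div
    int64_t nchunk0           = (nr0 + chunk_size0 - 1) / chunk_size0;

    if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
        nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
    }
    if (nth == 1 || nchunk0 < nth || disable_chunking) {
        nchunk0 = nth;
    }
    const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
    return std::min(nchunk0, max_nchunk);
}

int main() {
    std::printf("%lld\n", (long long) compute_nchunk0(/*nr0=*/4096, /*nth=*/8, /*min_chunk_size=*/8, false));
    return 0;
}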
@@ -1706,23 +1752,30 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         // The first chunk comes from our thread_id, the rest will get auto-assigned.
         int current_chunk = ith;

-        while (current_chunk <
-            int64_t
-            int64_t
+        while (current_chunk < nchunk0 * nchunk1) {
+            const int64_t ith0 = current_chunk % nchunk0;
+            const int64_t ith1 = current_chunk / nchunk0;
+
+            int64_t src0_start = dr0 * ith0;
+            int64_t src0_end = MIN(src0_start + dr0, nr0);
+
+            // full-plane range for src1
+            int64_t src1_start = ith1 * ne11;
+            int64_t src1_end = (ith1 + 1) * ne11;

             // Align boundaries to NB_COLS - round up to ensure all data is included
             // The chunk size limiting above ensures chunks are large enough to prevent overlaps
             src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
-            src0_end = (src0_end
-
-                src0_end = ne01;
-            }
+            src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+            src0_end = MIN(src0_end, ne01);

+            // Make sure current plane is the last one before exiting
             if (src0_start >= src0_end) {
-
+                current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+                continue;
             }

-            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end);
+            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);

             current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
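Note: the work distribution above now walks a 2D chunk grid with a single flat counter: the remainder selects the src0 row chunk and the quotient selects the src1 plane. A small sketch (ours; the constants are made up) of that decomposition:

#include <cstdint>
#include <cstdio>

// Decompose a flat chunk counter into (ith0, ith1) and map it to row ranges,
// following the index arithmetic in the hunk above.
int main() {
    const int64_t nchunk0 = 4;   // chunks along src0 rows
    const int64_t nchunk1 = 3;   // one chunk per src1 plane (ne12)
    const int64_t dr0     = 32;  // rows per src0 chunk
    const int64_t ne11    = 5;   // rows per src1 plane

    for (int64_t current_chunk = 0; current_chunk < nchunk0 * nchunk1; ++current_chunk) {
        const int64_t ith0 = current_chunk % nchunk0;
        const int64_t ith1 = current_chunk / nchunk0;

        const int64_t src0_start = dr0 * ith0;
        const int64_t src1_start = ith1 * ne11;       // full-plane range for src1
        const int64_t src1_end   = (ith1 + 1) * ne11;

        std::printf("chunk %2lld: src0 rows from %3lld, src1 rows [%2lld, %2lld)\n",
                    (long long) current_chunk, (long long) src0_start,
                    (long long) src1_start, (long long) src1_end);
    }
    return 0;
}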
@@ -73,6 +73,14 @@ static inline float op_log(float x) {
     return logf(x);
 }

+static inline float op_expm1(float x) {
+    return expf(x) - 1.0f;
+}
+
+static inline float op_softplus(float x) {
+    return (x > 20.0f) ? x : logf(1.0f + expf(x));
+}
+
 static inline float op_floor(float x) {
     return floorf(x);
 }
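Note: the softplus helper added above switches to the identity for x > 20 because log(1 + e^x) and x agree within float precision there. A standalone check (ours, not part of the package) of both helpers:

#include <cmath>
#include <cstdio>

// Compare the cutoff form of softplus against a reference, and show expm1(x) = e^x - 1.
static float softplus_ref(float x) { return (float) std::log1p(std::exp((double) x)); }
static float softplus_op(float x)  { return (x > 20.0f) ? x : std::log(1.0f + std::exp(x)); }

int main() {
    for (float x : {-5.0f, 0.0f, 5.0f, 20.0f, 30.0f}) {
        std::printf("x=%6.1f  softplus_op=%.7g  softplus_ref=%.7g  expm1=%.7g\n",
                    x, softplus_op(x), softplus_ref(x), std::expm1(x));
    }
    return 0;
}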
@@ -290,6 +298,14 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_log>(params, dst);
 }

+void ggml_compute_forward_expm1(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_expm1>(params, dst);
+}
+
+void ggml_compute_forward_softplus(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_softplus>(params, dst);
+}
+
 void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
     unary_op<op_floor>(params, dst);
 }
@@ -22,6 +22,8 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_expm1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_softplus(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -360,6 +360,13 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     for (; i + 3 < n; i += 4) {
         vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vy = ggml_v_silu_m2(vx, vl);
+        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+    }
 #endif
     for (; i < n; ++i) {
         y[i] = ggml_silu_f32(x[i]);
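Note: the RISC-V path above uses the usual strip-mining pattern: each pass asks the hardware how many lanes it can handle (vsetvl) and advances by that amount. A scalar analogue (ours, with a fixed simulated vector length) of what the loop computes:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n = 10;
    const int simulated_vlen = 4; // stands in for __riscv_vsetvl_e32m2(n - i)
    std::vector<float> x(n), y(n);
    for (int i = 0; i < n; ++i) x[i] = 0.5f * i;

    for (int i = 0, vl = 0; i < n; i += vl) {
        vl = std::min(simulated_vlen, n - i);          // elements handled this pass
        for (int j = 0; j < vl; ++j) {
            const float v = x[i + j];
            y[i + j] = v / (1.0f + std::exp(-v));      // SiLU, as computed by ggml_v_silu
        }
    }
    for (int i = 0; i < n; ++i) std::printf("%.4f ", y[i]);
    std::printf("\n");
    return 0;
}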
@@ -460,6 +467,16 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
         val = vec_mul(val, val);
         sum += (ggml_float)vec_hsum_f32x4(val);
     }
+#elif defined(__riscv_v_intrinsic)
+    vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t val = __riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], vl), mean, vl);
+        __riscv_vse32_v_f32m2(&y[i], val, vl);
+        val = __riscv_vfmul_vv_f32m2(val, val, vl);
+        vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, vl);
+    }
+    sum = (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
 #endif
     for (; i < n; ++i) {
         float val = x[i] - mean;
@@ -1416,6 +1416,16 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #endif
 }

+inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        if (i == 0) {
+            y[i] = x[i];
+        } else {
+            y[i] = y[i - 1] + x[i];
+        }
+    }
+}
+
 inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
     ggml_float sum = 0.0;
     for (int i = 0; i < n; ++i) {
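Note: the new helper above is a plain prefix sum: y[i] = x[0] + ... + x[i]. A minimal standalone exercise of the same logic (ours, copied outside the library for illustration):

#include <cstdio>

static void vec_cumsum_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = (i == 0) ? x[i] : y[i - 1] + x[i];
    }
}

int main() {
    const float x[5] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
    float y[5];
    vec_cumsum_f32(5, y, x);
    for (float v : y) std::printf("%.1f ", v); // 1.0 3.0 6.0 10.0 15.0
    std::printf("\n");
    return 0;
}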
@@ -35,6 +35,7 @@ add_library(llama
             unicode-data.cpp
             unicode.cpp
             unicode.h
+            models/afmoe.cpp
             models/apertus.cpp
             models/arcee.cpp
             models/arctic.cpp
@@ -132,6 +133,11 @@ add_library(llama
             models/graph-context-mamba.cpp
             )

+set_target_properties(llama PROPERTIES
+    VERSION ${LLAMA_INSTALL_VERSION}
+    SOVERSION 0
+)
+
 target_include_directories(llama PRIVATE .)
 target_include_directories(llama PUBLIC ../include)
 target_compile_features (llama PRIVATE cxx_std_17) # don't bump
@@ -90,6 +90,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
+    { LLM_ARCH_AFMOE, "afmoe" },
     { LLM_ARCH_ERNIE4_5, "ernie4_5" },
     { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
@@ -333,6 +334,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_AFMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        },
+    },
     {
         LLM_ARCH_LLAMA4,
         {
@@ -2444,6 +2475,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -94,6 +94,7 @@ enum llm_arch {
     LLM_ARCH_BAILINGMOE2,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
+    LLM_ARCH_AFMOE,
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
@@ -312,6 +313,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_ATTN_SINKS,
+    LLM_TENSOR_ATTN_GATE,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
@@ -1592,9 +1592,10 @@ ggml_tensor * llm_graph_context::build_attn(
         int il) const {
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
+    // expand k later to enable rope fusion which directly writes into k-v cache
     ggml_build_forward_expand(gf, q_cur);
-    ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
+    ggml_build_forward_expand(gf, k_cur);

     const auto * mctx_cur = inp->mctx;

@@ -84,6 +84,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B: return "15B";
         case LLM_TYPE_16B: return "16B";
         case LLM_TYPE_20B: return "20B";
+        case LLM_TYPE_26B: return "26B";
         case LLM_TYPE_27B: return "27B";
         case LLM_TYPE_30B: return "30B";
         case LLM_TYPE_32B: return "32B";
@@ -695,6 +696,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_AFMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+                // Set up interleaved sliding window attention (ISWA)
+                // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
+                if (hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(4);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
+                // Default to sigmoid if not set
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                }
+
+                switch (hparams.n_layer) {
+                    case 56: type = LLM_TYPE_6B; break;
+                    case 32: type = LLM_TYPE_26B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
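Note: the "3 sliding - 1 full" comment above describes the interleaved sliding-window layout configured via set_swa_pattern(4). An illustration (ours) of what that layout looks like, assuming the last layer of each group of four uses full attention; the exact mapping is whatever llama.cpp's set_swa_pattern implements:

#include <cstdio>

int main() {
    const int n_layer   = 12;
    const int n_pattern = 4; // global_attn_every_n_layers
    for (int il = 0; il < n_layer; ++il) {
        const bool sliding = (il % n_pattern) < (n_pattern - 1); // assumed convention
        std::printf("layer %2d: %s\n", il, sliding ? "sliding window" : "full attention");
    }
    return 0;
}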
@@ -5749,6 +5781,71 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
+            case LLM_ARCH_AFMOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // dual attention normalization
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        // attention projections
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        // Q/K normalization
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        // attention gating
+                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+
+                        // dual ffn normalization
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
+                            // MoE layers
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+
+                            // grouped expert weights
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+                            // shared expert
+                            if (n_expert_shared > 0) {
+                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+                            }
+                        } else {
+                            // Dense layers
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        }
+                    }
+                } break;
            case LLM_ARCH_ERNIE4_5:
            case LLM_ARCH_ERNIE4_5_MOE:
                {
@@ -7243,6 +7340,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params);
             } break;
+        case LLM_ARCH_AFMOE:
+            {
+                llm = std::make_unique<llm_build_afmoe>(*this, params);
+            } break;
         case LLM_ARCH_ERNIE4_5:
             {
                 llm = std::make_unique<llm_build_ernie4_5>(*this, params);
@@ -7528,6 +7629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_MINIMAX_M2:
         case LLM_ARCH_COGVLM:
         case LLM_ARCH_PANGU_EMBED:
+        case LLM_ARCH_AFMOE:
             return LLAMA_ROPE_TYPE_NEOX;

         case LLM_ARCH_QWEN2VL:
@@ -76,6 +76,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_26B,
     LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
@@ -234,6 +235,7 @@ struct llama_layer {
     struct ggml_tensor * wk_enc = nullptr;
     struct ggml_tensor * wv_enc = nullptr;
     struct ggml_tensor * wo_enc = nullptr;
+    struct ggml_tensor * wqkv_gate = nullptr;

     // attention bias
     struct ggml_tensor * bq = nullptr;
@@ -4,6 +4,7 @@
 #include "llama-vocab.h"
 #include "llama-grammar.h"

+#include <array>
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
@@ -1625,10 +1626,12 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
     auto * ctx = new llama_sampler_grammar;

     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        std::string trigger_pattern;
+        llama_grammar * grammar = nullptr;
         // TODO: remove trigger_words support.
         if (trigger_words != nullptr && num_trigger_words > 0) {
             GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
-
+            trigger_pattern = "[\\s\\S]*?(";
             for (size_t i = 0; i < num_trigger_words; ++i) {
                 static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
                 if (i > 0) {
@@ -1637,15 +1640,17 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                 trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
             }
             trigger_pattern += ")[\\s\\S]*";
-
-
-
+
+            std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
+            grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
+        } else {
+            grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
         }
         *ctx = {
             /* .vocab = */ vocab,
             /* .grammar_str = */ grammar_str,
             /* .grammar_root = */ grammar_root,
-            /* .grammar = */
+            /* .grammar = */ grammar,
         };
         if (!ctx->grammar) {
             delete ctx;
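Note: the sampler change above assembles a lazy-grammar trigger regex by escaping each trigger word and wrapping the alternatives so the pattern matches anywhere in the generated text. A standalone sketch (ours) of that assembly; the separator added inside the if (i > 0) branch is not shown in the hunk, so the "|" used here is an assumption, and the trigger words are made up:

#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> trigger_words = {"<tool_call>", "function("};

    std::string trigger_pattern = "[\\s\\S]*?(";
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); // same escape set as the hunk
    for (size_t i = 0; i < trigger_words.size(); ++i) {
        if (i > 0) {
            trigger_pattern += "|"; // assumed separator between alternatives
        }
        trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
    }
    trigger_pattern += ")[\\s\\S]*";

    std::printf("%s\n", trigger_pattern.c_str());
    return 0;
}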