@fugood/llama.node 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -12
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/arg.cpp +17 -0
- package/src/llama.cpp/common/chat.cpp +37 -20
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.h +4 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +181 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -2
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-batch.cpp +27 -1
- package/src/llama.cpp/src/llama-batch.h +8 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +95 -81
- package/src/llama.cpp/src/llama-graph.h +43 -16
- package/src/llama.cpp/src/llama-hparams.cpp +2 -1
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +1374 -210
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +8 -1
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h

@@ -20,6 +20,9 @@
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -53,6 +56,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -64,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -93,6 +98,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -105,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
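
The new GGML_IM2COL_WORK_SIZE constant caps the scratch memory the CONV2D path can use when it lowers a convolution to an im2col matrix and processes it chunk by chunk. A rough, hypothetical calculation of how that 16 MiB budget translates into output pixels per chunk follows; the convolution shape is an assumption and this is not the actual ops.cpp chunking code.

    // Hypothetical illustration only: how a fixed 16 MiB im2col scratch buffer
    // bounds the number of output pixels a CONV2D lowering can materialize per
    // chunk. The kernel/channel shape is an assumed example, not from ops.cpp.
    #include <cstdio>
    #include <cstddef>

    static const std::size_t IM2COL_WORK_SIZE = 16 * 1024 * 1024; // same value as GGML_IM2COL_WORK_SIZE

    int main() {
        const std::size_t c_in = 256, kh = 3, kw = 3;                       // assumed shape
        const std::size_t bytes_per_patch = c_in * kh * kw * sizeof(float); // one im2col row
        const std::size_t patches_per_chunk = IM2COL_WORK_SIZE / bytes_per_patch;
        std::printf("im2col row = %zu bytes -> %zu output pixels per chunk\n",
                    bytes_per_patch, patches_per_chunk);
        return 0;
    }
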
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h

@@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
 #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
-#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
 #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
 #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
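
The SVE FMA mapping now places the accumulator first: svmad_f32_m(pg, b, c, a) multiplies b and c and adds a per active lane, so GGML_F32_VEC_FMA(acc, x, y) reads as acc + x*y, matching the other SIMD backends and the updated call sites in vec.cpp and vec.h below. A scalar reference of the intended semantics; this only mirrors the macro's meaning and is not the SVE code itself.

    // Scalar reference: after this change GGML_F32_VEC_FMA(a, b, c) means a + b*c
    // (accumulator first, then the two multiplicands).
    #include <cstdio>

    static inline float vec_fma_ref(float a, float b, float c) {
        return a + b * c;
    }

    int main() {
        const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        const float y[4] = {5.0f, 6.0f, 7.0f, 8.0f};
        float sum = 0.0f;
        for (int i = 0; i < 4; ++i) {
            sum = vec_fma_ref(sum, x[i], y[i]); // same call shape as the fixed loops in vec.cpp
        }
        std::printf("dot = %.1f\n", sum); // 1*5 + 2*6 + 3*7 + 4*8 = 70.0
        return 0;
    }
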
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp

@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
         for (int i = 0; i < np; i += ggml_f32_step) {
             ax1 = GGML_F32_VEC_LOAD(x + i);
             ay1 = GGML_F32_VEC_LOAD(y + i);
-            sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+            sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
 
             ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
             ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-            sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
+            sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
 
             ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
             ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
-            sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
+            sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
 
             ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
             ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
-            sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
+            sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
 
             ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
             ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
-            sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
+            sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
 
             ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
             ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
-            sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
+            sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
 
             ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
             ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
-            sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
+            sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
 
             ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
             ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
-            sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
+            sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
         }
         // leftovers
         // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
         for (int i = np; i < np2; i += ggml_f32_epr) {
             ax1 = GGML_F32_VEC_LOAD(x + i);
             ay1 = GGML_F32_VEC_LOAD(y + i);
-            sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+            sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
         }
         // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
         if (np2 < n) {
@@ -254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     }
 }
 
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]) * g[i];
+    }
+}
+
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
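
The new ggml_vec_swiglu_f32 computes y[i] = silu(x[i]) * g[i], with silu(v) = v * sigmoid(v); the AVX-512/AVX2/SSE2/NEON branches evaluate exactly that expression lane-wise and the scalar tail handles the remainder. A plain scalar sketch of the same computation, for reading only:

    // Scalar sketch of what ggml_vec_swiglu_f32 computes; not a replacement for
    // the SIMD paths above.
    #include <cmath>
    #include <cstdio>

    static void swiglu_ref(int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) {
            const float v = x[i];
            y[i] = (v / (1.0f + std::exp(-v))) * g[i]; // silu(x) * gate
        }
    }

    int main() {
        const float x[3] = {-1.0f, 0.0f, 2.0f};
        const float g[3] = { 0.5f, 1.0f, 2.0f};
        float y[3];
        swiglu_ref(3, y, x, g);
        for (int i = 0; i < 3; ++i) {
            std::printf("y[%d] = %f\n", i, y[i]);
        }
        return 0;
    }
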
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 
             ax1 = GGML_F32_VEC_LOAD(x + i);
             ay1 = GGML_F32_VEC_LOAD(y + i);
-            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
 
             GGML_F32_VEC_STORE(y + i, ay1);
 
             ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
             ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-            ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
+            ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
 
             GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
 
             ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
             ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
-            ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
+            ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
 
             GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
 
             ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
             ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
-            ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
+            ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
 
             GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
 
             ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
             ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
-            ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
+            ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
 
             GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
 
             ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
             ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
-            ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
+            ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
 
             GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
 
             ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
             ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
-            ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
+            ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
 
             GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
 
             ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
             ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
-            ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
+            ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
 
             GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
         }
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
         for (int i = np; i < np2; i += ggml_f32_epr) {
             ax1 = GGML_F32_VEC_LOAD(x + i);
             ay1 = GGML_F32_VEC_LOAD(y + i);
-            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
 
             GGML_F32_VEC_STORE(y + i, ay1);
         }
@@ -905,6 +905,100 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
     }
 }
 
+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+    }
+}
+
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i] * g[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+        }
+    }
+}
+#else
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
+    }
+}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
+    }
+}
+#else
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
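
All of the new vec.h helpers share one shape: an activation applied to x, multiplied element-wise by a gate g (REGLU uses ReLU, GEGLU a GELU, SwiGLU a SiLU, with additional erf and "quick" GELU variants and FP16 lookup-table fast paths). A compact scalar sketch of that pattern, purely illustrative:

    // Gated-activation pattern behind the helpers above: y = act(x) * g.
    #include <cmath>
    #include <cstdio>

    static float relu_act(float v)     { return v > 0.0f ? v : 0.0f; }                             // reglu
    static float gelu_erf_act(float v) { return 0.5f * v * (1.0f + std::erf(v * 0.70710678f)); }   // geglu_erf
    static float silu_act(float v)     { return v / (1.0f + std::exp(-v)); }                       // swiglu

    int main() {
        const float x = 1.5f, g = 0.25f;
        std::printf("reglu     = %f\n", relu_act(x) * g);
        std::printf("geglu_erf = %f\n", gelu_erf_act(x) * g);
        std::printf("swiglu    = %f\n", silu_act(x) * g);
        return 0;
    }
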
package/src/llama.cpp/src/llama-arch.cpp

@@ -45,6 +45,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA3N, "gemma3n" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_MAMBA2, "mamba2" },
+    { LLM_ARCH_FALCON_H1, "falcon-h1" },
     { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_COHERE2, "cohere2" },
@@ -76,6 +78,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
+    { LLM_ARCH_ERNIE4_5, "ernie4_5" },
+    { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+    { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -169,6 +174,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
     { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
     { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
 
     { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
@@ -1003,6 +1009,46 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
         },
     },
+    {
+        LLM_ARCH_MAMBA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+        },
+    },
+    {
+        LLM_ARCH_FALCON_H1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_XVERSE,
         {
@@ -1658,12 +1704,69 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         }
     },
+    {
+        LLM_ARCH_ERNIE4_5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_HUNYUAN_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 };
 
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1743,6 +1846,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
     {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1876,6 +1980,7 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
 bool llm_arch_is_recurrent(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_MAMBA2:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
@@ -1887,9 +1992,10 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
 }
 
 bool llm_arch_is_hybrid(const llm_arch & arch) {
-    // TODO: There are currently no hybrid models! Once there are, this will be
-    // the place to identify them
+    // List all mamba-attention hybrid models here
     switch (arch) {
+        case LLM_ARCH_FALCON_H1:
+            return true;
         default:
             return false;
     }
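
Alongside the new architecture registrations (mamba2, falcon-h1, ernie4_5, hunyuan-moe, smollm3), the metadata tables gain an "%s.ssm.group_count" key and an SSM_NORM tensor for the Mamba-2-style SSM layers, and falcon-h1 is now classified as a hybrid (recurrent + attention) architecture. As a hedged illustration, such a key can be inspected with ggml's public gguf API; the file name and the "mamba2." prefix below are assumptions, and whether the key is present depends on the model.

    // Hedged sketch: probe the new "%s.ssm.group_count" metadata key in a GGUF
    // file using ggml's gguf API. "model.gguf" is a placeholder path.
    #include "gguf.h"
    #include <cstdio>

    int main() {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == nullptr) {
            std::printf("failed to open model.gguf\n");
            return 1;
        }

        const int64_t key = gguf_find_key(ctx, "mamba2.ssm.group_count");
        if (key >= 0) {
            std::printf("ssm.group_count = %u\n", gguf_get_val_u32(ctx, key));
        } else {
            std::printf("ssm.group_count not present in this model\n");
        }

        gguf_free(ctx);
        return 0;
    }
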
package/src/llama.cpp/src/llama-arch.h

@@ -49,6 +49,8 @@ enum llm_arch {
     LLM_ARCH_GEMMA3N,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
+    LLM_ARCH_MAMBA2,
+    LLM_ARCH_FALCON_H1,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_COHERE2,
@@ -80,6 +82,9 @@ enum llm_arch {
     LLM_ARCH_BAILINGMOE,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
+    LLM_ARCH_ERNIE4_5,
+    LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_SMOLLM3,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -173,6 +178,7 @@ enum llm_kv {
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
     LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_GROUP_COUNT,
     LLM_KV_SSM_DT_B_C_RMS,
 
     LLM_KV_WKV_HEAD_SIZE,
@@ -292,6 +298,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
+    LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
package/src/llama.cpp/src/llama-batch.cpp

@@ -166,6 +166,8 @@ bool llama_batch_allocr::init(
 
                 // note: tracking the other way around is not necessary for now
                 //seq_cpl[s0][s1] = true;
+
+                has_cpl = true;
             }
         }
     }
@@ -405,6 +407,10 @@ uint32_t llama_batch_allocr::get_n_outputs() const {
     return n_outputs;
 }
 
+uint32_t llama_batch_allocr::get_n_used() const {
+    return n_used;
+}
+
 std::vector<int32_t> & llama_batch_allocr::get_out_ids() {
     return out_ids;
 }
@@ -420,6 +426,8 @@ llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
 void llama_batch_allocr::split_reset() {
     out_ids.clear();
 
+    n_used = 0;
+
     used.clear();
     used.resize(get_n_tokens(), false);
 
@@ -444,6 +452,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
         idxs.push_back(cur_idx);
 
         used[cur_idx] = true;
+        ++n_used;
 
         ++cur_idx;
 
@@ -459,9 +468,17 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
     return ubatch_add(idxs, idxs.size(), false);
 }
 
-llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
+llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
+    if (sequential && has_cpl) {
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+
+        return {};
+    }
+
     std::vector<seq_set_t> cur_seq_set;
 
+    llama_seq_id last_seq_id = -1;
+
     // determine the non-overlapping sequence sets participating in this ubatch
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         if (used[i]) {
@@ -478,9 +495,16 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
             }
         }
 
+        // accept only increasing sequence ids
+        if (sequential) {
+            add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);
+        }
+
         if (add) {
            cur_seq_set.push_back(seq_set[i]);
 
+            last_seq_id = batch.seq_id[i][0];
+
             if (cur_seq_set.size() > n_ubatch) {
                 break;
             }
@@ -529,6 +553,7 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
             idxs_per_seq[s].push_back(idx);
 
             used[idx] = true;
+            ++n_used;
 
             ++cur_idx[s];
         }
@@ -570,6 +595,7 @@ llama_ubatch llama_batch_allocr::split_seq(uint32_t n_ubatch) {
         idxs.push_back(cur_idx);
 
        used[cur_idx] = true;
+        ++n_used;
 
         if (idxs.size() >= n_ubatch) {
             break;
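
split_equal() now takes a sequential flag, and the allocator tracks n_used so callers can check whether the whole batch was consumed. A toy model of the sequential acceptance rule follows; it is deliberately simplified (it only looks at each token's first sequence id) and is not the llama.cpp implementation.

    // Toy model of the "sequential" rule added to split_equal(): accept a token
    // if it stays in the current sequence or starts sequence last_seq_id + 1;
    // anything else is left for a later ubatch.
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<int> seq_id = {0, 0, 1, 3, 2}; // assumed first seq id per token

        std::vector<int> accepted;
        int last_seq_id = -1;
        for (int id : seq_id) {
            const bool add = accepted.empty() || id == last_seq_id || id == last_seq_id + 1;
            if (add) {
                accepted.push_back(id);
                last_seq_id = id;
            }
        }

        for (int id : accepted) {
            std::printf("%d ", id); // prints: 0 0 1 2
        }
        std::printf("\n");
        return 0;
    }
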
package/src/llama.cpp/src/llama-batch.h

@@ -54,6 +54,7 @@ public:
 
     uint32_t get_n_tokens() const;
     uint32_t get_n_outputs() const;
+    uint32_t get_n_used() const;
 
     // the array of output indices in the order they were encountered during the ubatch splitting
     std::vector<int32_t> & get_out_ids();
@@ -69,7 +70,8 @@ public:
     llama_ubatch split_simple(uint32_t n_ubatch);
 
     // make ubatches of equal-length sequences sets
-    llama_ubatch split_equal(uint32_t n_ubatch);
+    // if sequential == true, the tokens in the ubatch will have increasing sequential sequence ids
+    llama_ubatch split_equal(uint32_t n_ubatch, bool sequential);
 
     // sequence-set-wise split - each ubatch contains a single sequence-set
     llama_ubatch split_seq(uint32_t n_ubatch);
@@ -112,6 +114,9 @@ private:
     using pos_set_t = std::set<llama_pos>;
     using seq_cpl_t = std::vector<bool>;
 
+    // helper flag to quickly determine if there are any coupled sequences in the batch
+    bool has_cpl;
+
     std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
     std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
 
@@ -125,6 +130,8 @@ private:
     // batch indices of the output
     std::vector<int32_t> out_ids;
 
+    uint32_t n_used;
+
     // used[i] indicates if token i has already been used in a previous ubatch
     std::vector<bool> used;
 
package/src/llama.cpp/src/llama-chat.cpp

@@ -64,6 +64,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "bailing", LLM_CHAT_TEMPLATE_BAILING },
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
+    { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -185,6 +186,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
+    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -665,6 +668,18 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|response|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
+        // tencent/Hunyuan-A13B-Instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
+            } else if (role == "assistant") {
+                ss << "<|startoftext|>" << message->content << "<|eos|>";
+            } else {
+                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
+            }
+        }
     } else {
         // template not supported
         return -1;