@fugood/llama.node 1.2.3 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +484 -204
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +156 -15
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/common/json-partial.cpp +51 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
- package/src/llama.cpp/include/llama.h +8 -0
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +5 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +572 -45
- package/src/llama.cpp/src/llama-model.h +18 -0
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0

package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h (new file)
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cstddef>
+
+namespace sqnbitgemm_spacemit_ime {
+namespace ime1 {
+size_t gemm_kernel_i8i4(size_t blk_len,
+                        const std::byte * quant_a_ptr,
+                        const std::byte * quant_b_data,
+                        const float * quant_b_scale,
+                        const std::byte * quant_b_zp,
+                        float * c_ptr,
+                        size_t count_m,
+                        size_t count_n,
+                        size_t count_k,
+                        size_t block_count_k,
+                        size_t ldc,
+                        const float * bias,
+                        const size_t scale_stride);
+
+void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+
+void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+
+} // namespace ime1
+} // namespace sqnbitgemm_spacemit_ime
package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp
@@ -52,6 +52,15 @@ static inline float op_sqrt(float x) {
     return sqrtf(x);
 }
 
+static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
+    if (x > 0.0f) {
+        return alpha_p * x * x + beta * x;
+    } else {
+        const float min_x_eps = fminf(x, eps);
+        return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
+    }
+}
+
 static inline float op_sin(float x) {
     return sinf(x);
 }
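For context, op_xielu added above is the piecewise xIELU activation: alpha_p*x^2 + beta*x for positive inputs, and (expm1(min(x, eps)) - x)*alpha_n + beta*x otherwise. A minimal standalone restatement for numeric sanity checks (the coefficient values below are illustrative only, not taken from any model):

#include <cmath>
#include <cstdio>

// Same formula as op_xielu above, repeated so it compiles on its own.
static float xielu_ref(float x, float alpha_n, float alpha_p, float beta, float eps) {
    if (x > 0.0f) {
        return alpha_p * x * x + beta * x;
    }
    const float min_x_eps = fminf(x, eps);
    return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
}

int main() {
    const float an = 0.8f, ap = 0.8f, b = 0.5f, e = -1e-6f; // hypothetical coefficients
    // the two branches meet at x = 0, so the activation is continuous there
    printf("%g %g %g\n", xielu_ref(-1.0f, an, ap, b, e),
                         xielu_ref( 0.0f, an, ap, b, e),
                         xielu_ref( 1.0f, an, ap, b, e));
    return 0;
}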
@@ -121,6 +130,86 @@ static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
     }
 }
 
+template <float (*op)(float, ggml_tensor *)>
+static void unary_op_params(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+// Extend vec_unary_op to support functors
+template <typename Op, typename src0_t, typename dst_t>
+static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+// Extend apply_unary_op to support functors
+template <typename Op, typename src0_t, typename dst_t>
+static void apply_unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+        vec_unary_op_functor(ne0, dst_ptr, src0_ptr, op);
+    }
+}
+
+// Generic dispatcher for functors
+template <typename Op>
+static void unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op_functor<Op, float, float>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op_functor<Op, ggml_fp16_t, ggml_fp16_t>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op_functor<Op, ggml_bf16_t, ggml_bf16_t>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op_functor<Op, ggml_bf16_t, float>(params, dst, op);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op_functor<Op, ggml_fp16_t, float>(params, dst, op);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
 void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
     unary_op<op_abs>(params, dst);
 }
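The functor overloads above exist because unary_op takes its operation as a function-pointer template parameter, which cannot carry per-node runtime state; unary_op_functor accepts any callable, so a capturing lambda can deliver parameters read from the tensor. A hypothetical extra op written against this path (illustration only, not part of the diff):

// Hypothetical: a unary op whose coefficient is per-node runtime state.
// The capturing lambda flows through unary_op_functor ->
// apply_unary_op_functor -> vec_unary_op_functor.
static void compute_forward_scaled_abs(const ggml_compute_params * params, ggml_tensor * dst) {
    const float scale = ggml_get_op_params_f32(dst, 1);
    unary_op_functor(params, dst, [scale](float f) {
        return scale * fabsf(f);
    });
}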
@@ -184,3 +273,17 @@ void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor *
 void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
     unary_op<op_log>(params, dst);
 }
+
+void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
+    const float alpha_n = ggml_get_op_params_f32(dst, 1);
+    const float alpha_p = ggml_get_op_params_f32(dst, 2);
+    const float beta    = ggml_get_op_params_f32(dst, 3);
+    const float eps     = ggml_get_op_params_f32(dst, 4);
+
+    const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
+        return op_xielu(f, alpha_n, alpha_p, beta, eps);
+    };
+
+    unary_op_functor(params, dst, xielu_op_params);
+}
+
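ggml_compute_forward_xielu reads its four coefficients from float op-param slots 1 through 4 of the destination tensor (slot 0 presumably holds the unary-op id, as with other parameterized unary ops). The producer side is not expanded in this diff view; under that assumption it would store the values when the graph node is built, along the lines of:

// Assumed producer-side sketch (not shown in this diff): the xIELU
// coefficients travel with the graph node as float op params.
ggml_set_op_params_f32(result, 1, alpha_n);
ggml_set_op_params_f32(result, 2, alpha_p);
ggml_set_op_params_f32(result, 3, beta);
ggml_set_op_params_f32(result, 4, eps);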
package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h
@@ -22,6 +22,7 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
@@ -404,6 +404,72 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
     }
 }
 
+ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
+    int i = 0;
+    ggml_float sum = 0;
+    // TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
+    // ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                   _mm512_set1_ps(mean));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                   _mm256_set1_ps(mean));
+        _mm256_storeu_ps(y + i, val);
+        val = _mm256_mul_ps(val,val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
+                                _mm_set1_ps(mean));
+        _mm_storeu_ps(y + i, val);
+        val = _mm_mul_ps(val, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif // __AVX__ || __AVX2__ || __AVX512F__
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = vsubq_f32(vld1q_f32(x + i),
+                                    vdupq_n_f32(mean));
+        vst1q_f32(y + i, val);
+        val = vmulq_f32(val, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#elif defined(__VXE__) || defined(__VXE2__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
+        vec_xst(val, 0, y + i);
+        val = vec_mul(val, val);
+        sum += (ggml_float)vec_hsum_f32x4(val);
+    }
+#endif
+    for (; i < n; ++i) {
+        float val = x[i] - mean;
+        val *= val;
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum/n;
+}
+
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
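Each SIMD branch of ggml_vec_cvar_f32 follows the same pattern: subtract the mean, store the centered value into y, and accumulate its square, returning the average of the squares, i.e. the variance around the given mean (the vec.h declaration in the next hunk documents the centering side effect). A scalar reference for that contract:

// Scalar reference for the ggml_vec_cvar_f32 contract:
// y[i] = x[i] - mean, return value = sum((x[i] - mean)^2) / n.
static double cvar_ref(int n, float * y, const float * x, float mean) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
        const float v = x[i] - mean;
        y[i] = v;              // centered output
        sum += (double) v * v; // accumulate squared deviation
    }
    return sum / n;
}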
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
@@ -44,6 +44,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
 void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 void ggml_vec_silu_f32(const int n, float * y, const float * x);
+ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
 ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
 
@@ -143,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     for (int i = 0; i < np; i += ggml_f16_step) {
         ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
 
-        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8
+        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
         sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
         ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
         sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
 
         ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
 
-        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8
+        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
         sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
         ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
         sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -159,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 
         ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
         sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-
+        ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
         sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
 
         ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
@@ -610,7 +611,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
     for (int i = 0; i < np; i += GGML_F32_STEP) {
         for (int j = 0; j < GGML_F32_ARR; j++) {
             ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
+            ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
 
             GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
         }
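The one-line change above is an argument-order fix: ggml's FMA helpers compute GGML_F32_VEC_FMA(a, b, c) = a + b*c (the AVX2 mapping, for instance, is _mm256_fmadd_ps(b, c, a)), so the old call yielded y = x + s*b, while ggml_vec_mad1_f32 is meant to compute y = x*s + b; FMA(vb, ay[j], vs) = b + x*s restores the intended result.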
@@ -654,11 +655,11 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
     }
     // leftovers
     // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
-    if (np < n) {
-        svbool_t pg = svwhilelt_b32(np, n);
-        ay1 = svld1_f32(pg, y + np);
+    for (int i = np; i < n; i += ggml_f32_epr) {
+        svbool_t pg = svwhilelt_b32(i, n);
+        ay1 = svld1_f32(pg, y + i);
         ay1 = svmul_f32_m(pg, ay1, vx);
-        svst1_f32(pg, y + np, ay1);
+        svst1_f32(pg, y + i, ay1);
     }
 #elif defined(__riscv_v_intrinsic)
     for (int i = 0, avl; i < n; i += avl) {
@@ -819,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
+        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
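This restores proper ELU semantics on the f16 path: ELU(x) = x for x > 0 and expm1(x) otherwise, matching ggml_vec_elu_f32 one line above; the removed line applied expm1f to positive inputs as well.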
package/src/llama.cpp/include/llama.h
@@ -296,6 +296,7 @@ extern "C" {
         bool use_mlock;       // force system to keep model in RAM
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
+        bool no_host;         // bypass host buffer allowing extra buffers to be used
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -543,6 +544,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
 
+    // Returns true if the model is hybrid (like Jamba, Granite, etc.)
+    LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);
+
     // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
     LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
 
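A usage sketch for the new predicate, mirroring how llama_model_is_recurrent is typically consulted (the branch bodies are illustrative):

// Sketch: hybrid models keep both an attention KV cache and a
// recurrent state, so per-sequence state handling differs.
if (llama_model_is_hybrid(model)) {
    // expect both cache types when snapshotting sequence state
} else if (llama_model_is_recurrent(model)) {
    // recurrent-only state
}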
@@ -791,8 +795,12 @@ extern "C" {
             size_t   n_token_capacity,
             size_t * n_token_count_out);
 
+    // for backwards-compat
     #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
 
+    // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
+    #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
+
     typedef uint32_t llama_state_seq_flags;
 
     LLAMA_API size_t llama_state_seq_get_size_ext(
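LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY generalizes the old SWA_ONLY name to recurrent caches as well; both constants are defined as 1, so existing callers keep compiling and behave identically. A sketch against the extended state API (buffer handling elided):

// Sketch: size only the partial part of sequence 0's state
// (SWA KV cache or recurrent cache) using the renamed flag.
const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY;
const size_t n_bytes = llama_state_seq_get_size_ext(ctx, 0, flags);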
package/src/llama.cpp/src/llama-arch.cpp
@@ -93,11 +93,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SMOLLM3,          "smollm3"      },
     { LLM_ARCH_OPENAI_MOE,       "gpt-oss"      },
     { LLM_ARCH_LFM2,             "lfm2"         },
+    { LLM_ARCH_LFM2MOE,          "lfm2moe"      },
     { LLM_ARCH_DREAM,            "dream"        },
     { LLM_ARCH_SMALLTHINKER,     "smallthinker" },
     { LLM_ARCH_LLADA,            "llada"        },
     { LLM_ARCH_LLADA_MOE,        "llada-moe"    },
     { LLM_ARCH_SEED_OSS,         "seed_oss"     },
+    { LLM_ARCH_GROVEMOE,         "grovemoe"     },
+    { LLM_ARCH_APERTUS,          "apertus"      },
     { LLM_ARCH_UNKNOWN,          "(unknown)"    },
 };
 
@@ -125,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
     { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"        },
     { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  "%s.expert_chunk_feed_forward_length"  },
     { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"             },
     { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                },
     { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },
@@ -133,6 +137,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
     { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
     { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
+    { LLM_KV_EXPERT_GROUP_SCALE,                "%s.expert_group_scale"                },
+    { LLM_KV_EXPERTS_PER_GROUP,                 "%s.experts_per_group"                 },
     { LLM_KV_MOE_EVERY_N_LAYERS,                "%s.moe_every_n_layers"                },
     { LLM_KV_NEXTN_PREDICT_LAYERS,              "%s.nextn_predict_layers"              },
     { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
@@ -213,6 +219,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CLASSIFIER_OUTPUT_LABELS,          "%s.classifier.output_labels"          },
 
     { LLM_KV_SHORTCONV_L_CACHE,                 "%s.shortconv.l_cache"                 },
+    // sentence-transformers dense modules feature dims
+    { LLM_KV_DENSE_2_FEAT_IN,                   "%s.dense_2_feat_in"                   },
+    { LLM_KV_DENSE_2_FEAT_OUT,                  "%s.dense_2_feat_out"                  },
+    { LLM_KV_DENSE_3_FEAT_IN,                   "%s.dense_3_feat_in"                   },
+    { LLM_KV_DENSE_3_FEAT_OUT,                  "%s.dense_3_feat_out"                  },
 
     { LLM_KV_TOKENIZER_MODEL,                   "tokenizer.ggml.model"                 },
     { LLM_KV_TOKENIZER_PRE,                     "tokenizer.ggml.pre"                   },
@@ -252,6 +263,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,        "adapter.lora.prompt_prefix"           },
     { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,   "adapter.alora.invocation_tokens"      },
 
+    { LLM_KV_XIELU_ALPHA_N,                     "xielu.alpha_n"                        },
+    { LLM_KV_XIELU_ALPHA_P,                     "xielu.alpha_p"                        },
+    { LLM_KV_XIELU_BETA,                        "xielu.beta"                           },
+    { LLM_KV_XIELU_EPS,                         "xielu.eps"                            },
+
     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID,               "tokenizer.ggml.prefix_token_id"       },
     { LLM_KV_TOKENIZER_SUFFIX_ID,               "tokenizer.ggml.suffix_token_id"       },
@@ -721,6 +737,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
             { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_CLS_OUT,         "cls.output" },
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -1059,6 +1076,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
             { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_DENSE_2_OUT,     "dense_2" },
+            { LLM_TENSOR_DENSE_3_OUT,     "dense_3" },
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -2093,6 +2112,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_OUTPUT,          "output" },
         }
     },
+    {
+        LLM_ARCH_LFM2MOE,
+        {
+            { LLM_TENSOR_ATTN_NORM,         "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,            "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,            "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,            "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,          "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_K_NORM,       "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,       "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_FFN_DOWN,          "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE,          "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_NORM,          "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,            "blk.%d.ffn_up" },
+            { LLM_TENSOR_SHORTCONV_CONV,    "blk.%d.shortconv.conv" },
+            { LLM_TENSOR_SHORTCONV_INPROJ,  "blk.%d.shortconv.in_proj" },
+            { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+            { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,      "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,     "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,     "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,       "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,   "blk.%d.exp_probs_b" },
+        }
+    },
     {
         LLM_ARCH_SMALLTHINKER,
         {
@@ -2114,6 +2159,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" }
         },
     },
+    {
+        LLM_ARCH_APERTUS,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_DREAM,
         {
@@ -2185,6 +2249,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GROVEMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
+            { LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
+            { LLM_TENSOR_FFN_UP_CHEXPS,   "blk.%d.ffn_up_chexps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2201,6 +2288,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_OUTPUT,          {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS,             {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT,         {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DENSE_2_OUT,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
+    {LLM_TENSOR_DENSE_3_OUT,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
     {LLM_TENSOR_OUTPUT_NORM,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
     {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
     {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
@@ -2317,6 +2406,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_DOWN_EXPS,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_UP_CHEXPS,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // altup / laurel (gemma 3n)
     {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
@@ -2437,6 +2529,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_LFM2:
+        case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
             return true;
         default:
package/src/llama.cpp/src/llama-arch.h
@@ -97,11 +97,14 @@ enum llm_arch {
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
+    LLM_ARCH_LFM2MOE,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
     LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
+    LLM_ARCH_GROVEMOE,
+    LLM_ARCH_APERTUS,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -129,6 +132,7 @@ enum llm_kv {
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -137,6 +141,8 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_EXPERT_GROUP_SCALE,
+    LLM_KV_EXPERTS_PER_GROUP,
     LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
@@ -256,10 +262,21 @@ enum llm_kv {
 
     LLM_KV_SHORTCONV_L_CACHE,
 
+    LLM_KV_XIELU_ALPHA_N,
+    LLM_KV_XIELU_ALPHA_P,
+    LLM_KV_XIELU_BETA,
+    LLM_KV_XIELU_EPS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
+
+    // sentence-transformers dense layers in and out features
+    LLM_KV_DENSE_2_FEAT_IN,
+    LLM_KV_DENSE_2_FEAT_OUT,
+    LLM_KV_DENSE_3_FEAT_IN,
+    LLM_KV_DENSE_3_FEAT_OUT,
 };
 
 enum llm_tensor {
@@ -267,6 +284,8 @@ enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
+    LLM_TENSOR_DENSE_2_OUT,
+    LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
@@ -301,6 +320,9 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_DOWN_CHEXPS,
+    LLM_TENSOR_FFN_GATE_CHEXPS,
+    LLM_TENSOR_FFN_UP_CHEXPS,
     LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
package/src/llama.cpp/src/llama-chat.cpp
@@ -590,7 +590,7 @@ int32_t llm_chat_apply_template(
             ss << message->content << "<|end_of_text|>\n";
         }
         if (add_ass) {
-            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+            ss << "<|start_of_role|>assistant<|end_of_role|>";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
         // GigaChat template
package/src/llama.cpp/src/llama-context.cpp
@@ -2346,6 +2346,12 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
 
+    if (params.pooling_type != model->hparams.pooling_type) {
+        //user-specified pooling-type is different from the model default
+        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+                       model->hparams.pooling_type, params.pooling_type);
+    }
+
     try {
         auto * ctx = new llama_context(*model, params);
         return ctx;
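The new check only warns; the user-supplied pooling type still takes effect. A sketch of a call that would trigger it, assuming a model whose GGUF default pooling differs:

// Sketch: requesting a non-default pooling type now logs a warning
// before the context is created.
llama_context_params cparams = llama_context_default_params();
cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN;
llama_context * ctx = llama_init_from_model(model, cparams);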