@fugood/llama.node 1.4.1 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -1
- package/lib/binding.js +3 -0
- package/lib/binding.ts +2 -0
- package/package.json +16 -16
- package/scripts/llama.cpp.patch +25 -11
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/CMakeLists.txt +21 -6
- package/src/llama.cpp/common/CMakeLists.txt +6 -0
- package/src/llama.cpp/common/arg.cpp +65 -16
- package/src/llama.cpp/common/chat-parser.cpp +40 -0
- package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
- package/src/llama.cpp/common/chat-peg-parser.h +105 -0
- package/src/llama.cpp/common/chat.cpp +40 -29
- package/src/llama.cpp/common/chat.h +10 -1
- package/src/llama.cpp/common/common.cpp +24 -5
- package/src/llama.cpp/common/common.h +16 -5
- package/src/llama.cpp/common/download.cpp +18 -8
- package/src/llama.cpp/common/download.h +3 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +15 -1
- package/src/llama.cpp/common/log.h +19 -12
- package/src/llama.cpp/common/peg-parser.cpp +1712 -0
- package/src/llama.cpp/common/peg-parser.h +459 -0
- package/src/llama.cpp/common/unicode.cpp +64 -0
- package/src/llama.cpp/common/unicode.h +22 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +48 -48
- package/src/llama.cpp/ggml/include/ggml.h +7 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +60 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +30 -1
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -6
- package/src/llama.cpp/src/llama-hparams.h +2 -2
- package/src/llama.cpp/src/llama-impl.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +50 -6
- package/src/llama.cpp/src/llama-vocab.cpp +1 -2
- package/src/llama.cpp/src/models/mistral3.cpp +160 -0
- package/src/llama.cpp/src/models/models.h +4 -0
package/src/llama.cpp/ggml/include/ggml.h

@@ -204,6 +204,10 @@
 # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 
+#if defined(_WIN32) && !defined(_WIN32_WINNT)
+# define _WIN32_WINNT 0x0A00
+#endif
+
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -2148,7 +2152,8 @@ extern "C" {
     };
 
     enum ggml_scale_flag {
-        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
+        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
+        GGML_SCALE_FLAG_ANTIALIAS     = (1 << 9),
     };
 
     // interpolate
@@ -2278,7 +2283,7 @@ extern "C" {
             float                 stop,
             float                 step);
 
-#define GGML_KQ_MASK_PAD
+#define GGML_KQ_MASK_PAD 1
 
     // q: [n_embd_k, n_batch,   n_head,    ne3 ]
    // k: [n_embd_k, n_kv,      n_head_kv, ne3 ]
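Note (editorial, not part of the diff): the new GGML_SCALE_FLAG_ANTIALIAS bit follows the existing packing convention in which the ggml_scale_mode value sits in the low bits of the interpolation mode word and the ggml_scale_flag bits start at bit 8, so a caller combines them with a bitwise OR; the ops.cpp hunk further below tests exactly that with mode_flags & GGML_SCALE_FLAG_ANTIALIAS. A minimal standalone sketch, assuming the mode occupies the low byte:

// Editorial sketch, not part of the diff: assembling and testing a mode word
// that requests antialiased bilinear scaling. The 0xFF split is an assumption
// based on the flag values starting at (1 << 8); enum values mirror ggml.h.
#include <cstdint>

enum sketch_scale_mode { SKETCH_SCALE_MODE_NEAREST = 0, SKETCH_SCALE_MODE_BILINEAR = 1 };
enum sketch_scale_flag { SKETCH_SCALE_FLAG_ALIGN_CORNERS = (1 << 8), SKETCH_SCALE_FLAG_ANTIALIAS = (1 << 9) };

static bool wants_antialiased_bilinear(uint32_t mode_word) {
    const uint32_t mode  = mode_word & 0xFFu;  // low byte: the scale mode
    const uint32_t flags = mode_word & ~0xFFu; // remaining bits: the flags
    return mode == SKETCH_SCALE_MODE_BILINEAR && (flags & SKETCH_SCALE_FLAG_ANTIALIAS) != 0;
}

// usage: wants_antialiased_bilinear(SKETCH_SCALE_MODE_BILINEAR | SKETCH_SCALE_FLAG_ANTIALIAS) == true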
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -683,22 +683,14 @@ bool ggml_is_numa(void) {
 }
 
 #if defined(__ARM_ARCH)
-
-#if defined(__linux__) && defined(__aarch64__)
-#include <sys/auxv.h>
-#endif
-
-static void ggml_init_arm_arch_features(void) {
 #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
-#
-
+#include <arm_sve.h>
+static void ggml_init_arm_arch_features(void) {
+    ggml_arm_arch_features.sve_cnt = svcntb();
+}
 #else
-
-#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+static void ggml_init_arm_arch_features(void) {}
 #endif
-#endif
-}
-
 #endif // __ARM_ARCH
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
@@ -2706,6 +2698,11 @@ struct ggml_cplan ggml_graph_plan(
         n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
     }
 
+#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+    // Emscripten without pthreads support can only use a single thread
+    n_threads = 1;
+#endif
+
     size_t work_size = 0;
 
     struct ggml_cplan cplan;
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -6383,7 +6383,7 @@ static void ggml_compute_forward_im2col_3d_f16(
                         const int64_t iih = ioh*s1 + ikh*d1 - p1;
                         const int64_t iid = iod*s2 + ikd*d2 - p2;
 
-                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW
+                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
                             dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
                         } else {
                             const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
@@ -7420,6 +7420,65 @@ static void ggml_compute_forward_upscale_f32(
                 }
             }
         }
+    } else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) {
+        // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
+        // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+        auto triangle_filter = [](float x) -> float {
+            return std::max(1.0f - fabsf(x), 0.0f);
+        };
+
+        // support and invscale, minimum 1 pixel for bilinear
+        const float support1  = std::max(1.0f, 1.0f / sf1);
+        const float invscale1 = 1.0f / support1;
+        const float support0  = std::max(1.0f, 1.0f / sf0);
+        const float invscale0 = 1.0f / support0;
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float) i1 + pixel_offset) / sf1;
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float) i0 + pixel_offset) / sf0;
+
+                        // the range of source pixels that contribute
+                        const int64_t x_min = std::max<int64_t>(x - support0 + pixel_offset, 0);
+                        const int64_t x_max = std::min<int64_t>(x + support0 + pixel_offset, ne00);
+                        const int64_t y_min = std::max<int64_t>(y - support1 + pixel_offset, 0);
+                        const int64_t y_max = std::min<int64_t>(y + support1 + pixel_offset, ne01);
+
+                        // bilinear filter with antialiasing
+                        float val = 0.0f;
+                        float total_weight = 0.0f;
+
+                        for (int64_t sy = y_min; sy < y_max; sy++) {
+                            const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
+
+                            for (int64_t sx = x_min; sx < x_max; sx++) {
+                                const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
+                                const float weight = weight_x * weight_y;
+
+                                if (weight <= 0.0f) {
+                                    continue;
+                                }
+
+                                const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03);
+                                val += pixel * weight;
+                                total_weight += weight;
+                            }
+                        }
+
+                        if (total_weight > 0.0f) {
+                            val /= total_weight;
+                        }
+
+                        float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                        *dst_ptr = val;
+                    }
+                }
+            }
+        }
     } else if (mode == GGML_SCALE_MODE_BILINEAR) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
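Note (editorial, not part of the diff): the new branch implements a tent-filter downscale in the style of PyTorch's antialiased bilinear interpolation. When downscaling by a factor sf < 1 the filter support widens to 1/sf source pixels, every source pixel in that window is weighted by the triangle filter, and the accumulated value is divided by the total weight. A minimal standalone sketch of that weighting for one output pixel, mirroring the support/invscale/pixel_offset math in the hunk above:

// Editorial sketch, not part of the diff: triangle-filter weights for
// destination pixel 0 of a 2x downscale along one axis.
#include <algorithm>
#include <cmath>
#include <cstdio>

static float triangle_filter(float x) {
    return std::max(1.0f - std::fabs(x), 0.0f);
}

int main() {
    const float sf           = 0.5f;                      // 2x downscale along this axis
    const float support      = std::max(1.0f, 1.0f / sf); // filter covers 2 source pixels
    const float invscale     = 1.0f / support;
    const float pixel_offset = 0.5f;                      // sample at pixel centers

    const float x = (0 + pixel_offset) / sf;              // source coordinate of dst pixel 0
    float total_weight = 0.0f;
    for (int sx = 0; sx < 4; ++sx) {
        const float w = triangle_filter((sx - x + pixel_offset) * invscale);
        std::printf("src %d -> weight %.2f\n", sx, w);     // 0.75, 0.75, 0.25, 0.00
        total_weight += w;
    }
    std::printf("total %.2f (each contribution is divided by this)\n", total_weight);
    return 0;
}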
package/src/llama.cpp/src/llama-arch.cpp

@@ -111,6 +111,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COGVLM, "cogvlm" },
     { LLM_ARCH_RND1, "rnd1" },
     { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
+    { LLM_ARCH_MISTRAL3, "mistral3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+    { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
@@ -853,7 +855,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
         { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
         { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
-        {
+        { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
         { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
         { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
         { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
@@ -2512,6 +2514,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2611,6 +2639,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
     {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
     {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
package/src/llama.cpp/src/llama-arch.h

@@ -115,6 +115,7 @@ enum llm_arch {
     LLM_ARCH_COGVLM,
     LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
+    LLM_ARCH_MISTRAL3,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -208,6 +209,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -377,6 +379,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_DT_NORM,
     LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
     LLM_TENSOR_SSM_B_NORM,
     LLM_TENSOR_SSM_C_NORM,
     LLM_TENSOR_SSM_D,
package/src/llama.cpp/src/llama-graph.cpp

@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && attn_scale) {
         const int64_t n_tokens = ubatch->n_tokens;
 
+        GGML_ASSERT(f_attn_temp_scale != 0.0f);
+        GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
         std::vector<float> attn_scale_data(n_tokens, 0.0f);
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
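Note (editorial, not part of the diff): the two asserts reject zero values because both hyperparameters feed the per-token attention-temperature computation in the rest of set_input, where n_attn_temp_floor_scale is used as a divisor. A hedged sketch of the kind of value being guarded, modelled on upstream llama.cpp's Llama-4-style temperature tuning (not shown in this hunk), using the 8192 / 0.1f values that a later hunk in this diff assigns for llama4:

// Editorial sketch, not part of the diff: an assumed per-position attention
// temperature scale; treat the formula as an illustration, not a quote of
// this package's code.
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const float    f_attn_temp_scale       = 0.1f; // value set for llama4 later in this diff
    const uint32_t n_attn_temp_floor_scale = 8192; // value set for llama4 later in this diff

    const float pos   = 20000.0f;
    const float scale = std::log(std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0f)
                      * f_attn_temp_scale + 1.0f;
    std::printf("attention temperature scale at pos %.0f: %.3f\n", pos, scale); // ~1.110
    return 0;
}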
@@ -810,9 +813,6 @@ ggml_tensor * llm_graph_context::build_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1093,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
package/src/llama.cpp/src/llama-hparams.h

@@ -162,8 +162,8 @@ struct llama_hparams {
     // llama4 smallthinker
     uint32_t n_moe_layer_step = 0;
     uint32_t n_no_rope_layer_step = 4;
-    uint32_t n_attn_temp_floor_scale =
-    float f_attn_temp_scale = 0.
+    uint32_t n_attn_temp_floor_scale = 0;
+    float f_attn_temp_scale = 0.0f;
 
     // gemma3n altup
     uint32_t n_altup = 4; // altup_num_inputs
package/src/llama.cpp/src/llama-mmap.cpp

@@ -485,7 +485,7 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
             suggest = false;
         }
 #endif
package/src/llama.cpp/src/llama-model.cpp

@@ -423,8 +423,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }
 
 struct llama_model::impl {
-    impl()
-    ~impl()
+    impl() = default;
+    ~impl() = default;
 
     uint64_t n_elements = 0;
 
@@ -461,7 +461,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model()
+llama_model::~llama_model() = default;
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
@@ -663,8 +663,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                 hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
             } else {
-                hparams.swa_type
-                hparams.n_swa
+                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+                hparams.n_swa = 8192;
+                hparams.n_attn_temp_floor_scale = 8192;
+                hparams.f_attn_temp_scale = 0.1f;
                 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
             }
 
@@ -2247,6 +2249,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
+
+                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
+                // but may need further verification with other values
+                if (hparams.rope_yarn_log_mul != 0.0f) {
+                    float factor = 1.0f / hparams.rope_freq_scale_train;
+                    float mscale = 1.0f;
+                    float mscale_all_dims = hparams.rope_yarn_log_mul;
+                    static auto get_mscale = [](float scale, float mscale) {
+                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+                    };
+                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+                }
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    case 34: type = LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
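Note (editorial, not part of the diff): a worked example of the yarn_attn_factor expression added above. get_mscale(s, m) returns 0.1 * m * ln(s) + 1 for s > 1, so when mscale and mscale_all_dims are both 1 the ratio collapses to 1.0, which is exactly the case the TODO comment flags as verified. Illustrative values only; rope_freq_scale_train and rope_yarn_log_mul come from the model's GGUF metadata:

// Editorial sketch, not part of the diff: the yarn_attn_factor computation
// from the hunk above, evaluated with hypothetical hyperparameter values.
#include <cmath>
#include <cstdio>

static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
}

int main() {
    const float rope_freq_scale_train = 0.25f; // hypothetical: trained with 4x YaRN scaling
    const float rope_yarn_log_mul     = 1.0f;  // hypothetical mscale_all_dims

    const float factor           = 1.0f / rope_freq_scale_train;
    const float yarn_attn_factor = get_mscale(factor, 1.0f) / get_mscale(factor, rope_yarn_log_mul);

    std::printf("yarn_attn_factor = %.4f\n", yarn_attn_factor); // 1.0000 when both mscales are 1
    return 0;
}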
@@ -2560,6 +2598,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MISTRAL3:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6487,7 +6526,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                     layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                     layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-                    layer.ssm_a = create_tensor(tn(
+                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                     layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                     layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
@@ -7522,6 +7561,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_qwen3next>(*this, params);
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                llm = std::make_unique<llm_build_mistral3>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7690,6 +7733,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
package/src/llama.cpp/src/llama-vocab.cpp

@@ -3253,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }
 
-llama_vocab::~llama_vocab()
-}
+llama_vocab::~llama_vocab() = default;
 
 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
package/src/llama.cpp/src/models/mistral3.cpp (new file)

@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
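Note (editorial, not part of the diff): the new graph scales attention scores by kq_scale = 1/sqrt(n_embd_head) unless f_attention_scale overrides it; with a typical head size of 128 (an assumed value, not stated in this diff) that is 1/sqrt(128) ≈ 0.0884. The ffn_gate_inp null check selects between the dense SwiGLU feed-forward and the MoE path, so the same builder covers both dense and mixture-of-experts mistral3 variants.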
package/src/llama.cpp/src/models/models.h

@@ -322,6 +322,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
     llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_mistral3 : public llm_graph_context {
+    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_mpt : public llm_graph_context {
     llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };