@fugood/llama.node 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +37 -0
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
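
Note: outside the vendored llama.cpp sync, the new RerankWorker.h together with the LlamaContext.cpp and lib/binding.ts / lib/index.ts additions points to a reranking entry point exposed to JavaScript. A minimal usage sketch follows; the rerank() method name, its signature, and the result shape are illustrative assumptions and are not confirmed by this diff:

    // Hypothetical sketch only: the rerank() name, options, and result shape are assumed.
    import { loadModel } from '@fugood/llama.node'

    async function rankDocuments(query: string, documents: string[]) {
      // Assumes a reranker-capable GGUF model; the path is a placeholder.
      const context = await loadModel({ model: './reranker.gguf' })

      // Assumed call: score every document against the query in one pass.
      const results = await context.rerank(query, documents)

      // Assumed result shape: one { index, score } entry per document, higher score = more relevant.
      return results
        .slice()
        .sort((a: { index: number; score: number }, b: { index: number; score: number }) => b.score - a.score)
        .map((r: { index: number; score: number }) => documents[r.index])
    }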
@@ -103,6 +103,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
+        case LLM_TYPE_E2B: return "E2B";
+        case LLM_TYPE_E4B: return "E4B";
         default: return "?B";
     }
 }
@@ -1017,6 +1019,24 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
                     : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
             } break;
+        case LLM_ARCH_GEMMA3N:
+            {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(5);
+
+                hparams.rope_freq_base_train_swa = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;
+                hparams.f_attention_scale = 1.0f;
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 30: type = LLM_TYPE_E2B; break;
+                    case 35: type = LLM_TYPE_E4B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
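
Note: the hparams block above wires Gemma 3n into the interleaved sliding-window attention path. set_swa_pattern(5) marks layers as sliding-window vs. full attention in a repeating group of five, and the SWA layers get their own RoPE base (10000). A small sketch of the layer split this implies; the exact phase of the pattern (which layer in each group of five keeps full attention) is an assumption taken from the upstream helper, not from this diff:

    // Sketch: with a pattern of 5, four out of every five layers use the sliding window
    // and every fifth layer attends over the full context (phase assumed).
    function isSwaLayer(il: number, pattern = 5): boolean {
      return (il + 1) % pattern !== 0
    }

    // E2B has 30 layers; under this phase the full-attention layers are 4, 9, 14, 19, 24, 29.
    const fullAttnLayers = Array.from({ length: 30 }, (_, il) => il).filter(il => !isSwaLayer(il))
    console.log(fullAttnLayers) // [4, 9, 14, 19, 24, 29]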
@@ -2950,6 +2970,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
+            case LLM_ARCH_GEMMA3N:
+                {
+                    const int64_t n_altup = hparams.n_altup;
+                    const int64_t laurel_rank = hparams.laurel_rank;
+                    const int64_t n_embd_altup = hparams.n_embd_altup;
+
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
+
+                    altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+                    altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
+                    per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        // altup & laurel
+                        layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
+                        layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
+                        layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
+                        layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
+                        layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
+                        layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
+                        layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
+                        layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
+                        layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
+                        layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
+                        layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_STARCODER2:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
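
Note: most of the tensors above are standard Gemma-style attention/FFN weights plus the new AltUp and LAUREL parameters. The unusual one is tok_embd_per_layer with shape {n_embd_altup * n_layer, n_vocab}: each vocabulary entry stores a small per-layer embedding for every layer, flattened into a single row. A sketch of how one flattened row splits back into per-layer vectors, mirroring the ggml_reshape_3d and sqrt(n_embd_altup) scaling in get_per_layer_inputs() further down (plain arrays, illustrative only):

    // Sketch: split one flattened per-layer embedding row into n_layer vectors of
    // n_embd_altup values and apply the sqrt(n_embd_altup) scaling used by get_per_layer_inputs().
    function splitPerLayerRow(row: Float32Array, nEmbdAltup: number, nLayer: number): Float32Array[] {
      const scale = Math.sqrt(nEmbdAltup)
      const perLayer: Float32Array[] = []
      for (let il = 0; il < nLayer; il++) {
        const slice = row.slice(il * nEmbdAltup, (il + 1) * nEmbdAltup)
        perLayer.push(slice.map(v => v * scale))
      }
      return perLayer
    }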
@@ -8980,6 +9056,442 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
     }
 };
 
+struct llm_build_gemma3n_iswa : public llm_graph_context {
+    const llama_model & model;
+    ggml_cgraph * gf;
+
+    const int64_t n_embd_head;
+    const int64_t n_embd_altup;
+    const int64_t n_altup;
+    const int i_altup_act;
+    const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
+    const int n_layer_sparsity = 10; // number of layers using activation sparsity
+    const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
+
+    ggml_tensor * one; // containing single element 1.0f
+
+    llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
+        : llm_graph_context(params),
+          model(model),
+          gf(gf),
+          n_embd_head(model.hparams.n_embd_head_k),
+          n_embd_altup(model.hparams.n_embd_altup),
+          n_altup(model.hparams.n_altup),
+          i_altup_act(model.hparams.i_altup_act) {
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        // TODO: remove this when ggml_scale_add is implemented
+        one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        {
+            auto inp = std::make_unique<llm_graph_input_one>();
+            inp->one = one;
+            res->add_input(std::move(inp));
+        }
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // TODO: is causal == true correct? might need some changes
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+        // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
+        ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
+
+        // inpL now has only 1 altup, project it to the rest of the altups
+        // these "added" altups will be concat to the last dim of inpL
+        {
+            ggml_tensor * target_magnitude = calc_magnitude(inpL);
+            ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
+            ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
+            ggml_tensor * new_magnitude = calc_magnitude(altup_added);
+            altup_added = ggml_div(ctx0,
+                ggml_mul(ctx0, altup_added, target_magnitude),
+                new_magnitude);
+            inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
+            cb(inpL, "inp_stacked", -1);
+        }
+
+        // inpL now has shape: [n_embd, n_tokens, n_altup]
+        // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
+
+        for (int il = 0; il < n_layer; ++il) {
+            // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
+            const bool has_kv = (il < n_layer_kv);
+
+            const float freq_base_l = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+            ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
+            ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
+
+            // predicted value will go through self-attention and laurel
+            ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
+            cur = active_prediction;
+            cb(cur, "active_prediction", il);
+
+            // norm
+            cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // laurel
+            ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
+
+            // self-attention
+            if (has_kv) {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
+
+                cb(Qcur, "Qcur_normed", il);
+                cb(Kcur, "Kcur_normed", il);
+                cb(Vcur, "Vcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                cb(Qcur, "Qcur_pos", il);
+                cb(Kcur, "Kcur_pos", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
+            } else {
+                // no KV layers
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur_pos", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+            }
+
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
+            cb(cur, "attn_gated", il);
+
+            ggml_tensor * attn_laurel = ggml_scale(ctx0,
+                    ggml_add(ctx0, cur, laurel_out),
+                    1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
+            cb(attn_laurel, "attn_laurel", il);
+
+            cur = build_norm(attn_laurel,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
+                ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
+
+                if (il < n_layer_sparsity) {
+                    // apply activation sparsity
+                    gate_proj = gaussian_topk(gate_proj);
+                }
+                gate_proj = ggml_gelu(ctx0, gate_proj);
+
+                cur = ggml_mul(ctx0, up_proj, gate_proj);
+                cur = build_lora_mm(model.layers[il].ffn_down, cur);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = build_norm(cur,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, -1);
+            cb(cur, "ffn_post_norm", il);
+
+            ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
+            cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
+
+            ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
+
+            ggml_tensor * first_prediction; // [n_embd, n_tokens]
+            {
+                first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
+                first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
+                first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
+                first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
+                cb(first_prediction, "first_prediction_gated", il);
+                ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
+                first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
+                cb(first_prediction, "first_prediction_scaled", il);
+
+                first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
+                first_prediction = build_norm(first_prediction,
+                        model.layers[il].per_layer_post_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(first_prediction, "first_prediction_out", il);
+            }
+
+            // equivalent to python code: corrected_predictions[1:] += first_prediction
+            {
+                ggml_tensor * slice_first = view_2d_slice(corrected, 0);
+                ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
+                        ggml_row_size(corrected->type, n_embd),
+                        ggml_row_size(corrected->type, n_embd*n_tokens),
+                        n_embd*n_tokens*ggml_element_size(corrected));
+                ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
+                corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
+            }
+
+            cur = corrected; // [n_embd, n_tokens, n_altup]
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL; // [n_embd, n_tokens, n_altup]
+
+        // cur now has multiple altup(s), we want to merge them back to 1 altup
+        {
+            ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
+            // do a view to skip the first slice (active altup)
+            ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
+                    ggml_row_size(cur->type, n_embd),
+                    ggml_row_size(cur->type, n_embd*n_tokens),
+                    n_embd*n_tokens*ggml_element_size(cur));
+            ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
+            ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
+            altup_unembd = ggml_div(ctx0,
+                ggml_mul(ctx0, altup_unembd, target_magnitude),
+                new_magnitude);
+            cb(altup_unembd, "altup_unembd", -1);
+
+            // equivalent to torch.mean(hidden_states, dim=0)
+            cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
+            for (int i = 0; i < n_altup - 1; ++i) {
+                cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
+            }
+            cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
+            cb(cur, "unembd_merged", -1);
+        }
+
+        // cur now has shape: [n_embd, n_tokens]
+
+        // TODO: move this to right after the last KV layer
+        {
+            // skip computing output for unused tokens
+            ggml_tensor * inp_out_ids = build_inp_out_ids();
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+        }
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        cur = build_lora_mm(model.output, cur);
+
+        {
+            // final logit soft-capping
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+            cur = ggml_tanh(ctx0, cur);
+            cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+        }
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    ggml_tensor * calc_magnitude(ggml_tensor * x) {
+        return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
+    }
+
+    // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
+    ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
+        GGML_ASSERT(idx < (int)x->ne[2]);
+        return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
+                ggml_row_size(x->type, x->ne[0]),
+                idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
+    }
+
+    // equivalent to get_per_layer_inputs() in python code
+    // output shape: [n_embd_altup, n_layer, n_tokens]
+    ggml_tensor * get_per_layer_inputs() {
+        auto inp = std::make_unique<llm_graph_input_embd>();
+        ggml_tensor * inp_per_layer;
+        if (ubatch.token) {
+            inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+            ggml_set_input(inp->tokens);
+            res->t_tokens = inp->tokens;
+            inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
+            inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
+            inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
+            cb(inp_per_layer, "inp_per_layer_selected", -1);
+        } else {
+            GGML_ABORT("TODO: support embd input");
+        }
+        res->add_input(std::move(inp));
+        return inp_per_layer;
+    }
+
+    // equivalent to project_per_layer_inputs() in python code
+    // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
+    // output shape: [n_embd_altup, n_tokens, n_layer]
+    ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
+        const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
+        const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
+
+        ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
+        per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
+        per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
+        per_layer_proj = build_norm(per_layer_proj,
+                model.per_layer_proj_norm, NULL,
+                LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
+        cb(per_layer_proj, "per_layer_proj", -1);
+
+        inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
+        cb(inp_per_layer, "inp_per_layer", -1);
+
+        // permute to shape: [n_embd_altup, n_tokens, n_layer]
+        inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
+        return inp_per_layer;
+    }
+
+    // input cur shape: [n_altup, n_tokens]
+    // output shape: [n_altup, n_tokens]
+    ggml_tensor * laurel(ggml_tensor * cur, int il) {
+        ggml_tensor * tmp = cur;
+        tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
+        tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
+        tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
+        tmp = ggml_add(ctx0, tmp, cur);
+        cb(tmp, "laurel_out", il);
+        return tmp;
+    }
+
+    // input x shape: [n_embd, n_tokens]
+    // output shape: [n_embd, n_tokens]
+    ggml_tensor * gaussian_topk(ggml_tensor * x) {
+        ggml_tensor * mean = ggml_mean(ctx0, x);
+        ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
+            ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
+            1.0f / (float)(x->ne[0] - 1)
+        ));
+        ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
+        return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
+    }
+
+    //
+    // altup functions
+    //
+
+    // equivalent to compute_router_modalities() in python code
+    // input x shape: [n_embd, n_tokens]
+    // output shape: [n_altup, n_tokens]
+    ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
+        ggml_tensor * router_inputs = build_norm(x,
+            model.layers[il].altup_router_norm, NULL,
+            LLM_NORM_RMS, il);
+
+        // router_input_scale
+        router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
+
+        ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
+        return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
+    }
+
+    // input cur shape: [n_embd, n_tokens, n_altup]
+    // output shape: [n_embd, n_tokens, n_altup]
+    ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
+        ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
+        ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
+        cb(modalities, "modalities", il);
+
+        ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
+        cb(all_coefs, "all_coefs", il);
+        // first dim now having n_altup^2 elements, we reshape it to 2D (so we end up with 3D tensor)
+        all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
+
+        // permute to [n_altup, n_embd, n_tokens]
+        ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
+        ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
+
+        // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
+        predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
+        predictions = ggml_add(ctx0, predictions, cur);
+        cb(predictions, "predictions", il);
+
+        return predictions;
+    }
+
+    // input predictions shape: [n_embd, n_tokens, n_altup]
+    // input activated shape: [n_embd, n_tokens]
+    // output shape: [n_embd, n_tokens, n_altup]
+    ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
+        ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
+        cb(modalities, "modalities", il);
+
+        ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
+        ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
+        cb(innovation, "innovation", il);
+
+        ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
+        all_coefs = ggml_add(ctx0, all_coefs, one);
+        cb(all_coefs, "all_coefs", il);
+        all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
+        all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
+
+        innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
+        ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
+        corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
+        cb(corrected, "corrected", il);
+
+        return corrected;
+    }
+};
+
 // TODO: move up next to build_starcoder
 struct llm_build_starcoder2 : public llm_graph_context {
     llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
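
Note: two details of the graph above are easy to miss. gaussian_topk() implements the activation sparsity applied in the first n_layer_sparsity = 10 layers: per row it takes the mean and sample standard deviation of the gate projection, subtracts the cutoff mean + 1.6448... * std (the 95th percentile of a normal distribution), and applies ReLU, so roughly only the top 5% of activations survive. A scalar sketch of the same computation (illustrative only):

    // Sketch of gaussian_topk on a single row: keep only values above mean + stdMul * std.
    const SPARSITY_STD_MUL = 1.6448533535003662 // ~ icdf(0.95) of a standard normal

    function gaussianTopk(row: number[], stdMul = SPARSITY_STD_MUL): number[] {
      const n = row.length
      const mean = row.reduce((a, b) => a + b, 0) / n
      // Sample variance (divide by n - 1), matching the 1 / (ne[0] - 1) scale in the diff.
      const variance = row.reduce((a, b) => a + (b - mean) * (b - mean), 0) / (n - 1)
      const cutoff = mean + stdMul * Math.sqrt(variance)
      return row.map(v => Math.max(0, v - cutoff))
    }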
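Note: the other detail is the final logit soft-capping block near the end of the constructor: logits are squashed through cap * tanh(logits / cap) using hparams.f_final_logit_softcapping, the same bounding trick Gemma 2 uses. Equivalent scalar form (the cap value comes from the model's GGUF metadata, not from this diff):

    // Sketch: soft-cap a logit to the range (-cap, +cap) while staying smooth near zero.
    function softCap(logit: number, cap: number): number {
      return cap * Math.tanh(logit / cap)
    }

    // e.g. with cap = 30, a raw logit of 100 maps to ~29.99 while small logits pass through nearly unchanged.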
@@ -13974,6 +14486,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
             } break;
+        case LLM_ARCH_GEMMA3N:
+            {
+                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
@@ -14295,6 +14811,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
+        case LLM_ARCH_GEMMA3N:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
@@ -14377,7 +14894,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
     // do not extend this list unless absolutely necessary
     // Mistral-Small-2503 does not have built-in chat template
     llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
-    if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+    if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
         return "mistral-v7-tekken";
     }
 
@@ -95,6 +95,8 @@ enum llm_type {
     LLM_TYPE_17B_128E, // llama4 Maverick
     LLM_TYPE_30B_A3B,
     LLM_TYPE_235B_A22B,
+    LLM_TYPE_E2B,
+    LLM_TYPE_E4B,
 };
 
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
@@ -316,6 +318,19 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_scale = nullptr;
     struct ggml_tensor * ffn_down_scale = nullptr;
 
+    // altup & laurel
+    struct ggml_tensor * per_layer_inp_gate = nullptr;
+    struct ggml_tensor * per_layer_proj = nullptr;
+    struct ggml_tensor * per_layer_post_norm = nullptr;
+    struct ggml_tensor * altup_correct_coef = nullptr;
+    struct ggml_tensor * altup_correct_scale = nullptr;
+    struct ggml_tensor * altup_predict_coef = nullptr;
+    struct ggml_tensor * altup_router = nullptr;
+    struct ggml_tensor * altup_router_norm = nullptr;
+    struct ggml_tensor * laurel_l = nullptr;
+    struct ggml_tensor * laurel_r = nullptr;
+    struct ggml_tensor * laurel_post_norm = nullptr;
+
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;
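
Note: the laurel_l, laurel_r, and laurel_post_norm members added here back the LAUREL block built in laurel() above: a learned low-rank residual y = x + rmsnorm((x * L) * R), with L of shape {n_embd, laurel_rank} and R of shape {laurel_rank, n_embd}. A per-token sketch with plain arrays (the unweighted rmsNorm helper is illustrative; the real norm carries a learned weight):

    // Sketch of the LAUREL residual for one token vector x of length nEmbd.
    function rmsNorm(x: number[], eps = 1e-6): number[] {
      const ms = x.reduce((a, v) => a + v * v, 0) / x.length
      const inv = 1 / Math.sqrt(ms + eps)
      return x.map(v => v * inv)
    }

    function matVec(m: number[][], x: number[]): number[] {
      // m has shape [rows][cols] with cols === x.length
      return m.map(row => row.reduce((a, w, j) => a + w * x[j], 0))
    }

    function laurel(x: number[], L: number[][], R: number[][]): number[] {
      const lowRank = matVec(R, matVec(L, x)) // project down to laurel_rank, then back up to nEmbd
      const normed = rmsNorm(lowRank)
      return x.map((v, i) => v + normed[i])   // residual connection
    }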
@@ -354,6 +369,13 @@ struct llama_model {
     struct ggml_tensor * conv1d = nullptr;
     struct ggml_tensor * conv1d_b = nullptr;
 
+    // gemma3n altup
+    struct ggml_tensor * tok_embd_per_layer = nullptr;
+    struct ggml_tensor * altup_proj = nullptr;
+    struct ggml_tensor * altup_unembd_proj = nullptr;
+    struct ggml_tensor * per_layer_model_proj = nullptr;
+    struct ggml_tensor * per_layer_proj_norm = nullptr;
+
     std::vector<llama_layer> layers;
 
     llama_model_params params;
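
Note: altup_proj and altup_unembd_proj are the model-level projections used at the start and end of the graph to fan the hidden state out into n_altup parallel streams and merge them back. In both places the projected streams are rescaled so their L2 norm matches that of the original stream (the calc_magnitude / ggml_div pair above). A per-token sketch of that magnitude matching (illustrative only):

    // Sketch: rescale a projected vector so its L2 norm matches a target vector's norm,
    // mirroring ggml_div(ggml_mul(projected, targetMagnitude), newMagnitude) in the graph.
    function l2Norm(x: number[]): number {
      return Math.sqrt(x.reduce((a, v) => a + v * v, 0))
    }

    function matchMagnitude(projected: number[], target: number[]): number[] {
      const scale = l2Norm(target) / l2Norm(projected)
      return projected.map(v => v * scale)
    }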