@fugood/llama.node 1.0.0-beta.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +12 -0
  3. package/lib/index.js +10 -0
  4. package/lib/index.ts +17 -1
  5. package/package.json +14 -14
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +7 -3
  8. package/src/LlamaCompletionWorker.h +2 -0
  9. package/src/LlamaContext.cpp +49 -6
  10. package/src/LlamaContext.h +1 -0
  11. package/src/RerankWorker.h +26 -0
  12. package/src/common.hpp +1 -1
  13. package/src/llama.cpp/CMakeLists.txt +1 -1
  14. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  15. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  16. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  28. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  29. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  35. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  40. package/src/llama.cpp/include/llama.h +6 -3
  41. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  42. package/src/llama.cpp/src/llama-arch.h +17 -0
  43. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  44. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  45. package/src/llama.cpp/src/llama-context.cpp +0 -1
  46. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  47. package/src/llama.cpp/src/llama-graph.h +14 -2
  48. package/src/llama.cpp/src/llama-hparams.h +6 -0
  49. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  50. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  51. package/src/llama.cpp/src/llama-model.cpp +518 -1
  52. package/src/llama.cpp/src/llama-model.h +22 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +87 -5
@@ -103,6 +103,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
+        case LLM_TYPE_E2B: return "E2B";
+        case LLM_TYPE_E4B: return "E4B";
         default: return "?B";
     }
 }
@@ -1017,6 +1019,24 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
                     : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
             } break;
+        case LLM_ARCH_GEMMA3N:
+            {
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                hparams.set_swa_pattern(5);
+
+                hparams.rope_freq_base_train_swa  = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;
+                hparams.f_attention_scale         = 1.0f;
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 30: type = LLM_TYPE_E2B; break;
+                    case 35: type = LLM_TYPE_E4B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2950,6 +2970,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_GEMMA3N:
+                {
+                    const int64_t n_altup      = hparams.n_altup;
+                    const int64_t laurel_rank  = hparams.laurel_rank;
+                    const int64_t n_embd_altup = hparams.n_embd_altup;
+
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    tok_embd           = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
+
+                    altup_proj           = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+                    altup_unembd_proj    = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
+                    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate      = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up        = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down      = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        // altup & laurel
+                        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
+                        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
+                        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
+                        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
+                        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
+                        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
+                        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
+                        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
+                        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
+                        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
+                        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_STARCODER2:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8980,6 +9056,442 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
     }
 };
 
+struct llm_build_gemma3n_iswa : public llm_graph_context {
+    const llama_model & model;
+    ggml_cgraph * gf;
+
+    const int64_t n_embd_head;
+    const int64_t n_embd_altup;
+    const int64_t n_altup;
+    const int     i_altup_act;
+    const int     n_layer_kv = 20; // number of layers having KV [KV_REUSE]
+    const int     n_layer_sparsity = 10; // number of layers using activation sparsity
+    const float   f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
+
+    ggml_tensor * one; // containing single element 1.0f
+
+    llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
+        : llm_graph_context(params),
+          model(model),
+          gf(gf),
+          n_embd_head(model.hparams.n_embd_head_k),
+          n_embd_altup(model.hparams.n_embd_altup),
+          n_altup(model.hparams.n_altup),
+          i_altup_act(model.hparams.i_altup_act) {
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        // TODO: remove this when ggml_scale_add is implemented
+        one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        {
+            auto inp = std::make_unique<llm_graph_input_one>();
+            inp->one = one;
+            res->add_input(std::move(inp));
+        }
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // TODO: is causal == true correct? might need some changes
+        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+
+        // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
+        ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
+
+        // inpL now has only 1 altup, project it to the rest of the altups
+        // these "added" altups will be concat to the last dim of inpL
+        {
+            ggml_tensor * target_magnitude = calc_magnitude(inpL);
+            ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
+            ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
+            ggml_tensor * new_magnitude = calc_magnitude(altup_added);
+            altup_added = ggml_div(ctx0,
+                ggml_mul(ctx0, altup_added, target_magnitude),
+                new_magnitude);
+            inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
+            cb(inpL, "inp_stacked", -1);
+        }
+
+        // inpL now has shape:          [n_embd, n_tokens, n_altup]
+        // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
+
+        for (int il = 0; il < n_layer; ++il) {
+            // this block is made to closely resemble Gemma3p5DecoderLayer in the python code
+            const bool has_kv = (il < n_layer_kv);
+
+            const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+            ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
+            ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
+
+            // predicted value will go through self-attention and laurel
+            ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
+            cur = active_prediction;
+            cb(cur, "active_prediction", il);
+
+            // norm
+            cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // laurel
+            ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
+
+            // self-attention
+            if (has_kv) {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
+
+                cb(Qcur, "Qcur_normed", il);
+                cb(Kcur, "Kcur_normed", il);
+                cb(Vcur, "Vcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                cb(Qcur, "Qcur_pos", il);
+                cb(Kcur, "Kcur_pos", il);
+
+                cur = build_attn(inp_attn, gf,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
+            } else {
+                // no KV layers
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur_pos", il);
+
+                cur = build_attn(inp_attn, gf,
+                    model.layers[il].wo, NULL,
+                    Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+            }
+
+            cur = build_norm(cur,
+                model.layers[il].attn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
+            cb(cur, "attn_gated", il);
+
+            ggml_tensor * attn_laurel = ggml_scale(ctx0,
+                ggml_add(ctx0, cur, laurel_out),
+                1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
+            cb(attn_laurel, "attn_laurel", il);
+
+            cur = build_norm(attn_laurel,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                ggml_tensor * up_proj   = build_lora_mm(model.layers[il].ffn_up,   cur);
+                ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
+
+                if (il < n_layer_sparsity) {
+                    // apply activation sparsity
+                    gate_proj = gaussian_topk(gate_proj);
+                }
+                gate_proj = ggml_gelu(ctx0, gate_proj);
+
+                cur = ggml_mul(ctx0, up_proj, gate_proj);
+                cur = build_lora_mm(model.layers[il].ffn_down, cur);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = build_norm(cur,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, -1);
+            cb(cur, "ffn_post_norm", il);
+
+            ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
+            cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
+
+            ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
+
+            ggml_tensor * first_prediction; // [n_embd, n_tokens]
+            {
+                first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
+                first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
+                first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
+                first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
+                cb(first_prediction, "first_prediction_gated", il);
+                ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
+                first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
+                cb(first_prediction, "first_prediction_scaled", il);
+
+                first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
+                first_prediction = build_norm(first_prediction,
+                    model.layers[il].per_layer_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+                cb(first_prediction, "first_prediction_out", il);
+            }
+
+            // equivalent to python code: corrected_predictions[1:] += first_prediction
+            {
+                ggml_tensor * slice_first = view_2d_slice(corrected, 0);
+                ggml_tensor * slice_rest  = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
+                    ggml_row_size(corrected->type, n_embd),
+                    ggml_row_size(corrected->type, n_embd*n_tokens),
+                    n_embd*n_tokens*ggml_element_size(corrected));
+                ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
+                corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
+            }
+
+            cur = corrected; // [n_embd, n_tokens, n_altup]
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL; // [n_embd, n_tokens, n_altup]
+
+        // cur now has multiple altup(s), we want to merge them back to 1 altup
+        {
+            ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
+            // do a view to skip the first slice (active altup)
+            ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
+                ggml_row_size(cur->type, n_embd),
+                ggml_row_size(cur->type, n_embd*n_tokens),
+                n_embd*n_tokens*ggml_element_size(cur));
+            ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
+            ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
+            altup_unembd = ggml_div(ctx0,
+                ggml_mul(ctx0, altup_unembd, target_magnitude),
+                new_magnitude);
+            cb(altup_unembd, "altup_unembd", -1);
+
+            // equivalent to torch.mean(hidden_states, dim=0)
+            cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
+            for (int i = 0; i < n_altup - 1; ++i) {
+                cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
+            }
+            cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
+            cb(cur, "unembd_merged", -1);
+        }
+
+        // cur now has shape: [n_embd, n_tokens]
+
+        // TODO: move this to right after the last KV layer
+        {
+            // skip computing output for unused tokens
+            ggml_tensor * inp_out_ids = build_inp_out_ids();
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+        }
+
+        cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        cur = build_lora_mm(model.output, cur);
+
+        {
+            // final logit soft-capping
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+            cur = ggml_tanh(ctx0, cur);
+            cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+        }
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    ggml_tensor * calc_magnitude(ggml_tensor * x) {
+        return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
+    }
+
+    // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
+    ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
+        GGML_ASSERT(idx < (int)x->ne[2]);
+        return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
+            ggml_row_size(x->type, x->ne[0]),
+            idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
+    }
+
+    // equivalent to get_per_layer_inputs() in python code
+    // output shape: [n_embd_altup, n_layer, n_tokens]
+    ggml_tensor * get_per_layer_inputs() {
+        auto inp = std::make_unique<llm_graph_input_embd>();
+        ggml_tensor * inp_per_layer;
+        if (ubatch.token) {
+            inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+            ggml_set_input(inp->tokens);
+            res->t_tokens = inp->tokens;
+            inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
+            inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
+            inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
+            cb(inp_per_layer, "inp_per_layer_selected", -1);
+        } else {
+            GGML_ABORT("TODO: support embd input");
+        }
+        res->add_input(std::move(inp));
+        return inp_per_layer;
+    }
+
+    // equivalent to project_per_layer_inputs() in python code
+    // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
+    // output shape: [n_embd_altup, n_tokens, n_layer]
+    ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
+        const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
+        const float per_layer_input_scale      = 1.0f / sqrtf(2.0f);
+
+        ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
+        per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
+        per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
+        per_layer_proj = build_norm(per_layer_proj,
+            model.per_layer_proj_norm, NULL,
+            LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
+        cb(per_layer_proj, "per_layer_proj", -1);
+
+        inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
+        cb(inp_per_layer, "inp_per_layer", -1);
+
+        // permute to shape: [n_embd_altup, n_tokens, n_layer]
+        inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
+        return inp_per_layer;
+    }
+
+    // input cur shape: [n_altup, n_tokens]
+    // output shape:    [n_altup, n_tokens]
+    ggml_tensor * laurel(ggml_tensor * cur, int il) {
+        ggml_tensor * tmp = cur;
+        tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
+        tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
+        tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
+        tmp = ggml_add(ctx0, tmp, cur);
+        cb(tmp, "laurel_out", il);
+        return tmp;
+    }
+
+    // input x shape: [n_embd, n_tokens]
+    // output shape:  [n_embd, n_tokens]
+    ggml_tensor * gaussian_topk(ggml_tensor * x) {
+        ggml_tensor * mean = ggml_mean(ctx0, x);
+        ggml_tensor * std  = ggml_sqrt(ctx0, ggml_scale(ctx0,
+            ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
+            1.0f / (float)(x->ne[0] - 1)
+        ));
+        ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
+        return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
+    }
+
+    //
+    // altup functions
+    //
+
+    // equivalent to compute_router_modalities() in python code
+    // input x shape: [n_embd, n_tokens]
+    // output shape:  [n_altup, n_tokens]
+    ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
+        ggml_tensor * router_inputs = build_norm(x,
+            model.layers[il].altup_router_norm, NULL,
+            LLM_NORM_RMS, il);
+
+        // router_input_scale
+        router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
+
+        ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
+        return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
+    }
+
+    // input cur shape: [n_embd, n_tokens, n_altup]
+    // output shape:    [n_embd, n_tokens, n_altup]
+    ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
+        ggml_tensor * activated  = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
+        ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
+        cb(modalities, "modalities", il);
+
+        ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
+        cb(all_coefs, "all_coefs", il);
+        // first dim now having n_altup^2 elements, we reshape it to 2D (so we end up with 3D tensor)
+        all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
+
+        // permute to [n_altup, n_embd, n_tokens]
+        ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
+        ggml_tensor * predictions  = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
+
+        // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
+        predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
+        predictions = ggml_add(ctx0, predictions, cur);
+        cb(predictions, "predictions", il);
+
+        return predictions;
+    }
+
+    // input predictions shape: [n_embd, n_tokens, n_altup]
+    // input activated shape:   [n_embd, n_tokens]
+    // output shape:            [n_embd, n_tokens, n_altup]
+    ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
+        ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
+        cb(modalities, "modalities", il);
+
+        ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
+        ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
+        cb(innovation, "innovation", il);
+
+        ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
+        all_coefs = ggml_add(ctx0, all_coefs, one);
+        cb(all_coefs, "all_coefs", il);
+        all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
+        all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
+
+        innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
+        ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
+        corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
+        cb(corrected, "corrected", il);
+
+        return corrected;
+    }
+};
+
 // TODO: move up next to build_starcoder
 struct llm_build_starcoder2 : public llm_graph_context {
     llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
@@ -13974,6 +14486,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
             } break;
+        case LLM_ARCH_GEMMA3N:
+            {
+                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
@@ -14295,6 +14811,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
+        case LLM_ARCH_GEMMA3N:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
@@ -14377,7 +14894,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
     // do not extend this list unless absolutely necessary
     // Mistral-Small-2503 does not have built-in chat template
     llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
-    if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+    if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
         return "mistral-v7-tekken";
     }
 
@@ -95,6 +95,8 @@ enum llm_type {
     LLM_TYPE_17B_128E, // llama4 Maverick
     LLM_TYPE_30B_A3B,
     LLM_TYPE_235B_A22B,
+    LLM_TYPE_E2B,
+    LLM_TYPE_E4B,
 };
 
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
@@ -316,6 +318,19 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_scale   = nullptr;
     struct ggml_tensor * ffn_down_scale = nullptr;
 
+    // altup & laurel
+    struct ggml_tensor * per_layer_inp_gate   = nullptr;
+    struct ggml_tensor * per_layer_proj       = nullptr;
+    struct ggml_tensor * per_layer_post_norm  = nullptr;
+    struct ggml_tensor * altup_correct_coef   = nullptr;
+    struct ggml_tensor * altup_correct_scale  = nullptr;
+    struct ggml_tensor * altup_predict_coef   = nullptr;
+    struct ggml_tensor * altup_router         = nullptr;
+    struct ggml_tensor * altup_router_norm    = nullptr;
+    struct ggml_tensor * laurel_l             = nullptr;
+    struct ggml_tensor * laurel_r             = nullptr;
+    struct ggml_tensor * laurel_post_norm     = nullptr;
+
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;
@@ -354,6 +369,13 @@ struct llama_model {
     struct ggml_tensor * conv1d   = nullptr;
    struct ggml_tensor * conv1d_b = nullptr;
 
+    // gemma3n altup
+    struct ggml_tensor * tok_embd_per_layer   = nullptr;
+    struct ggml_tensor * altup_proj           = nullptr;
+    struct ggml_tensor * altup_unembd_proj    = nullptr;
+    struct ggml_tensor * per_layer_model_proj = nullptr;
+    struct ggml_tensor * per_layer_proj_norm  = nullptr;
+
    std::vector<llama_layer> layers;
 
    llama_model_params params;
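
Note on the activation-sparsity math in the llm_build_gemma3n_iswa hunk above: the first n_layer_sparsity (10) layers pass the FFN gate through gaussian_topk(), which keeps only activations more than f_sparsity_std_mul = 1.6448533535003662 sample standard deviations above the per-row mean; per the diff's own comment that constant is the inverse normal CDF at 0.95, so under a normal assumption roughly the top 5% of gate activations survive. A minimal standalone C++ sketch of the same arithmetic on a plain float vector (gaussian_topk_row is a hypothetical illustration helper, not part of this package, which does the work on ggml tensors):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Keep only values more than std_mul sample standard deviations above the row mean.
    std::vector<float> gaussian_topk_row(const std::vector<float> & x,
                                         float std_mul = 1.6448533535003662f) {
        const float n = (float) x.size();

        float mean = 0.0f;
        for (float v : x) mean += v;
        mean /= n;

        float var = 0.0f;
        for (float v : x) var += (v - mean) * (v - mean);
        var /= (n - 1.0f); // sample variance, matching the 1/(ne[0] - 1) scale in the diff

        const float cutoff = mean + std_mul * std::sqrt(var);

        std::vector<float> out(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            out[i] = std::max(0.0f, x[i] - cutoff); // ReLU(x - cutoff)
        }
        return out;
    }

As in the graph code (ggml_relu(ggml_sub(x, cutoff_x))), surviving activations are also shifted down by the cutoff rather than kept at their original values.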