@fugood/llama.node 1.1.9 → 1.1.11

This diff shows the changes between publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (48)
  1. package/lib/binding.ts +7 -1
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +15 -5
  4. package/src/LlamaCompletionWorker.cpp +12 -3
  5. package/src/LlamaCompletionWorker.h +3 -1
  6. package/src/LlamaContext.cpp +20 -2
  7. package/src/llama.cpp/common/arg.cpp +29 -19
  8. package/src/llama.cpp/common/chat.cpp +153 -3
  9. package/src/llama.cpp/common/chat.h +1 -0
  10. package/src/llama.cpp/common/common.cpp +10 -3
  11. package/src/llama.cpp/common/common.h +4 -1
  12. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
  20. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  21. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
  23. package/src/llama.cpp/include/llama.h +27 -1
  24. package/src/llama.cpp/src/llama-adapter.cpp +68 -4
  25. package/src/llama.cpp/src/llama-adapter.h +3 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +46 -2
  27. package/src/llama.cpp/src/llama-arch.h +4 -0
  28. package/src/llama.cpp/src/llama-context.cpp +80 -39
  29. package/src/llama.cpp/src/llama-context.h +0 -4
  30. package/src/llama.cpp/src/llama-graph.cpp +20 -10
  31. package/src/llama.cpp/src/llama-graph.h +2 -1
  32. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  33. package/src/llama.cpp/src/llama-hparams.h +6 -0
  34. package/src/llama.cpp/src/llama-impl.h +2 -0
  35. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
  36. package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
  37. package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
  38. package/src/llama.cpp/src/llama-kv-cache.h +16 -28
  39. package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
  40. package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
  41. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  42. package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
  43. package/src/llama.cpp/src/llama-memory.h +8 -0
  44. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  45. package/src/llama.cpp/src/llama-model.cpp +302 -31
  46. package/src/llama.cpp/src/llama-model.h +1 -0
  47. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  48. package/src/llama.cpp/src/llama.cpp +12 -0
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
  case LLM_TYPE_475M: return "475M";
+ case LLM_TYPE_558M: return "558M";
  case LLM_TYPE_700M: return "700M";
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
@@ -772,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 24:
+ type = LLM_TYPE_558M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  {
@@ -1115,6 +1128,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(5);

+ hparams.n_layer_kv_from_start = 20;
  hparams.rope_freq_base_train_swa = 10000.0f;
  hparams.rope_freq_scale_train_swa = 1.0f;
  hparams.f_attention_scale = 1.0f;
@@ -1474,12 +1488,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // Expert gating function (GLM-4.5 uses sigmoid)
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
- hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  }

  // NextN/MTP parameters
  ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);

+ // TODO: when MTP is implemented, this should probably be updated if needed
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
  switch (hparams.n_layer) {
  case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
@@ -1553,6 +1570,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_NEMOTRON_H:
+ {
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // A layer is recurrent IFF the n_head_kv value is set to 0 and
+ // the n_ff value is set to 0
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 56: type = LLM_TYPE_9B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_EXAONE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2627,6 +2665,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_JINA_BERT_V3:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
@@ -2662,24 +2701,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

  layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

  if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  } else {
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
- if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- } else {
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ if (arch == LLM_ARCH_NOMIC_BERT) {
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  }
  }
@@ -4672,6 +4709,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  }
  } break;
+ case LLM_ARCH_NEMOTRON_H:
+ {
+ // mamba2 Mixer SSM params
+ // NOTE: int64_t for tensor dimensions
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_ssm_head = hparams.ssm_dt_rank;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+ // embeddings
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // all blocks use the attn norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.is_recurrent(i)) {
+ // ssm layers
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ } else if (hparams.n_ff(i) == 0) {
+ // attention layers (with optional bias)
+ const int64_t n_head_i = hparams.n_head(i);
+ const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+ const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ } else {
+ // mlp layers
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+ }
+ }
+ } break;
  case LLM_ARCH_EXAONE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5846,7 +5952,8 @@ void llama_model::print_info() const {
  arch == LLM_ARCH_JAMBA ||
  arch == LLM_ARCH_FALCON_H1 ||
  arch == LLM_ARCH_PLAMO2 ||
- arch == LLM_ARCH_GRANITE_HYBRID) {
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_NEMOTRON_H) {
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -7457,7 +7564,7 @@ struct llm_build_bert : public llm_graph_context {
  }

  // RoPE
- if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7516,7 +7623,7 @@ struct llm_build_bert : public llm_graph_context {
  0.0f,
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
  cb(cur, "ffn_moe_out", il);
- } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  cur = build_ffn(cur,
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  NULL, NULL, NULL,
@@ -10524,7 +10631,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  const int64_t n_embd_altup;
  const int64_t n_altup;
  const int i_altup_act;
- const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
  const int n_layer_sparsity = 10; // number of layers using activation sparsity
  const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)

@@ -10574,8 +10680,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {

  for (int il = 0; il < n_layer; ++il) {
  // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
- const bool has_kv = (il < n_layer_kv);
-
  const float freq_base_l = model.get_rope_freq_base (cparams, il);
  const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

@@ -10595,7 +10699,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]

  // self-attention
- if (has_kv) {
+ if (hparams.has_kv(il)) {
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  cb(Qcur, "Qcur", il);
@@ -10635,7 +10739,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  model.layers[il].wo, NULL,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  } else {
- // no KV layers
+ // reuse KV cache of earlier layers
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  cb(Qcur, "Qcur", il);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -14116,6 +14220,138 @@ struct llm_build_nemotron : public llm_graph_context {
  }
  };

+ struct llm_build_nemotron_h : public llm_graph_context_mamba {
+ llm_build_nemotron_h(
+ const llama_model & model,
+ const llm_graph_params & params) :
+ llm_graph_context_mamba(params) {
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * inp = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ if (hparams.is_recurrent(il)) {
+ // ssm layer //
+ cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+ } else if (hparams.n_ff(il) == 0) {
+ // attention layer //
+ cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
+ } else {
+ cur = build_ffn_layer(cur, model, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // add residual
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "block_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+
+ ggml_tensor * build_attention_layer(
+ ggml_tensor * cur,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il) {
+
+ // compute Q and K and (optionally) RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ return cur;
+ }
+
+ ggml_tensor * build_ffn_layer(
+ ggml_tensor * cur,
+ const llama_model & model,
+ const int il) {
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ return cur;
+ }
+ };
+
  struct llm_build_exaone : public llm_graph_context {
  llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18240,6 +18476,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  // switch statement
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_JINA_BERT_V3:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_NEO_BERT:
@@ -18256,13 +18493,30 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  if (llm_arch_is_recurrent(arch)) {
  res = new llama_memory_recurrent(
  *this,
- nullptr,
  GGML_TYPE_F32,
  GGML_TYPE_F32,
  cparams.offload_kqv,
  std::max((uint32_t) 1, cparams.n_seq_max),
- cparams.n_seq_max);
+ cparams.n_seq_max,
+ nullptr);
  } else if (llm_arch_is_hybrid(arch)) {
+
+ // The main difference between hybrid architectures is the
+ // layer filters, so pick the right one here
+ llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+ llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+ if (arch == LLM_ARCH_FALCON_H1) {
+ filter_attn = [&](int32_t) { return true; };
+ filter_recr = [&](int32_t) { return true; };
+ } else if (arch == LLM_ARCH_NEMOTRON_H) {
+ filter_attn = [&](int32_t il) {
+ return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+ };
+ filter_recr = [&](int32_t il) {
+ return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+ };
+ }
+
  const auto padding = llama_kv_cache::get_padding(cparams);

  cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18282,8 +18536,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  /* n_seq_max */ cparams.n_seq_max,
  /* offload */ cparams.offload_kqv,
  /* unified */ cparams.kv_unified,
- /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
- /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
+ /* filter_attn */ std::move(filter_attn),
+ /* filter_recr */ std::move(filter_recr));
  } else {
  const auto padding = llama_kv_cache::get_padding(cparams);

@@ -18302,6 +18556,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

+ llama_memory_i::layer_reuse_cb reuse = nullptr;
+
+ if (arch == LLM_ARCH_GEMMA3N) {
+ reuse = [&](int32_t il) {
+ if (il >= (int32_t) hparams.n_layer_kv_from_start) {
+ return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+ }
+
+ return -1;
+ };
+ }
+
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  GGML_ASSERT(hparams.is_swa_any());

@@ -18316,13 +18582,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  n_ctx_per_stream,
  cparams.n_seq_max,
  cparams.n_ubatch,
- padding);
+ padding,
+ nullptr,
+ reuse);
  } else {
  GGML_ASSERT(!hparams.is_swa_any());

  res = new llama_kv_cache(
  *this,
- nullptr,
  params.type_k,
  params.type_v,
  !cparams.flash_attn,
@@ -18332,7 +18599,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  cparams.n_seq_max,
  padding,
  hparams.n_swa,
- hparams.swa_type);
+ hparams.swa_type,
+ nullptr,
+ nullptr);
  }
  }
  }
@@ -18379,6 +18648,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_JINA_BERT_V3:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  {
@@ -18595,6 +18865,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_nemotron>(*this, params);
  } break;
+ case LLM_ARCH_NEMOTRON_H:
+ {
+ llm = std::make_unique<llm_build_nemotron_h>(*this, params);
+ } break;
  case LLM_ARCH_EXAONE:
  {
  llm = std::make_unique<llm_build_exaone>(*this, params);
@@ -18720,7 +18994,7 @@ llama_model_params llama_model_default_params() {
  llama_model_params result = {
  /*.devices =*/ nullptr,
  /*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 0,
+ /*.n_gpu_layers =*/ 999,
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
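
Note on this hunk and the next: the default n_gpu_layers returned by llama_model_default_params() changes from 0 to 999 on every build, and the Metal-only override below is dropped. A minimal caller-side sketch (not part of this package) of how an application could opt back into CPU-only loading; "model.gguf" is a placeholder path:

    #include "llama.h"

    int main() {
        llama_backend_init();

        // the defaults now request full GPU offload (n_gpu_layers = 999) on all builds
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0; // restore the previous non-Metal behaviour

        // "model.gguf" is only an illustrative path
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model != NULL) {
            llama_model_free(model);
        }

        llama_backend_free();
        return 0;
    }
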
@@ -18734,11 +19008,6 @@ llama_model_params llama_model_default_params() {
  /*.use_extra_bufts =*/ true,
  };

- #ifdef GGML_USE_METAL
- // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
- result.n_gpu_layers = 999;
- #endif
-
  return result;
  }

@@ -18830,6 +19099,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_RWKV7:
  case LLM_ARCH_ARWKV7:
  case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_NEMOTRON_H:
  return LLAMA_ROPE_TYPE_NONE;

  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18869,6 +19139,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_GROK:
  case LLM_ARCH_DBRX:
  case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V3:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_STABLELM:
@@ -40,6 +40,7 @@ enum llm_type {
  LLM_TYPE_450M,
  LLM_TYPE_475M,
  LLM_TYPE_537M,
+ LLM_TYPE_558M,
  LLM_TYPE_700M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,
@@ -2470,7 +2470,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  // set attributes by model/tokenizer/architecture name
  if (false
  || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
- || _contains_any(general_arch, {"nomic-bert-moe"})
+ || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
  ) {
  if (token_to_id.count("<mask>") == 0) {
  LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
@@ -25,6 +25,18 @@
  // interface implementation
  //

+ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+ switch (flash_attn_type) {
+ case LLAMA_FLASH_ATTN_TYPE_AUTO:
+ return "auto";
+ case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+ return "disabled";
+ case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+ return "enabled";
+ }
+ GGML_ABORT("fatal error");
+ }
+
  struct llama_sampler_chain_params llama_sampler_chain_default_params() {
  struct llama_sampler_chain_params result = {
  /*.no_perf =*/ true,
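
The hunk above also adds llama_flash_attn_type_name() to llama.cpp. A minimal usage sketch, relying only on the enum values shown in the hunk:

    #include <cstdio>
    #include "llama.h"

    int main() {
        // stringify the flash-attention modes introduced in this release
        std::printf("%s\n", llama_flash_attn_type_name(LLAMA_FLASH_ATTN_TYPE_AUTO));     // "auto"
        std::printf("%s\n", llama_flash_attn_type_name(LLAMA_FLASH_ATTN_TYPE_DISABLED)); // "disabled"
        std::printf("%s\n", llama_flash_attn_type_name(LLAMA_FLASH_ATTN_TYPE_ENABLED));  // "enabled"
        return 0;
    }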