@fugood/llama.node 1.1.10 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +20 -2
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +174 -388
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +67 -37
  27. package/src/llama.cpp/common/chat.cpp +263 -2
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.cpp +10 -3
  30. package/src/llama.cpp/common/common.h +5 -2
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  39. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  40. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  45. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  46. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
  53. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
  54. package/src/llama.cpp/include/llama.h +32 -7
  55. package/src/llama.cpp/src/llama-adapter.cpp +101 -4
  56. package/src/llama.cpp/src/llama-adapter.h +6 -0
  57. package/src/llama.cpp/src/llama-arch.cpp +69 -2
  58. package/src/llama.cpp/src/llama-arch.h +6 -0
  59. package/src/llama.cpp/src/llama-context.cpp +92 -45
  60. package/src/llama.cpp/src/llama-context.h +1 -5
  61. package/src/llama.cpp/src/llama-graph.cpp +74 -19
  62. package/src/llama.cpp/src/llama-graph.h +10 -1
  63. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  64. package/src/llama.cpp/src/llama-hparams.h +9 -3
  65. package/src/llama.cpp/src/llama-impl.h +2 -0
  66. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
  67. package/src/llama.cpp/src/llama-kv-cache.h +4 -13
  68. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  69. package/src/llama.cpp/src/llama-model.cpp +434 -21
  70. package/src/llama.cpp/src/llama-model.h +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  72. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  73. package/src/llama.cpp/src/llama.cpp +12 -0
  74. package/src/anyascii.c +0 -22223
  75. package/src/anyascii.h +0 -42
  76. package/src/tts_utils.cpp +0 -371
  77. package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama-model.cpp

@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
  case LLM_TYPE_475M: return "475M";
+ case LLM_TYPE_558M: return "558M";
  case LLM_TYPE_700M: return "700M";
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
@@ -772,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 24:
+ type = LLM_TYPE_558M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  {
@@ -1097,7 +1110,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

  switch (hparams.n_layer) {
- case 18: type = LLM_TYPE_537M; break;
+ case 18: type = LLM_TYPE_270M; break;
  case 26: type = LLM_TYPE_1B; break;
  case 34: type = LLM_TYPE_4B; break;
  case 48: type = LLM_TYPE_12B; break;
@@ -1129,6 +1142,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+ hparams.set_swa_pattern(6);
+
+ hparams.causal_attn = false; // embeddings do not use causal attention
+ hparams.rope_freq_base_train_swa = 10000.0f;
+ hparams.rope_freq_scale_train_swa = 1.0f;
+
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_0_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+
+ } break;
  case LLM_ARCH_STARCODER2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
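The new LLM_ARCH_GEMMA_EMBEDDING branch fixes the attention scale at load time instead of relying on the usual 1/sqrt(head_dim) computed at attention-build time (the graph builder added further down passes 1.0f as the kq scale and instead scales Q by hparams.f_attention_scale). A minimal standalone sketch of that arithmetic, assuming an illustrative head size of 256 that is not stated anywhere in this diff:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // illustrative value only; the real n_embd_head_k comes from the GGUF hparams
    const float n_embd_head_k = 256.0f;

    // mirrors: hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k))
    const float f_attention_scale = 1.0f / std::sqrt(n_embd_head_k);

    // with head size 256 this is 1/16 = 0.0625, applied to Q before the KQ product
    std::printf("f_attention_scale = %.4f\n", f_attention_scale);
    return 0;
}
```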
@@ -1557,6 +1590,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_NEMOTRON_H:
+ {
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // A layer is recurrent IFF the n_head_kv value is set to 0 and
+ // the n_ff value is set to 0
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 56: type = LLM_TYPE_9B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_EXAONE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
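The Nemotron-H branch encodes the hybrid layout purely in per-layer metadata: a layer is recurrent iff both n_head_kv(i) and n_ff(i) are zero, attention-only layers have n_ff(i) == 0 with a nonzero KV head count, and everything else is a plain MLP block. The same test reappears in load_tensors and in llm_build_nemotron_h further down. A self-contained sketch of that three-way split, using hypothetical per-layer values rather than real model metadata:

```cpp
#include <cstdint>
#include <cstdio>

// illustrative classifier only; the real code keys off
// hparams.recurrent_layer_arr[il] and hparams.n_ff(il) as shown in the diff
enum class layer_kind { ssm, attention, ffn };

static layer_kind classify_layer(uint32_t n_head_kv, uint32_t n_ff) {
    if (n_head_kv == 0 && n_ff == 0) {
        return layer_kind::ssm;        // recurrent (Mamba-2) block
    }
    if (n_ff == 0) {
        return layer_kind::attention;  // attention block without an MLP
    }
    return layer_kind::ffn;            // plain MLP block
}

int main() {
    // hypothetical per-layer metadata, not taken from a real model
    const uint32_t n_head_kv[] = {0, 8, 0};
    const uint32_t n_ff[]      = {0, 0, 3072};

    for (int il = 0; il < 3; ++il) {
        std::printf("layer %d -> kind %d\n", il, (int) classify_layer(n_head_kv[il], n_ff[il]));
    }
    return 0;
}
```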
@@ -2631,6 +2685,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_JINA_BERT_V3:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
@@ -2666,24 +2721,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

  layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

  if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  } else {
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
- if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- } else {
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ if (arch == LLM_ARCH_NOMIC_BERT) {
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  }
  }
@@ -3451,6 +3504,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_GEMMA3:
+ case LLM_ARCH_GEMMA_EMBEDDING:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -4676,6 +4730,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  }
  } break;
+ case LLM_ARCH_NEMOTRON_H:
+ {
+ // mamba2 Mixer SSM params
+ // NOTE: int64_t for tensor dimensions
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_ssm_head = hparams.ssm_dt_rank;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+ // embeddings
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // all blocks use the attn norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.is_recurrent(i)) {
+ // ssm layers
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ } else if (hparams.n_ff(i) == 0) {
+ // attention layers (with optional bias)
+ const int64_t n_head_i = hparams.n_head(i);
+ const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+ const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ } else {
+ // mlp layers
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+ }
+ }
+ } break;
  case LLM_ARCH_EXAONE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5850,7 +5973,8 @@ void llama_model::print_info() const {
  arch == LLM_ARCH_JAMBA ||
  arch == LLM_ARCH_FALCON_H1 ||
  arch == LLM_ARCH_PLAMO2 ||
- arch == LLM_ARCH_GRANITE_HYBRID) {
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_NEMOTRON_H) {
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -7461,7 +7585,7 @@ struct llm_build_bert : public llm_graph_context {
  }

  // RoPE
- if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7520,7 +7644,7 @@ struct llm_build_bert : public llm_graph_context {
  0.0f,
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
  cb(cur, "ffn_moe_out", il);
- } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  cur = build_ffn(cur,
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  NULL, NULL, NULL,
@@ -10942,6 +11066,137 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  }
  };

+ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
+ llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+ if (ubatch.token) {
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  // TODO: move up next to build_starcoder
  struct llm_build_starcoder2 : public llm_graph_context {
  llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
@@ -14117,6 +14372,138 @@ struct llm_build_nemotron : public llm_graph_context {
  }
  };

+ struct llm_build_nemotron_h : public llm_graph_context_mamba {
+ llm_build_nemotron_h(
+ const llama_model & model,
+ const llm_graph_params & params) :
+ llm_graph_context_mamba(params) {
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ auto * inp = build_inp_mem_hybrid();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ if (hparams.is_recurrent(il)) {
+ // ssm layer //
+ cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+ } else if (hparams.n_ff(il) == 0) {
+ // attention layer //
+ cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
+ } else {
+ cur = build_ffn_layer(cur, model, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // add residual
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "block_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+
+ ggml_tensor * build_attention_layer(
+ ggml_tensor * cur,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il) {
+
+ // compute Q and K and (optionally) RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ return cur;
+ }
+
+ ggml_tensor * build_ffn_layer(
+ ggml_tensor * cur,
+ const llama_model & model,
+ const int il) {
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ return cur;
+ }
+ };
+
  struct llm_build_exaone : public llm_graph_context {
  llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18241,10 +18628,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  // switch statement
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_JINA_BERT_V3:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_NEO_BERT:
  case LLM_ARCH_WAVTOKENIZER_DEC:
+ //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
  {
@@ -18264,6 +18653,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  cparams.n_seq_max,
  nullptr);
  } else if (llm_arch_is_hybrid(arch)) {
+
+ // The main difference between hybrid architectures is the
+ // layer filters, so pick the right one here
+ llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+ llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+ if (arch == LLM_ARCH_FALCON_H1) {
+ filter_attn = [&](int32_t) { return true; };
+ filter_recr = [&](int32_t) { return true; };
+ } else if (arch == LLM_ARCH_NEMOTRON_H) {
+ filter_attn = [&](int32_t il) {
+ return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+ };
+ filter_recr = [&](int32_t il) {
+ return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+ };
+ }
+
  const auto padding = llama_kv_cache::get_padding(cparams);

  cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18283,8 +18689,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  /* n_seq_max */ cparams.n_seq_max,
  /* offload */ cparams.offload_kqv,
  /* unified */ cparams.kv_unified,
- /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
- /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
+ /* filter_attn */ std::move(filter_attn),
+ /* filter_recr */ std::move(filter_recr));
  } else {
  const auto padding = llama_kv_cache::get_padding(cparams);

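The hybrid-memory change replaces the inline Falcon-H1-only ternaries with named filter_attn / filter_recr callbacks, so Nemotron-H can route only its attention layers into the KV cache and only its Mamba layers into the recurrent cache. A self-contained sketch (hypothetical stand-in types, not the llama_memory_hybrid implementation) of how filter callbacks of this shape partition the layers between the two caches:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// hypothetical stand-in for llama_memory_hybrid::layer_filter_cb
using layer_filter_cb = std::function<bool(int32_t il)>;

// collect the layer indices a given cache would be responsible for
static std::vector<int32_t> select_layers(int32_t n_layer, const layer_filter_cb & filter) {
    std::vector<int32_t> out;
    for (int32_t il = 0; il < n_layer; ++il) {
        if (!filter || filter(il)) { // a null filter means "all layers"
            out.push_back(il);
        }
    }
    return out;
}

int main() {
    // hypothetical 6-layer hybrid: layers 0, 2, 4 recurrent; layer 1 attention; 3, 5 MLP
    const std::vector<bool>     is_recurrent = {true, false, true, false, true, false};
    const std::vector<uint32_t> n_ff         = {0,    0,     0,    4096,  0,    4096};

    layer_filter_cb filter_attn = [&](int32_t il) { return !is_recurrent[il] && n_ff[il] == 0; };
    layer_filter_cb filter_recr = [&](int32_t il) { return  is_recurrent[il] && n_ff[il] == 0; };

    std::printf("attn layers: %zu, recurrent layers: %zu\n",
                select_layers(6, filter_attn).size(),
                select_layers(6, filter_recr).size());
    return 0;
}
```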
@@ -18395,6 +18801,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_JINA_BERT_V3:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  {
@@ -18507,6 +18914,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
  } break;
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ {
+ llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+ } break;
  case LLM_ARCH_STARCODER2:
  {
  llm = std::make_unique<llm_build_starcoder2>(*this, params);
@@ -18611,6 +19022,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_nemotron>(*this, params);
  } break;
+ case LLM_ARCH_NEMOTRON_H:
+ {
+ llm = std::make_unique<llm_build_nemotron_h>(*this, params);
+ } break;
  case LLM_ARCH_EXAONE:
  {
  llm = std::make_unique<llm_build_exaone>(*this, params);
@@ -18736,7 +19151,7 @@ llama_model_params llama_model_default_params() {
  llama_model_params result = {
  /*.devices =*/ nullptr,
  /*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 0,
+ /*.n_gpu_layers =*/ 999,
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
@@ -18750,11 +19165,6 @@ llama_model_params llama_model_default_params() {
  /*.use_extra_bufts =*/ true,
  };

- #ifdef GGML_USE_METAL
- // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
- result.n_gpu_layers = 999;
- #endif
-
  return result;
  }

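With the GGML_USE_METAL block removed, n_gpu_layers now defaults to 999 (offload everything) on every backend rather than only on Metal. A minimal sketch using the public llama.h API, assuming a model path passed on the command line, showing how a caller that wants the previous CPU-only default can still override it explicitly:

```cpp
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // the new default is 999 (offload all layers); set 0 explicitly to keep everything on the CPU
    mparams.n_gpu_layers = 0;

    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```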
@@ -18846,6 +19256,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_RWKV7:
  case LLM_ARCH_ARWKV7:
  case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_NEMOTRON_H:
  return LLAMA_ROPE_TYPE_NONE;

  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18885,6 +19296,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_GROK:
  case LLM_ARCH_DBRX:
  case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V3:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_STABLELM:
@@ -18906,6 +19318,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_GEMMA2:
  case LLM_ARCH_GEMMA3:
  case LLM_ARCH_GEMMA3N:
+ case LLM_ARCH_GEMMA_EMBEDDING:
  case LLM_ARCH_STARCODER2:
  case LLM_ARCH_OPENELM:
  case LLM_ARCH_GPTNEOX:
package/src/llama.cpp/src/llama-model.h

@@ -39,7 +39,7 @@ enum llm_type {
  LLM_TYPE_410M,
  LLM_TYPE_450M,
  LLM_TYPE_475M,
- LLM_TYPE_537M,
+ LLM_TYPE_558M,
  LLM_TYPE_700M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,