@fugood/llama.node 1.1.10 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/lib/binding.ts +2 -1
  2. package/package.json +14 -14
  3. package/src/LlamaContext.cpp +17 -1
  4. package/src/llama.cpp/common/arg.cpp +29 -19
  5. package/src/llama.cpp/common/chat.cpp +152 -1
  6. package/src/llama.cpp/common/chat.h +1 -0
  7. package/src/llama.cpp/common/common.cpp +10 -3
  8. package/src/llama.cpp/common/common.h +4 -1
  9. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  13. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  14. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
  20. package/src/llama.cpp/include/llama.h +27 -1
  21. package/src/llama.cpp/src/llama-adapter.cpp +68 -4
  22. package/src/llama.cpp/src/llama-adapter.h +3 -0
  23. package/src/llama.cpp/src/llama-arch.cpp +46 -2
  24. package/src/llama.cpp/src/llama-arch.h +4 -0
  25. package/src/llama.cpp/src/llama-context.cpp +80 -39
  26. package/src/llama.cpp/src/llama-context.h +0 -4
  27. package/src/llama.cpp/src/llama-graph.cpp +20 -10
  28. package/src/llama.cpp/src/llama-graph.h +2 -1
  29. package/src/llama.cpp/src/llama-impl.h +2 -0
  30. package/src/llama.cpp/src/llama-kv-cache.cpp +32 -97
  31. package/src/llama.cpp/src/llama-kv-cache.h +3 -13
  32. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  33. package/src/llama.cpp/src/llama-model.cpp +275 -20
  34. package/src/llama.cpp/src/llama-model.h +1 -0
  35. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  36. package/src/llama.cpp/src/llama.cpp +12 -0
package/src/llama.cpp/src/llama-model-loader.cpp
@@ -788,6 +788,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {
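
Note (illustrative, not part of the diff): the new LLAMA_LOG_DEBUG call above emits one line per tensor while a model loads. A minimal sketch of how an embedder of this package could surface those lines, assuming a build where debug-level logging is compiled in; the callback name is hypothetical, but llama_log_set and the ggml_log_callback signature come from llama.h:

    #include <cstdio>
    #include "llama.h"

    // Forward only debug-level messages (e.g. "loading tensor ...") to stderr.
    static void debug_only_logger(enum ggml_log_level level, const char * text, void * /*user_data*/) {
        if (level == GGML_LOG_LEVEL_DEBUG) {
            fputs(text, stderr);
        }
    }

    int main() {
        llama_log_set(debug_only_logger, nullptr);
        // ... load a model here; per-tensor debug lines now reach the callback ...
        return 0;
    }
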
package/src/llama.cpp/src/llama-model.cpp
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
+        case LLM_TYPE_558M: return "558M";
         case LLM_TYPE_700M: return "700M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
@@ -772,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        type = LLM_TYPE_558M; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
             {
@@ -1557,6 +1570,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // A layer is recurrent IFF the n_head_kv value is set to 0 and
+                // the n_ff value is set to 0
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 56: type = LLM_TYPE_9B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_EXAONE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
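
Note (illustrative, not part of the diff): the Nemotron-H hunks above and below all rely on the same per-layer rule, so here it is as a standalone sketch; the function and enum names are hypothetical, only the predicates mirror the diff:

    #include <cstdint>

    enum class nemotron_h_block { ssm, attention, mlp };

    // A layer is recurrent (Mamba-2/SSM) iff both n_head_kv and n_ff are 0;
    // it is an attention layer iff only n_ff is 0; otherwise it is an MLP layer.
    static nemotron_h_block classify_layer(uint32_t n_head_kv, uint32_t n_ff) {
        if (n_head_kv == 0 && n_ff == 0) return nemotron_h_block::ssm;
        if (n_ff == 0)                   return nemotron_h_block::attention;
        return nemotron_h_block::mlp;
    }
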
@@ -2631,6 +2665,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_JINA_BERT_V3:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
@@ -2666,24 +2701,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
 
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
                     layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                     layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
 
                     if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                         layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                     } else {
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
-                        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-                        } else {
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        if (arch == LLM_ARCH_NOMIC_BERT) {
                             layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         }
                     }
@@ -4676,6 +4709,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                 }
             } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                // mamba2 Mixer SSM params
+                // NOTE: int64_t for tensor dimensions
+                const int64_t d_conv = hparams.ssm_d_conv;
+                const int64_t d_inner = hparams.ssm_d_inner;
+                const int64_t d_state = hparams.ssm_d_state;
+                const int64_t n_ssm_head = hparams.ssm_dt_rank;
+                const int64_t n_group = hparams.ssm_n_group;
+                const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+                // embeddings
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                {
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    // all blocks use the attn norm
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    if (hparams.is_recurrent(i)) {
+                        // ssm layers
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+                        // no "weight" suffix for these
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+                        // out_proj
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                    } else if (hparams.n_ff(i) == 0) {
+                        // attention layers (with optional bias)
+                        const int64_t n_head_i = hparams.n_head(i);
+                        const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+                        const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    } else {
+                        // mlp layers
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                    }
+                }
+            } break;
         case LLM_ARCH_EXAONE:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5850,7 +5952,8 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_JAMBA ||
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
-        arch == LLM_ARCH_GRANITE_HYBRID) {
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -7461,7 +7564,7 @@ struct llm_build_bert : public llm_graph_context {
             }
 
             // RoPE
-            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
                 Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7520,7 +7623,7 @@ struct llm_build_bert : public llm_graph_context {
                         0.0f,
                         LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
                 cb(cur, "ffn_moe_out", il);
-            } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+            } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
                 cur = build_ffn(cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         NULL, NULL, NULL,
@@ -14117,6 +14220,138 @@ struct llm_build_nemotron : public llm_graph_context {
     }
 };
 
+struct llm_build_nemotron_h : public llm_graph_context_mamba {
+    llm_build_nemotron_h(
+            const llama_model & model,
+            const llm_graph_params & params) :
+        llm_graph_context_mamba(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        auto * inp = build_inp_mem_hybrid();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            if (hparams.is_recurrent(il)) {
+                // ssm layer //
+                cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+            } else if (hparams.n_ff(il) == 0) {
+                // attention layer //
+                cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
+            } else {
+                cur = build_ffn_layer(cur, model, il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // add residual
+            cur = ggml_add(ctx0, cur, inpSA);
+            cb(cur, "block_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    ggml_tensor * build_attention_layer(
+              ggml_tensor             * cur,
+              llm_graph_input_attn_kv * inp_attn,
+        const llama_model             & model,
+        const int64_t                   n_embd_head,
+        const int                       il) {
+
+        // compute Q and K and (optionally) RoPE them
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, model.layers[il].bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+        return cur;
+    }
+
+    ggml_tensor * build_ffn_layer(
+              ggml_tensor * cur,
+        const llama_model & model,
+        const int           il) {
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        return cur;
+    }
+};
+
 struct llm_build_exaone : public llm_graph_context {
     llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18241,6 +18476,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         // switch statement
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
@@ -18264,6 +18500,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     cparams.n_seq_max,
                     nullptr);
             } else if (llm_arch_is_hybrid(arch)) {
+
+                // The main difference between hybrid architectures is the
+                // layer filters, so pick the right one here
+                llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+                llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+                if (arch == LLM_ARCH_FALCON_H1) {
+                    filter_attn = [&](int32_t) { return true; };
+                    filter_recr = [&](int32_t) { return true; };
+                } else if (arch == LLM_ARCH_NEMOTRON_H) {
+                    filter_attn = [&](int32_t il) {
+                        return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                    };
+                    filter_recr = [&](int32_t il) {
+                        return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                    };
+                }
+
                 const auto padding = llama_kv_cache::get_padding(cparams);
 
                 cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18283,8 +18536,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         /* n_seq_max */ cparams.n_seq_max,
                         /* offload */ cparams.offload_kqv,
                         /* unified */ cparams.kv_unified,
-                        /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
-                        /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
+                        /* filter_attn */ std::move(filter_attn),
+                        /* filter_recr */ std::move(filter_recr));
             } else {
                 const auto padding = llama_kv_cache::get_padding(cparams);
 
@@ -18395,6 +18648,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
             {
@@ -18611,6 +18865,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_nemotron>(*this, params);
             } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
+            } break;
         case LLM_ARCH_EXAONE:
             {
                 llm = std::make_unique<llm_build_exaone>(*this, params);
@@ -18736,7 +18994,7 @@ llama_model_params llama_model_default_params() {
     llama_model_params result = {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
-        /*.n_gpu_layers =*/ 0,
+        /*.n_gpu_layers =*/ 999,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu =*/ 0,
        /*.tensor_split =*/ nullptr,
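
Note (illustrative, not part of the diff): this hunk and the next one change llama_model_default_params() so that full GPU offload (n_gpu_layers = 999) is the default on every backend instead of only under GGML_USE_METAL. A minimal sketch of opting back out, assuming the public llama.h API shipped with this release; "model.gguf" is a placeholder path:

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0; // keep all layers on the CPU, as before on non-Metal builds

        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model != nullptr) {
            llama_model_free(model);
        }
        return 0;
    }
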
@@ -18750,11 +19008,6 @@ llama_model_params llama_model_default_params() {
         /*.use_extra_bufts =*/ true,
     };
 
-#ifdef GGML_USE_METAL
-    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
-    result.n_gpu_layers = 999;
-#endif
-
     return result;
 }
 
@@ -18846,6 +19099,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
+        case LLM_ARCH_NEMOTRON_H:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18885,6 +19139,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V3:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
package/src/llama.cpp/src/llama-model.h
@@ -40,6 +40,7 @@ enum llm_type {
     LLM_TYPE_450M,
     LLM_TYPE_475M,
     LLM_TYPE_537M,
+    LLM_TYPE_558M,
     LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
package/src/llama.cpp/src/llama-vocab.cpp
@@ -2470,7 +2470,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     // set attributes by model/tokenizer/architecture name
     if (false
             || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
-            || _contains_any(general_arch, {"nomic-bert-moe"})
+            || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
             ) {
         if (token_to_id.count("<mask>") == 0) {
             LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
package/src/llama.cpp/src/llama.cpp
@@ -25,6 +25,18 @@
 // interface implementation
 //
 
+const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+    switch (flash_attn_type) {
+        case LLAMA_FLASH_ATTN_TYPE_AUTO:
+            return "auto";
+        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+            return "disabled";
+        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+            return "enabled";
+    }
+    GGML_ABORT("fatal error");
+}
+
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
         /*.no_perf =*/ true,
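
Note (illustrative, not part of the diff): the new llama_flash_attn_type_name() helper maps the flash-attention enum to a printable string; a minimal usage sketch, assuming the matching declaration is exported by this release's llama.h (include/llama.h also changed in this version):

    #include <cstdio>
    #include "llama.h"

    int main() {
        // Prints "auto" and "disabled" respectively.
        printf("flash_attn: %s\n", llama_flash_attn_type_name(LLAMA_FLASH_ATTN_TYPE_AUTO));
        printf("flash_attn: %s\n", llama_flash_attn_type_name(LLAMA_FLASH_ATTN_TYPE_DISABLED));
        return 0;
    }
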