@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/llama.cpp/src/llama-model.cpp
@@ -463,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  GGML_ASSERT(hparams.n_expert_used == 0);
  }
 
- // zero-out the array hparams
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
 
+ std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+
+ std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
  ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
@@ -571,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
- hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
+
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
 
  switch (hparams.n_expert) {
  case 16: type = LLM_TYPE_17B_16E; break;
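Note on the Llama 4 hunk above: the old n_swa_pattern/n_attn_chunk fields are replaced by hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED plus set_swa_pattern(4), i.e. three out of every four layers use chunked sliding-window attention and every fourth layer stays full attention. A minimal, self-contained C++ sketch of such a layer pattern (the helper below is illustrative only and assumes the "(n-1) SWA : 1 full" reading suggested by the inline comment; it is not the library's implementation):

    #include <cstdio>
    #include <vector>

    // Illustrative only: mark layers as sliding-window (true) or full attention (false)
    // following a "(n_pattern - 1) SWA : 1 full" repeating pattern, as the
    // "3 chunked - 1 full" comment in the diff describes for a pattern of 4.
    static std::vector<bool> make_swa_layers(int n_layer, int n_pattern) {
        std::vector<bool> swa(n_layer, false);
        for (int il = 0; il < n_layer; ++il) {
            // every n_pattern-th layer (il % n_pattern == n_pattern - 1) stays full attention
            swa[il] = (il % n_pattern) < (n_pattern - 1);
        }
        return swa;
    }

    int main() {
        const auto layers = make_swa_layers(8, 4);
        for (size_t il = 0; il < layers.size(); ++il) {
            std::printf("layer %zu: %s\n", il, layers[il] ? "chunked SWA" : "full attention");
        }
        return 0;
    }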
@@ -852,22 +856,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
 
- // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
- if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
- // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
- hparams.n_swa = 2047;
- } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-mini-128k-instruct
- // note: this seems incorrect because the window is bigger than the train context?
- hparams.n_swa = 262144;
- } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-medium-128k-instruct
- // note: this seems incorrect because the window is equal to the train context?
- hparams.n_swa = 131072;
- }
- bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
- if (!found_swa && hparams.n_swa == 0) {
- throw std::runtime_error("invalid value for sliding_window");
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ if (found_swa && hparams.n_swa > 0) {
+ LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+ // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+ hparams.n_swa = 0;
+ hparams.set_swa_pattern(1);
  }
  } break;
  case LLM_ARCH_PHIMOE:
@@ -937,8 +936,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA2:
  {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096; // default value of gemma 2
- hparams.n_swa_pattern = 2;
+ hparams.set_swa_pattern(2);
  hparams.attn_soft_cap = true;
 
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -955,7 +955,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA3:
  {
- hparams.n_swa_pattern = 6;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(6);
 
  hparams.rope_freq_base_train_swa = 10000.0f;
  hparams.rope_freq_scale_train_swa = 1.0f;
@@ -1039,7 +1040,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_COHERE2:
  {
- hparams.n_swa_pattern = 4;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
 
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -4321,7 +4323,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
- LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -4489,7 +4491,17 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
  return it->second;
  }
 
- ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+ float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+ }
+
+ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+ }
+
+ ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+ const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
  // choose long/short freq factors based on the context size
  if (layers[il].rope_freqs != nullptr) {
  return layers[il].rope_freqs;
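The get_rope_freq_base/get_rope_freq_scale helpers added above centralize a per-layer choice that graph builders previously made inline (see the Gemma 3 hunk further down): sliding-window layers keep the train-time SWA RoPE frequency base/scale, while full-attention layers use the context parameters. A small standalone sketch of that selection, using stand-in structs in place of the real llama_hparams/llama_cparams:

    #include <cstdio>

    // Stand-in parameter structs; the real code uses llama_hparams / llama_cparams.
    struct hparams_t {
        float rope_freq_base_train_swa  = 10000.0f;
        float rope_freq_scale_train_swa = 1.0f;
        bool is_swa(int il) const { return il % 6 != 5; }   // e.g. a 5-SWA / 1-full pattern
    };

    struct cparams_t {
        float rope_freq_base  = 1000000.0f;
        float rope_freq_scale = 1.0f;
    };

    // SWA layers keep the train-time SWA frequency base; full-attention layers
    // follow the (possibly user-overridden) context parameters.
    static float rope_freq_base_for_layer(const hparams_t & hp, const cparams_t & cp, int il) {
        return hp.is_swa(il) ? hp.rope_freq_base_train_swa : cp.rope_freq_base;
    }

    int main() {
        hparams_t hp;
        cparams_t cp;
        for (int il = 0; il < 6; ++il) {
            std::printf("layer %d: freq_base = %.0f\n", il, rope_freq_base_for_layer(hp, cp, il));
        }
        return 0;
    }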
@@ -4517,21 +4529,174 @@ struct llm_build_llama : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_llama_iswa : public llm_graph_context {
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
  // temperature tuning
  ggml_tensor * inp_attn_scale = nullptr;
- if (arch == LLM_ARCH_LLAMA4) {
- inp_attn_scale = build_inp_attn_scale();
- }
+ inp_attn_scale = build_inp_attn_scale();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;
 
- bool use_rope = arch == LLM_ARCH_LLAMA4
- ? (il + 1) % hparams.n_no_rope_layer_step != 0
- : true;
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
 
  // norm
  cur = build_norm(inpL,
@@ -4542,7 +4707,7 @@ struct llm_build_llama : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4590,7 +4755,7 @@ struct llm_build_llama : public llm_graph_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
 
- if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+ if (use_rope && hparams.use_kq_norm) {
  // Llama4TextL2Norm
  Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
  Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
@@ -4616,7 +4781,6 @@ struct llm_build_llama : public llm_graph_context {
 
  // feed-forward network (non-MoE)
  if (model.layers[il].ffn_gate_inp == nullptr) {
-
  cur = build_norm(ffn_inp,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, il);
@@ -4629,9 +4793,7 @@ struct llm_build_llama : public llm_graph_context {
  NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, il);
  cb(cur, "ffn_out", il);
-
- } else if (arch == LLM_ARCH_LLAMA4) {
- // llama4 MoE
+ } else {
  ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, il);
@@ -4660,26 +4822,6 @@ struct llm_build_llama : public llm_graph_context {
 
  cur = ggml_add(ctx0, moe_out, shexp_out);
  cb(cur, "ffn_moe_out_merged", il);
-
- } else {
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
  }
 
  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -4753,7 +4895,7 @@ struct llm_build_deci : public llm_graph_context {
  } else if (n_head > 0) {
  // self-attention
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -7202,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
  }
  };
 
+ template<bool iswa>
  struct llm_build_phi3 : public llm_graph_context {
  llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -7217,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_unified_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv_unified();
+ }
 
  for (int il = 0; il < n_layer; ++il) {
  auto * residual = inpL;
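The llm_build_phi3 builder is now templated on whether interleaved SWA is active, so the attention-input type is chosen at compile time via std::conditional_t and if constexpr rather than a runtime branch inside the graph builder. A self-contained sketch of the same pattern, with placeholder input types standing in for llm_graph_input_attn_kv_unified and llm_graph_input_attn_kv_unified_iswa:

    #include <cstdio>
    #include <type_traits>

    // Placeholder input types; the real code uses llm_graph_input_attn_kv_unified(_iswa).
    struct attn_inp_unified      { const char * name() const { return "unified"; } };
    struct attn_inp_unified_iswa { const char * name() const { return "unified_iswa"; } };

    template <bool iswa>
    void build_graph() {
        // Select the input type at compile time; only the matching branch is instantiated.
        using inp_attn_type = std::conditional_t<iswa, attn_inp_unified_iswa, attn_inp_unified>;
        inp_attn_type inp_attn;

        if constexpr (iswa) {
            std::printf("building with %s attention input (SWA layers present)\n", inp_attn.name());
        } else {
            std::printf("building with %s attention input (no SWA)\n", inp_attn.name());
        }
    }

    int main() {
        build_graph<true>();   // e.g. a model with a sliding window configured
        build_graph<false>();  // e.g. the same architecture with SWA disabled
        return 0;
    }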
@@ -7225,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  ggml_tensor* attn_norm_output = build_norm(inpL,
  model.layers[il].attn_norm,
@@ -7977,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;
 
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  // norm
  cur = build_norm(inpL,
@@ -8277,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
  }
  };
 
- struct llm_build_gemma2 : public llm_graph_context {
- llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ struct llm_build_gemma2_iswa : public llm_graph_context {
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_k;
 
  ggml_tensor * cur;
@@ -8292,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
  for (int il = 0; il < n_layer; ++il) {
  // norm
@@ -8414,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
  }
  };
 
- struct llm_build_gemma3 : public llm_graph_context {
- llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ struct llm_build_gemma3_iswa : public llm_graph_context {
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_k;
 
  ggml_tensor * cur;
@@ -8433,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
  ggml_tensor * inp_pos = build_inp_pos();
 
  // TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
  for (int il = 0; il < n_layer; ++il) {
- const bool is_swa = hparams.is_swa(il);
-
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
  // norm
  cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -9016,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
  }
  };
 
- struct llm_build_cohere2 : public llm_graph_context {
- llm_build_cohere2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ struct llm_build_cohere2_iswa : public llm_graph_context {
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
 
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9032,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
  for (int il = 0; il < n_layer; ++il) {
  const bool is_swa = hparams.is_swa(il);
@@ -9045,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9983,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11347,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12263,7 +12411,7 @@ struct llm_build_granite : public llm_graph_context {
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
  if (use_rope) {
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -12916,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13044,6 +13192,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  case LLM_ARCH_JINA_BERT_V2:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_WAVTOKENIZER_DEC:
  {
  res = nullptr;
  } break;
@@ -13058,7 +13207,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  GGML_TYPE_F32,
  GGML_TYPE_F32,
  cparams.offload_kqv,
- std::max((uint32_t) 1, cparams.n_seq_max));
+ std::max((uint32_t) 1, cparams.n_seq_max),
+ cparams.n_seq_max);
  } break;
  default:
  {
@@ -13068,14 +13218,36 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
- res = new llama_kv_cache_unified(
- *this,
- params.type_k,
- params.type_v,
- !cparams.flash_attn,
- cparams.offload_kqv,
- cparams.n_ctx,
- padding);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ GGML_ASSERT(hparams.is_swa_any());
+
+ res = new llama_kv_cache_unified_iswa(
+ *this,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ params.swa_full,
+ cparams.n_ctx,
+ cparams.n_seq_max,
+ cparams.n_batch,
+ padding);
+ } else {
+ GGML_ASSERT(!hparams.is_swa_any());
+
+ res = new llama_kv_cache_unified(
+ *this,
+ nullptr,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ cparams.n_ctx,
+ cparams.n_seq_max,
+ padding,
+ hparams.n_swa,
+ hparams.swa_type);
+ }
  }
  }
 
@@ -13090,11 +13262,14 @@ llm_graph_result_ptr llama_model::build_graph(
 
  switch (arch) {
  case LLM_ARCH_LLAMA:
- case LLM_ARCH_LLAMA4:
  case LLM_ARCH_MINICPM:
  {
  llm = std::make_unique<llm_build_llama>(*this, params, gf);
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+ } break;
  case LLM_ARCH_DECI:
  {
  llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13169,7 +13344,11 @@ llm_graph_result_ptr llama_model::build_graph(
  case LLM_ARCH_PHI3:
  case LLM_ARCH_PHIMOE:
  {
- llm = std::make_unique<llm_build_phi3>(*this, params, gf);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+ } else {
+ llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+ }
  } break;
  case LLM_ARCH_PLAMO:
  {
@@ -13201,11 +13380,11 @@ llm_graph_result_ptr llama_model::build_graph(
  } break;
  case LLM_ARCH_GEMMA2:
  {
- llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_GEMMA3:
  {
- llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_STARCODER2:
  {
@@ -13225,7 +13404,7 @@ llm_graph_result_ptr llama_model::build_graph(
  } break;
  case LLM_ARCH_COHERE2:
  {
- llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
+ llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_DBRX:
  {
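The dispatch pattern repeats across the hunks above: create_memory() picks llama_kv_cache_unified_iswa whenever hparams.swa_type is not LLAMA_SWA_TYPE_NONE, and build_graph() routes the same architectures to dedicated *_iswa builders (or the llm_build_phi3<true> instantiation). A reduced sketch of driving both choices from one predicate, using hypothetical stand-in classes rather than the real llama.cpp types:

    #include <cstdio>
    #include <memory>

    // Hypothetical stand-ins for the real builder / cache pairings.
    enum class swa_type { none, standard, chunked };

    struct graph_builder {
        virtual ~graph_builder() = default;
        virtual const char * kind() const = 0;
    };
    struct builder_full : graph_builder {
        const char * kind() const override { return "full-attention builder + unified KV cache"; }
    };
    struct builder_iswa : graph_builder {
        const char * kind() const override { return "iSWA builder + unified_iswa KV cache"; }
    };

    // The same predicate drives both the graph builder and the KV-cache choice,
    // so the two always stay consistent.
    static std::unique_ptr<graph_builder> make_builder(swa_type t) {
        if (t != swa_type::none) {
            return std::make_unique<builder_iswa>();
        }
        return std::make_unique<builder_full>();
    }

    int main() {
        std::printf("%s\n", make_builder(swa_type::chunked)->kind());
        std::printf("%s\n", make_builder(swa_type::none)->kind());
        return 0;
    }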
package/src/llama.cpp/src/llama-model.h
@@ -398,7 +398,10 @@ struct llama_model {
 
  const struct ggml_tensor * get_tensor(const char * name) const;
 
- ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+ float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+ float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+
+ ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
  // note: can mutate `cparams`
  // TODO: move this to new llm_arch_model_i interface
package/src/llama.cpp/tests/test-arg-parser.cpp
@@ -128,7 +128,7 @@ int main(void) {
 
  if (common_has_curl()) {
  printf("test-arg-parser: test curl-related functions\n\n");
- const char * GOOD_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/README.md";
+ const char * GOOD_URL = "https://ggml.ai/";
  const char * BAD_URL = "https://www.google.com/404";
  const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin";
 
package/src/llama.cpp/tools/llama-bench/llama-bench.cpp
@@ -991,6 +991,7 @@ struct cmd_params_instance {
  cparams.flash_attn = flash_attn;
  cparams.embeddings = embeddings;
  cparams.op_offload = !no_op_offload;
+ cparams.swa_full = false;
 
  return cparams;
  }