@fugood/llama.node 1.3.0-rc.6 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/CMakeLists.txt +12 -2
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +8 -9
  4. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  5. package/src/llama.cpp/common/arg.cpp +39 -1001
  6. package/src/llama.cpp/common/arg.h +2 -2
  7. package/src/llama.cpp/common/chat.cpp +216 -2
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +33 -0
  10. package/src/llama.cpp/common/common.h +13 -0
  11. package/src/llama.cpp/common/download.cpp +1054 -0
  12. package/src/llama.cpp/common/download.h +55 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
  14. package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
  15. package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  16. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  27. package/src/llama.cpp/include/llama.h +7 -3
  28. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  29. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  30. package/src/llama.cpp/src/llama-arch.h +11 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  32. package/src/llama.cpp/src/llama-batch.h +12 -1
  33. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  34. package/src/llama.cpp/src/llama-chat.h +1 -0
  35. package/src/llama.cpp/src/llama-context.cpp +44 -16
  36. package/src/llama.cpp/src/llama-context.h +5 -5
  37. package/src/llama.cpp/src/llama-cparams.h +1 -0
  38. package/src/llama.cpp/src/llama-graph.cpp +12 -7
  39. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  40. package/src/llama.cpp/src/llama-hparams.h +6 -0
  41. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  42. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
  43. package/src/llama.cpp/src/llama-kv-cache.h +2 -4
  44. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-model.cpp +350 -13194
  48. package/src/llama.cpp/src/llama-model.h +9 -2
  49. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  50. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  51. package/src/llama.cpp/src/llama-vocab.h +1 -0
  52. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  53. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  54. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  55. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  56. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  57. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  58. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  59. package/src/llama.cpp/src/models/bert.cpp +176 -0
  60. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  61. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  62. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  63. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  64. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  65. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  66. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  67. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  68. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  69. package/src/llama.cpp/src/models/deci.cpp +135 -0
  70. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  71. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  72. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  73. package/src/llama.cpp/src/models/dream.cpp +105 -0
  74. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  75. package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
  76. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  77. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  78. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  79. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  80. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  81. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  82. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  83. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  84. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  85. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  86. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  87. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  88. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  89. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  90. package/src/llama.cpp/src/models/granite.cpp +211 -0
  91. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  92. package/src/llama.cpp/src/models/grok.cpp +159 -0
  93. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  94. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  95. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  96. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  97. package/src/llama.cpp/src/models/jais.cpp +86 -0
  98. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  99. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  100. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  101. package/src/llama.cpp/src/models/llada.cpp +99 -0
  102. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  103. package/src/llama.cpp/src/models/llama.cpp +155 -0
  104. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  105. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  106. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  107. package/src/llama.cpp/src/models/models.h +481 -0
  108. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  109. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  110. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  111. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  112. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  113. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  114. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  115. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
  116. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  117. package/src/llama.cpp/src/models/orion.cpp +123 -0
  118. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  119. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  120. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  121. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  122. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  123. package/src/llama.cpp/src/models/plm.cpp +168 -0
  124. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  125. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  126. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  127. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  128. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  129. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  130. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  131. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  132. package/src/llama.cpp/src/models/refact.cpp +94 -0
  133. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  134. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  135. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  136. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  137. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  138. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  139. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  140. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  141. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  142. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  143. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  144. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  145. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  146. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  147. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/src/llama-arch.cpp

@@ -32,6 +32,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_QWEN3, "qwen3" },
     { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+    { LLM_ARCH_QWEN3VL, "qwen3vl" },
+    { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PHIMOE, "phimoe" },
@@ -103,6 +105,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_GROVEMOE, "grovemoe" },
     { LLM_ARCH_APERTUS, "apertus" },
+    { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
+    { LLM_ARCH_COGVLM, "cogvlm" },
+    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -145,6 +150,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
     { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
+    { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -779,6 +785,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN3VLMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -2312,6 +2357,64 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
         },
     },
+    {
+        LLM_ARCH_MINIMAX_M2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        },
+    },
+    {
+        LLM_ARCH_PANGU_EMBED,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_COGVLM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
+            { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
+            { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
+            { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
+            { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2488,6 +2591,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // NextN/MTP tensors are currently ignored (reserved for future MTP support)
     // These tensors only exist in the last layer(s) and are treated as output tensors
     {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
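Each new architecture entry above maps llm_tensor enum values to printf-style GGUF name patterns, where the "%d" slot is filled with the layer index at load time. A minimal illustration of how such a pattern expands (hypothetical helper, not the loader's actual code path in llama-model.cpp):

    // Illustration only (hypothetical helper): expand a "blk.%d.*" pattern with a layer index.
    #include <cstdio>
    #include <string>

    static std::string tensor_name(const char * pattern, int il) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), pattern, il);    // "%d" becomes the block/layer number
        return std::string(buf);
    }

    // tensor_name("blk.%d.attn_qkv", 3)     -> "blk.3.attn_qkv"     (CogVLM fused QKV)
    // tensor_name("blk.%d.vis_attn_qkv", 3) -> "blk.3.vis_attn_qkv" (CogVLM vision-expert QKV)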
package/src/llama.cpp/src/llama-arch.h

@@ -36,6 +36,8 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
+    LLM_ARCH_QWEN3VL,
+    LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
@@ -107,6 +109,9 @@ enum llm_arch {
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_GROVEMOE,
     LLM_ARCH_APERTUS,
+    LLM_ARCH_MINIMAX_M2,
+    LLM_ARCH_COGVLM,
+    LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -149,6 +154,7 @@ enum llm_kv {
     LLM_KV_EXPERTS_PER_GROUP,
     LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_NEXTN_PREDICT_LAYERS,
+    LLM_KV_NUM_DEEPSTACK_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -455,6 +461,11 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_VISEXP_ATTN_QKV,
+    LLM_TENSOR_VISEXP_ATTN_OUT,
+    LLM_TENSOR_VISEXP_FFN_GATE,
+    LLM_TENSOR_VISEXP_FFN_DOWN,
+    LLM_TENSOR_VISEXP_FFN_UP,
     LLM_TENSOR_NEXTN_EH_PROJ,
     LLM_TENSOR_NEXTN_EMBED_TOKENS,
     LLM_TENSOR_NEXTN_ENORM,
package/src/llama.cpp/src/llama-batch.cpp

@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
         /*.n_seq_tokens =*/ (uint32_t) 1,
         /*.n_seqs =*/ (uint32_t) batch.n_tokens,
         /*.n_seqs_unq =*/ (uint32_t) this->seq_id_unq.size(),
+        /*.n_pos =*/ n_pos_per_embd,
         /*.token =*/ batch.token,
         /*.embd =*/ batch.embd,
         /*.pos =*/ batch.pos,
@@ -251,46 +252,72 @@ bool llama_batch_allocr::init(
     // consistency checks
     //
 
-    for (uint32_t s = 0; s < n_seq_max; ++s) {
-        if (seq_pos[s].empty()) {
-            continue;
+    if (n_pos_per_embd > 1) {
+        // M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
+
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+            if (batch.token) {
+                if (p0 >= 0 && p0 >= seq_pos_min(s)) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " for M-RoPE, it is required that the position satisfies: X < Y\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
+                }
+            } else {
+                // embedding inputs can have overlapping positions
+                if (p0 >= 0 && p0 > seq_pos_min(s)) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " for M-RoPE, it is required that the position satisfies: X <= Y\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
+                }
+            }
         }
+    } else {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
 
-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
 
-        if (p0 >= 0) {
-            bool ok = true;
+            if (p0 >= 0) {
+                bool ok = true;
 
-            if (batch.token) {
                 if (seq_pos_min(s) != p0 + 1) {
                     ok = false;
                 }
-            } else {
-                assert(batch.embd);
 
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
+                if (!ok) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
                 }
             }
 
-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                    " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                    __func__, s, s, p0, s, seq_pos_min(s));
-
+            if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+                LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
                 return false;
             }
         }
-
-        if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
-            LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
-            return false;
-        }
     }
 
     if (memory) {
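The rewritten check now has two paths. With M-RoPE (n_pos_per_embd > 1), a token batch must start strictly after the last cached position (X < Y) and may otherwise jump forward, while an embedding batch may overlap the last position (X <= Y); without M-RoPE, positions must stay consecutive (Y = X + 1) and cover a contiguous range. A standalone sketch of that rule, using hypothetical helpers rather than the library code:

    // Sketch only: x = last position already in the KV cache for the sequence (-1 if empty),
    // y = smallest position of that sequence in the incoming batch.
    enum class batch_kind { text_tokens, embeddings };

    static bool positions_ok_mrope(long long x, long long y, batch_kind k) {
        if (x < 0) return true;                        // nothing cached yet
        return k == batch_kind::text_tokens ? x < y    // tokens must move strictly forward
                                            : x <= y;  // embeddings may repeat the last position
    }

    static bool positions_ok_regular(long long x, long long y) {
        return x < 0 || y == x + 1;                    // positions must stay consecutive
    }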
@@ -389,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
         /*.n_seq_tokens =*/ n_seq_tokens,
         /*.n_seqs =*/ n_seqs,
         /*.n_seqs_unq =*/ n_seqs,
+        /*.n_pos =*/ n_pos_per_embd,
 
         /*.token =*/ udata->token.data(),
         /*.embd =*/ nullptr,
@@ -655,10 +683,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
-
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
-    const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;
+    const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
 
     udata->token .resize(n_tokens);
     udata->embd .resize(n_embd_all);
@@ -680,8 +706,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
             memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
         }
 
-        for (int j = 0; j < n_pos_cur; ++j) {
-            udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
+        for (size_t j = 0; j < (size_t)n_pos_per_embd; ++j) {
+            // if we are using M-RoPE
+            //   if the current batch is text, we need to broadcast the same position across all RoPE sections
+            //   otherwise, the input batch is image embeddings, we copy the positions as-is
+            // if we are not using M-RoPE, there is only one position per token (this loop runs only once)
+            size_t src_off = batch.token ? 0 : j*batch.n_tokens;
+            udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
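The position copy above distinguishes the source layout: a text batch carries one position per token, which is broadcast to every RoPE section, while an embedding (image) batch supplies n_pos_per_embd planes stored plane-major in batch.pos. A small illustrative sketch of the same gather, with hypothetical names:

    // Sketch of the gather for a single selected token (hypothetical helper).
    // src_pos is plane-major: plane j of an embedding batch starts at j*n_batch_tokens.
    #include <cstdint>
    #include <vector>

    static std::vector<int32_t> gather_pos(const std::vector<int32_t> & src_pos,
                                           int n_batch_tokens, int n_pos, int idx, bool is_text) {
        std::vector<int32_t> dst(n_pos);
        for (int j = 0; j < n_pos; ++j) {
            const int src_off = is_text ? 0 : j*n_batch_tokens;  // text: broadcast plane 0
            dst[j] = src_pos[src_off + idx];
        }
        return dst;
    }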
@@ -710,6 +741,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         /*.n_seq_tokens =*/ n_tokens/n_seqs,
         /*.n_seqs =*/ n_seqs,
         /*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),
+        /*.n_pos =*/ n_pos_per_embd,
 
         /*.token =*/ batch.token ? udata->token.data() : nullptr,
         /*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
package/src/llama.cpp/src/llama-batch.h

@@ -17,6 +17,16 @@ struct llama_ubatch {
         return b_equal_seqs != 0;
     }
 
+    // typical for M-RoPE cases:
+    //   0 - sequantial position of the tokens/embeddings in the sequence
+    //   1 - y position in the image
+    //   2 - x position in the image
+    //   3 - other
+    bool is_pos_2d() const {
+        // TODO @ngxson : we may need to check for model arch when more models use >1 positions
+        return n_pos >= 3;
+    }
+
     uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
                            // otherwise address sanitizer complains
     // TODO: whole_seqs for embeddings?
@@ -25,6 +35,7 @@ struct llama_ubatch {
     uint32_t n_seq_tokens; // tokens per sequence set
     uint32_t n_seqs;       // sequence sets in the ubatch
     uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+    uint32_t n_pos;        // number of position inputs for each token/embedding
 
     // seq_id_unq: unique sequence ids in the ubatch
     // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
@@ -33,7 +44,7 @@ struct llama_ubatch {
     //                            // size               | idx | val
     llama_token  *  token;        // [n_tokens]         | i   | id, token
     float        *  embd;         // [n_embd, n_tokens] | i   | embd
-    llama_pos    *  pos;          // [n_tokens]         | i   | pos
+    llama_pos    *  pos;          // [n_tokens*n_pos]   | i   | pos
     int32_t      *  n_seq_id;     // [n_tokens]         | i   | -
     llama_seq_id ** seq_id;       // [n_tokens]         | s   | s0, s1, seq_id
     llama_seq_id *  seq_id_unq;   // [n_seqs_unq]       | s   | seq_id
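With the new n_pos field, ubatch.pos holds n_pos planes of n_tokens entries each, so the value for token i in plane j sits at pos[j*n_tokens + i]; is_pos_2d() reports whether the 2D image planes (y at plane 1, x at plane 2) are present. A one-line accessor sketch (hypothetical, for illustration):

    // How a consumer would index the plane-major position array of a llama_ubatch (sketch).
    #include <cstdint>

    static int32_t ubatch_pos(const int32_t * pos, uint32_t n_tokens, uint32_t i, uint32_t plane) {
        return pos[plane*n_tokens + i];  // planes are stored back-to-back, each n_tokens long
    }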
package/src/llama.cpp/src/llama-chat.cpp

@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
     { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
+    { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -213,6 +214,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_SEED_OSS;
     } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
         return LLM_CHAT_TEMPLATE_GROK_2;
+    } else if (tmpl_contains(LU8("[unused9]系统：[unused10]"))) {
+        return LLM_CHAT_TEMPLATE_PANGU_EMBED;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -813,6 +816,35 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "Assistant:";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PANGU_EMBED) {
+        // [unused9]系统：xxx[unused10]
+        // [unused9]用户：xxx[unused10]
+        // [unused9]助手：xxx[unused10]
+        // ...
+        for (size_t i = 0; i < chat.size(); ++i) {
+            const auto & msg = chat[i];
+            const std::string & role = msg->role;
+            const std::string & content = msg->content;
+
+            if (i == 0 && role != "system") {
+                ss << "[unused9]系统：[unused10]";
+            }
+
+            if (role == "system") {
+                ss << "[unused9]系统：" << content << "[unused10]";
+            } else if (role == "user") {
+                ss << "[unused9]用户：" << content << "[unused10]";
+            } else if (role == "assistant") {
+                ss << "[unused9]助手：" << content << "[unused10]";
+            } else if (role == "tool") {
+                ss << "[unused9]工具：" << content << "[unused10]";
+            } else if (role == "function") {
+                ss << "[unused9]方法：" << content << "[unused10]";
+            }
+        }
+        if (add_ass) {
+            ss << "[unused9]助手：";
+        }
     } else {
         // template not supported
         return -1;
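The new PANGU_EMBED branch wraps every turn in [unused9]...[unused10] markers with a Chinese role label ("系统" system, "用户" user, "助手" assistant, "工具" tool, "方法" function), inserts an empty system turn when the conversation does not start with one, and with add_ass appends the assistant prefix. For a system plus user exchange the rendered prompt therefore looks like:

    [unused9]系统：You are a helpful assistant.[unused10][unused9]用户：Hello[unused10][unused9]助手：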
package/src/llama.cpp/src/llama-chat.h

@@ -53,6 +53,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_GROK_2,
+    LLM_CHAT_TEMPLATE_PANGU_EMBED,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
package/src/llama.cpp/src/llama-context.cpp

@@ -21,6 +21,8 @@ llama_context::llama_context(
               llama_context_params params) :
     model(model),
     balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+    // TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
+    // may need to be backend-dependent
     LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
 
     t_start_us = model.t_start_us;
@@ -112,11 +114,28 @@ llama_context::llama_context(
         }
     }
 
-    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+
+    if (cparams.kv_unified) {
+        cparams.n_ctx_seq = cparams.n_ctx;
+    } else {
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+        cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
+
+        if (cparams.n_ctx_seq == 0) {
+            throw std::runtime_error("n_ctx_seq == 0");
+        }
+
+        if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+            cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+            LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+        }
+    }
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
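The context size is now padded before the per-sequence split. As a worked example (assuming GGML_PAD rounds up to the next multiple of its second argument):

    // Worked example of the rounding above (pad = round up to a multiple of 256).
    #include <cstdint>

    static uint32_t pad256(uint32_t x) { return (x + 255u) / 256u * 256u; }

    // n_ctx = 10000, n_seq_max = 4, kv_unified = false:
    //   n_ctx     -> pad256(10000)     = 10240
    //   n_ctx_seq -> pad256(10240 / 4) = 2560
    //   2560 * 4 == 10240, so n_ctx stays at 10240 and no warning is emitted.
    // With kv_unified = true, n_ctx_seq is simply the padded n_ctx (10240).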
@@ -125,14 +144,14 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
 
-    if (n_ctx_per_seq < hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
-                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    if (cparams.n_ctx_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
-    if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
-                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
     if (!hparams.vocab_only) {
@@ -268,9 +287,7 @@ llama_context::llama_context(
         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
         }
-    }
 
-    if (!hparams.vocab_only) {
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +360,14 @@ llama_context::llama_context(
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
-                throw std::runtime_error("failed to allocate compute pp buffers");
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+                }
+                if (!gf) {
+                    throw std::runtime_error("failed to allocate compute pp buffers");
+                }
             }
 
             n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
@@ -448,8 +472,8 @@ uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
 
-uint32_t llama_context::n_ctx_per_seq() const {
-    return cparams.n_ctx / cparams.n_seq_max;
+uint32_t llama_context::n_ctx_seq() const {
+    return cparams.n_ctx_seq;
 }
 
 uint32_t llama_context::n_batch() const {
@@ -803,7 +827,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     const auto & hparams = model.hparams;
 
-    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd = hparams.n_embd_inp();
     const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
@@ -972,7 +996,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_vocab = vocab.n_tokens();
-    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd = hparams.n_embd_inp();
 
     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;
@@ -2130,7 +2154,7 @@ void llama_context::opt_epoch_iter(
             batch.logits [pos_batch] = true;
         }
 
-        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
+        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
            LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
            return;
        }
@@ -2378,6 +2402,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
     return ctx->n_ctx();
 }
 
+uint32_t llama_n_ctx_seq(const llama_context * ctx) {
+    return ctx->n_ctx_seq();
+}
+
 uint32_t llama_n_batch(const llama_context * ctx) {
     return ctx->n_batch();
 }
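The wrapper above exposes the padded per-sequence context through the public API next to llama_n_ctx(). A minimal usage sketch, assuming an already-created context:

    // Minimal usage sketch (assumes a valid llama_context * created elsewhere).
    #include "llama.h"
    #include <cstdio>

    void print_ctx_sizes(const llama_context * ctx) {
        // total context across all sequences vs. the per-sequence slice after padding
        std::printf("n_ctx     = %u\n", llama_n_ctx(ctx));
        std::printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx));
    }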
package/src/llama.cpp/src/llama-context.h

@@ -43,11 +43,11 @@ struct llama_context {
 
     ggml_backend_sched_t get_sched() const;
 
-    uint32_t n_ctx()         const;
-    uint32_t n_ctx_per_seq() const;
-    uint32_t n_batch()       const;
-    uint32_t n_ubatch()      const;
-    uint32_t n_seq_max()     const;
+    uint32_t n_ctx()     const;
+    uint32_t n_ctx_seq() const;
+    uint32_t n_batch()   const;
+    uint32_t n_ubatch()  const;
+    uint32_t n_seq_max() const;
 
     uint32_t n_threads() const;
     uint32_t n_threads_batch() const;
package/src/llama.cpp/src/llama-cparams.h

@@ -8,6 +8,7 @@
 
 struct llama_cparams {
     uint32_t n_ctx;     // context size used during inference
+    uint32_t n_ctx_seq; // context for a single sequence
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
package/src/llama.cpp/src/llama-graph.cpp

@@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn(
             GGML_ABORT("fatal error");
     }
 
+    //expand here so that we can fuse ffn gate
+    ggml_build_forward_expand(gf, cur);
+
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1006,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
     cb(weights_sum, "ffn_moe_weights_sum", il);
 
-    if (arch == LLM_ARCH_BAILINGMOE2) {
-        weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
-        cb(weights_sum, "ffn_moe_weights_sum_biased", il);
-    }
+    // Avoid division by zero, clamp to smallest number representable by F16
+    weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
+    cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
 
     weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights_norm", il);
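The clamp floor is the smallest normal FP16 value, 2^-14 = 6.103515625e-5, so the subsequent ggml_div can no longer divide by zero even if every selected expert weight underflows; this generalizes the previous BAILINGMOE2-only bias to all MoE architectures. The constant can be checked at compile time:

    // The clamp floor used above is the smallest normal half-precision value: 2^-14.
    static_assert(6.103515625e-5 == 1.0 / 16384.0, "2^-14, smallest normal FP16 value");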
@@ -1091,6 +1093,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
+    //expand here so that we can fuse ffn gate
+    ggml_build_forward_expand(gf, cur);
+
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
@@ -1137,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd = hparams.n_embd_inp();
 
     auto inp = std::make_unique<llm_graph_input_embd>();
 
@@ -1274,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     //     return cur;
     //}
 
-    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd;
+    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
     const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
     cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
@@ -2030,7 +2035,7 @@ int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buck
 
     if (bidirectional) {
         relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
+        relative_position = std::abs(relative_position);
     } else {
         relative_position = -std::min<int32_t>(relative_position, 0);
     }