@fugood/llama.node 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +8 -8
  3. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  4. package/src/llama.cpp/common/arg.cpp +44 -999
  5. package/src/llama.cpp/common/arg.h +2 -2
  6. package/src/llama.cpp/common/chat.cpp +17 -2
  7. package/src/llama.cpp/common/common.cpp +33 -0
  8. package/src/llama.cpp/common/common.h +15 -1
  9. package/src/llama.cpp/common/download.cpp +1054 -0
  10. package/src/llama.cpp/common/download.h +55 -0
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  12. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
  23. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  24. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  25. package/src/llama.cpp/include/llama.h +7 -3
  26. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  27. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  28. package/src/llama.cpp/src/llama-arch.h +11 -0
  29. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  30. package/src/llama.cpp/src/llama-batch.h +12 -1
  31. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  32. package/src/llama.cpp/src/llama-chat.h +1 -0
  33. package/src/llama.cpp/src/llama-context.cpp +36 -13
  34. package/src/llama.cpp/src/llama-context.h +5 -5
  35. package/src/llama.cpp/src/llama-cparams.h +1 -0
  36. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  37. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  38. package/src/llama.cpp/src/llama-hparams.h +6 -0
  39. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  40. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
  41. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  42. package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
  43. package/src/llama.cpp/src/llama-model.cpp +320 -13171
  44. package/src/llama.cpp/src/llama-model.h +8 -0
  45. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  46. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  47. package/src/llama.cpp/src/llama-vocab.h +1 -0
  48. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  49. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  50. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  51. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  52. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  53. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  54. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  55. package/src/llama.cpp/src/models/bert.cpp +176 -0
  56. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  57. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  58. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  59. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  60. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  61. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  62. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  63. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  64. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  65. package/src/llama.cpp/src/models/deci.cpp +135 -0
  66. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  67. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  68. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  69. package/src/llama.cpp/src/models/dream.cpp +105 -0
  70. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  71. package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
  72. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  73. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  74. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  75. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  76. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  77. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  78. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  79. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  80. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  81. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  82. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  83. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  84. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  85. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  86. package/src/llama.cpp/src/models/granite.cpp +211 -0
  87. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  88. package/src/llama.cpp/src/models/grok.cpp +159 -0
  89. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  90. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  91. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  92. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  93. package/src/llama.cpp/src/models/jais.cpp +86 -0
  94. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  95. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  96. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  97. package/src/llama.cpp/src/models/llada.cpp +99 -0
  98. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  99. package/src/llama.cpp/src/models/llama.cpp +155 -0
  100. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  101. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  102. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  103. package/src/llama.cpp/src/models/models.h +481 -0
  104. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  105. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  106. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  107. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  108. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  109. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  110. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  111. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  112. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  113. package/src/llama.cpp/src/models/orion.cpp +123 -0
  114. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  115. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  116. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  117. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  118. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  119. package/src/llama.cpp/src/models/plm.cpp +168 -0
  120. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  121. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  122. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  123. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  124. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  125. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  126. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  127. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  128. package/src/llama.cpp/src/models/refact.cpp +94 -0
  129. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  130. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  131. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  132. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  133. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  134. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  135. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  136. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  137. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  138. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  139. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  140. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  141. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  142. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  143. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/src/CMakeLists.txt
@@ -35,6 +35,101 @@ add_library(llama
  unicode-data.cpp
  unicode.cpp
  unicode.h
+ models/apertus.cpp
+ models/arcee.cpp
+ models/arctic.cpp
+ models/arwkv7.cpp
+ models/baichuan.cpp
+ models/bailingmoe.cpp
+ models/bailingmoe2.cpp
+ models/bert.cpp
+ models/bitnet.cpp
+ models/bloom.cpp
+ models/chameleon.cpp
+ models/chatglm.cpp
+ models/codeshell.cpp
+ models/cogvlm.cpp
+ models/cohere2-iswa.cpp
+ models/command-r.cpp
+ models/dbrx.cpp
+ models/deci.cpp
+ models/deepseek.cpp
+ models/deepseek2.cpp
+ models/dots1.cpp
+ models/dream.cpp
+ models/ernie4-5-moe.cpp
+ models/ernie4-5.cpp
+ models/exaone.cpp
+ models/exaone4.cpp
+ models/falcon-h1.cpp
+ models/falcon.cpp
+ models/gemma-embedding.cpp
+ models/gemma.cpp
+ models/gemma2-iswa.cpp
+ models/gemma3-iswa.cpp
+ models/gemma3n-iswa.cpp
+ models/glm4-moe.cpp
+ models/glm4.cpp
+ models/gpt2.cpp
+ models/gptneox.cpp
+ models/granite-hybrid.cpp
+ models/granite.cpp
+ models/grok.cpp
+ models/grovemoe.cpp
+ models/hunyuan-dense.cpp
+ models/hunyuan-moe.cpp
+ models/internlm2.cpp
+ models/jais.cpp
+ models/jamba.cpp
+ models/lfm2.cpp
+ models/llada-moe.cpp
+ models/llada.cpp
+ models/llama-iswa.cpp
+ models/llama.cpp
+ models/mamba.cpp
+ models/minicpm3.cpp
+ models/minimax-m2.cpp
+ models/mpt.cpp
+ models/nemotron-h.cpp
+ models/nemotron.cpp
+ models/neo-bert.cpp
+ models/olmo.cpp
+ models/olmo2.cpp
+ models/olmoe.cpp
+ models/openai-moe-iswa.cpp
+ models/openelm.cpp
+ models/orion.cpp
+ models/pangu-embedded.cpp
+ models/phi2.cpp
+ models/phi3.cpp
+ models/plamo.cpp
+ models/plamo2.cpp
+ models/plm.cpp
+ models/qwen.cpp
+ models/qwen2.cpp
+ models/qwen2moe.cpp
+ models/qwen2vl.cpp
+ models/qwen3.cpp
+ models/qwen3vl.cpp
+ models/qwen3vl-moe.cpp
+ models/qwen3moe.cpp
+ models/refact.cpp
+ models/rwkv6-base.cpp
+ models/rwkv6.cpp
+ models/rwkv6qwen2.cpp
+ models/rwkv7-base.cpp
+ models/rwkv7.cpp
+ models/seed-oss.cpp
+ models/smallthinker.cpp
+ models/smollm3.cpp
+ models/stablelm.cpp
+ models/starcoder.cpp
+ models/starcoder2.cpp
+ models/t5-dec.cpp
+ models/t5-enc.cpp
+ models/wavtokenizer-dec.cpp
+ models/xverse.cpp
+ models/graph-context-mamba.cpp
  )

  target_include_directories(llama PRIVATE .)
package/src/llama.cpp/src/llama-arch.cpp
@@ -32,6 +32,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_QWEN2VL, "qwen2vl" },
  { LLM_ARCH_QWEN3, "qwen3" },
  { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+ { LLM_ARCH_QWEN3VL, "qwen3vl" },
+ { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
  { LLM_ARCH_PHI2, "phi2" },
  { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PHIMOE, "phimoe" },
@@ -103,6 +105,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_SEED_OSS, "seed_oss" },
  { LLM_ARCH_GROVEMOE, "grovemoe" },
  { LLM_ARCH_APERTUS, "apertus" },
+ { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
+ { LLM_ARCH_COGVLM, "cogvlm" },
+ { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -145,6 +150,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
  { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
  { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
+ { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
  { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
  { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -779,6 +785,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_QWEN3VL,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_QWEN3VLMOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_PHI2,
  {
@@ -2312,6 +2357,64 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
  },
  },
+ {
+ LLM_ARCH_MINIMAX_M2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ },
+ },
+ {
+ LLM_ARCH_PANGU_EMBED,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_COGVLM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
+ { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
+ { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
+ { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
+ { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -2488,6 +2591,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
  {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  // NextN/MTP tensors are currently ignored (reserved for future MTP support)
  // These tensors only exist in the last layer(s) and are treated as output tensors
  {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
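Note: the LLM_TENSOR_NAMES entries added above are printf-style patterns; the %d placeholder is expanded with the block (layer) index when tensor names are resolved at load time. A minimal C++ sketch of that expansion (the helper below is illustrative, not part of the library API):

    #include <cstdio>
    #include <string>

    // Expand a tensor-name pattern such as "blk.%d.attn_qkv" for a given layer
    // index, the way the per-architecture tables above are consumed.
    static std::string expand_tensor_name(const char * pattern, int layer) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), pattern, layer);
        return buf;
    }

    int main() {
        // For example, the new LLM_ARCH_COGVLM rows resolve to names like:
        std::printf("%s\n", expand_tensor_name("blk.%d.attn_qkv", 0).c_str());     // blk.0.attn_qkv
        std::printf("%s\n", expand_tensor_name("blk.%d.vis_attn_qkv", 0).c_str()); // blk.0.vis_attn_qkv
        return 0;
    }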
package/src/llama.cpp/src/llama-arch.h
@@ -36,6 +36,8 @@ enum llm_arch {
  LLM_ARCH_QWEN2VL,
  LLM_ARCH_QWEN3,
  LLM_ARCH_QWEN3MOE,
+ LLM_ARCH_QWEN3VL,
+ LLM_ARCH_QWEN3VLMOE,
  LLM_ARCH_PHI2,
  LLM_ARCH_PHI3,
  LLM_ARCH_PHIMOE,
@@ -107,6 +109,9 @@ enum llm_arch {
  LLM_ARCH_SEED_OSS,
  LLM_ARCH_GROVEMOE,
  LLM_ARCH_APERTUS,
+ LLM_ARCH_MINIMAX_M2,
+ LLM_ARCH_COGVLM,
+ LLM_ARCH_PANGU_EMBED,
  LLM_ARCH_UNKNOWN,
  };

@@ -149,6 +154,7 @@ enum llm_kv {
  LLM_KV_EXPERTS_PER_GROUP,
  LLM_KV_MOE_EVERY_N_LAYERS,
  LLM_KV_NEXTN_PREDICT_LAYERS,
+ LLM_KV_NUM_DEEPSTACK_LAYERS,
  LLM_KV_POOLING_TYPE,
  LLM_KV_LOGIT_SCALE,
  LLM_KV_DECODER_START_TOKEN_ID,
@@ -455,6 +461,11 @@ enum llm_tensor {
  LLM_TENSOR_SHORTCONV_CONV,
  LLM_TENSOR_SHORTCONV_INPROJ,
  LLM_TENSOR_SHORTCONV_OUTPROJ,
+ LLM_TENSOR_VISEXP_ATTN_QKV,
+ LLM_TENSOR_VISEXP_ATTN_OUT,
+ LLM_TENSOR_VISEXP_FFN_GATE,
+ LLM_TENSOR_VISEXP_FFN_DOWN,
+ LLM_TENSOR_VISEXP_FFN_UP,
  LLM_TENSOR_NEXTN_EH_PROJ,
  LLM_TENSOR_NEXTN_EMBED_TOKENS,
  LLM_TENSOR_NEXTN_ENORM,
package/src/llama.cpp/src/llama-batch.cpp
@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
  /*.n_seq_tokens =*/ (uint32_t) 1,
  /*.n_seqs =*/ (uint32_t) batch.n_tokens,
  /*.n_seqs_unq =*/ (uint32_t) this->seq_id_unq.size(),
+ /*.n_pos =*/ n_pos_per_embd,
  /*.token =*/ batch.token,
  /*.embd =*/ batch.embd,
  /*.pos =*/ batch.pos,
@@ -251,46 +252,72 @@ bool llama_batch_allocr::init(
  // consistency checks
  //

- for (uint32_t s = 0; s < n_seq_max; ++s) {
- if (seq_pos[s].empty()) {
- continue;
+ if (n_pos_per_embd > 1) {
+ // M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ if (seq_pos[s].empty()) {
+ continue;
+ }
+
+ const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+ if (batch.token) {
+ if (p0 >= 0 && p0 >= seq_pos_min(s)) {
+ LLAMA_LOG_ERROR(
+ "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+ " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+ " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+ " for M-RoPE, it is required that the position satisfies: X < Y\n",
+ __func__, s, s, p0, s, seq_pos_min(s));
+
+ return false;
+ }
+ } else {
+ // embedding inputs can have overlapping positions
+ if (p0 >= 0 && p0 > seq_pos_min(s)) {
+ LLAMA_LOG_ERROR(
+ "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+ " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+ " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+ " for M-RoPE, it is required that the position satisfies: X <= Y\n",
+ __func__, s, s, p0, s, seq_pos_min(s));
+
+ return false;
+ }
+ }
  }
+ } else {
+ for (uint32_t s = 0; s < n_seq_max; ++s) {
+ if (seq_pos[s].empty()) {
+ continue;
+ }

- const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+ const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;

- if (p0 >= 0) {
- bool ok = true;
+ if (p0 >= 0) {
+ bool ok = true;

- if (batch.token) {
  if (seq_pos_min(s) != p0 + 1) {
  ok = false;
  }
- } else {
- assert(batch.embd);

- // for embeddings (typically used as vision input), we allow them to have repeating positions
- // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
- if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
- ok = false;
+ if (!ok) {
+ LLAMA_LOG_ERROR(
+ "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+ " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+ " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+ " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+ __func__, s, s, p0, s, seq_pos_min(s));
+
+ return false;
  }
  }

- if (!ok) {
- LLAMA_LOG_ERROR(
- "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
- " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
- " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
- " it is required that the sequence positions remain consecutive: Y = X + 1\n",
- __func__, s, s, p0, s, seq_pos_min(s));
-
+ if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+ LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
  return false;
  }
  }
-
- if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
- LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
- return false;
- }
  }

  if (memory) {
@@ -389,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
  /*.n_seq_tokens =*/ n_seq_tokens,
  /*.n_seqs =*/ n_seqs,
  /*.n_seqs_unq =*/ n_seqs,
+ /*.n_pos =*/ n_pos_per_embd,

  /*.token =*/ udata->token.data(),
  /*.embd =*/ nullptr,
@@ -655,10 +683,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u

  auto udata = std::make_shared<llama_ubatch::data_t>();

- const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
-
  const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
- const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;
+ const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;

  udata->token .resize(n_tokens);
  udata->embd .resize(n_embd_all);
@@ -680,8 +706,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
  memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
  }

- for (int j = 0; j < n_pos_cur; ++j) {
- udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
+ for (size_t j = 0; j < (size_t)n_pos_per_embd; ++j) {
+ // if we are using M-RoPE
+ // if the current batch is text, we need to broadcast the same position across all RoPE sections
+ // otherwise, the input batch is image embeddings, we copy the positions as-is
+ // if we are not using M-RoPE, there is only one position per token (this loop runs only once)
+ size_t src_off = batch.token ? 0 : j*batch.n_tokens;
+ udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
  }

  udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
@@ -710,6 +741,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
  /*.n_seq_tokens =*/ n_tokens/n_seqs,
  /*.n_seqs =*/ n_seqs,
  /*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),
+ /*.n_pos =*/ n_pos_per_embd,

  /*.token =*/ batch.token ? udata->token.data() : nullptr,
  /*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
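Note: the rewritten position copy in ubatch_add() now always loops over n_pos_per_embd and, for text tokens, broadcasts the single input position into every RoPE section, whereas embedding (image) inputs already carry one position row per section. A standalone sketch of that indexing, assuming the same [n_pos][n_tokens] layout (the function and parameter names here are illustrative, not the internal API):

    #include <cstdint>
    #include <vector>

    using llama_pos = int32_t;

    // dst is laid out as [n_pos][n_tokens]: row j holds RoPE section j for all
    // selected tokens. src follows the input llama_batch: a single row for text,
    // n_pos rows of n_src_tokens entries for image embeddings.
    static void copy_positions(std::vector<llama_pos> & dst, const llama_pos * src,
                               const std::vector<int32_t> & idxs,
                               int64_t n_src_tokens, uint32_t n_pos, bool is_text) {
        const int64_t n_tokens = (int64_t) idxs.size();
        dst.resize(n_tokens * n_pos);
        for (int64_t i = 0; i < n_tokens; ++i) {
            for (uint32_t j = 0; j < n_pos; ++j) {
                // text: broadcast the same position across all sections (offset 0)
                // embeddings: take section j from its own row in the source batch
                const int64_t src_off = is_text ? 0 : (int64_t) j * n_src_tokens;
                dst[j * n_tokens + i] = src[src_off + idxs[i]];
            }
        }
    }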
package/src/llama.cpp/src/llama-batch.h
@@ -17,6 +17,16 @@ struct llama_ubatch {
  return b_equal_seqs != 0;
  }

+ // typical for M-RoPE cases:
+ // 0 - sequantial position of the tokens/embeddings in the sequence
+ // 1 - y position in the image
+ // 2 - x position in the image
+ // 3 - other
+ bool is_pos_2d() const {
+ // TODO @ngxson : we may need to check for model arch when more models use >1 positions
+ return n_pos >= 3;
+ }
+
  uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
  // otherwise address sanitizer complains
  // TODO: whole_seqs for embeddings?
@@ -25,6 +35,7 @@ struct llama_ubatch {
  uint32_t n_seq_tokens; // tokens per sequence set
  uint32_t n_seqs; // sequence sets in the ubatch
  uint32_t n_seqs_unq; // unique sequence ids in the ubatch
+ uint32_t n_pos; // number of position inputs for each token/embedding

  // seq_id_unq: unique sequence ids in the ubatch
  // seq_idx: indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
@@ -33,7 +44,7 @@ struct llama_ubatch {
  // // size | idx | val
  llama_token * token; // [n_tokens] | i | id, token
  float * embd; // [n_embd, n_tokens] | i | embd
- llama_pos * pos; // [n_tokens] | i | pos
+ llama_pos * pos; // [n_tokens*n_pos] | i | pos
  int32_t * n_seq_id; // [n_tokens] | i | -
  llama_seq_id ** seq_id; // [n_tokens] | s | s0, s1, seq_id
  llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
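Note: with M-RoPE a ubatch therefore carries n_pos position rows per token (sequential index, image y, image x, plus a spare section), stored contiguously as pos[j*n_tokens + i]. A small sketch of what that buffer could look like for a 2x2 grid of image-patch embeddings (the concrete values are illustrative only):

    #include <cstdint>
    #include <vector>

    using llama_pos = int32_t;

    int main() {
        const uint32_t n_pos    = 4; // sections: sequential, y, x, other
        const uint32_t n_tokens = 4; // a 2x2 grid of image-patch embeddings

        // pos is [n_tokens*n_pos]; row j holds section j for every token
        std::vector<llama_pos> pos(n_tokens * n_pos);
        for (uint32_t i = 0; i < n_tokens; ++i) {
            pos[0*n_tokens + i] = 10;    // assumed sequence position of the image block
            pos[1*n_tokens + i] = i / 2; // y: 0,0,1,1
            pos[2*n_tokens + i] = i % 2; // x: 0,1,0,1
            pos[3*n_tokens + i] = 0;     // unused section
        }
        return 0; // is_pos_2d() would return true here, since n_pos >= 3
    }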
package/src/llama.cpp/src/llama-chat.cpp
@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
  { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
  { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
  { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
+ { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
  };

  llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -213,6 +214,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
  return LLM_CHAT_TEMPLATE_SEED_OSS;
  } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
  return LLM_CHAT_TEMPLATE_GROK_2;
+ } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
+ return LLM_CHAT_TEMPLATE_PANGU_EMBED;
  }
  return LLM_CHAT_TEMPLATE_UNKNOWN;
  }
@@ -813,6 +816,35 @@ int32_t llm_chat_apply_template(
  if (add_ass) {
  ss << "Assistant:";
  }
+ }else if (tmpl == LLM_CHAT_TEMPLATE_PANGU_EMBED) {
+ // [unused9]系统:xxx[unused10]
+ // [unused9]用户:xxx[unused10]
+ // [unused9]助手:xxx[unused10]
+ // ...
+ for (size_t i = 0; i < chat.size(); ++i) {
+ const auto & msg = chat[i];
+ const std::string & role = msg->role;
+ const std::string & content = msg->content;
+
+ if (i == 0 && role != "system") {
+ ss << "[unused9]系统:[unused10]";
+ }
+
+ if (role == "system") {
+ ss << "[unused9]系统:" << content << "[unused10]";
+ } else if (role == "user") {
+ ss << "[unused9]用户:" << content << "[unused10]";
+ } else if (role == "assistant") {
+ ss << "[unused9]助手:" << content << "[unused10]";
+ } else if (role == "tool") {
+ ss << "[unused9]工具:" << content << "[unused10]";
+ } else if (role == "function") {
+ ss << "[unused9]方法:" << content << "[unused10]";
+ }
+ }
+ if (add_ass) {
+ ss << "[unused9]助手:";
+ }
  } else {
  // template not supported
  return -1;
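Note: the new PANGU_EMBED branch renders every turn as [unused9]<role>:<content>[unused10], with the role written in Chinese (系统 = system, 用户 = user, 助手 = assistant, 工具 = tool, 方法 = function), and prepends an empty system block when the conversation does not start with a system message. A hedged usage sketch through the public llama_chat_apply_template API, assuming its current llama_chat_apply_template(tmpl, chat, n_msg, add_ass, buf, length) signature; "pangu-embedded" is the name registered in the template map above:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        const llama_chat_message chat[] = {
            { "user",      "Hello"                    },
            { "assistant", "Hi, how can I help?"      },
            { "user",      "Tell me about llama.cpp"  },
        };
        const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

        std::vector<char> buf(4096);
        // add_ass = true appends the generation prompt "[unused9]助手:"
        const int32_t n = llama_chat_apply_template("pangu-embedded", chat, n_msg,
                                                    /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            // Expected shape: [unused9]系统:[unused10][unused9]用户:Hello[unused10]
            //                 [unused9]助手:Hi, how can I help?[unused10]...[unused9]助手:
            std::printf("%.*s\n", n, buf.data());
        }
        return 0;
    }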
package/src/llama.cpp/src/llama-chat.h
@@ -53,6 +53,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_KIMI_K2,
  LLM_CHAT_TEMPLATE_SEED_OSS,
  LLM_CHAT_TEMPLATE_GROK_2,
+ LLM_CHAT_TEMPLATE_PANGU_EMBED,
  LLM_CHAT_TEMPLATE_UNKNOWN,
  };

package/src/llama.cpp/src/llama-context.cpp
@@ -21,6 +21,8 @@ llama_context::llama_context(
  llama_context_params params) :
  model(model),
  balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+ // TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
+ // may need to be backend-dependent
  LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);

  t_start_us = model.t_start_us;
@@ -112,11 +114,28 @@ llama_context::llama_context(
  }
  }

- const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+ // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+
+ if (cparams.kv_unified) {
+ cparams.n_ctx_seq = cparams.n_ctx;
+ } else {
+ cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+ cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
+
+ if (cparams.n_ctx_seq == 0) {
+ throw std::runtime_error("n_ctx_seq == 0");
+ }
+
+ if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+ cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+ LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+ }
+ }

  LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
- LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
+ LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
  LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
@@ -125,14 +144,14 @@ llama_context::llama_context(
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

- if (n_ctx_per_seq < hparams.n_ctx_train) {
- LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
- __func__, n_ctx_per_seq, hparams.n_ctx_train);
+ if (cparams.n_ctx_seq < hparams.n_ctx_train) {
+ LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+ __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
  }

- if (n_ctx_per_seq > hparams.n_ctx_train) {
- LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
- __func__, n_ctx_per_seq, hparams.n_ctx_train);
+ if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+ LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+ __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
  }

  if (!hparams.vocab_only) {
@@ -453,8 +472,8 @@ uint32_t llama_context::n_ctx() const {
  return cparams.n_ctx;
  }

- uint32_t llama_context::n_ctx_per_seq() const {
- return cparams.n_ctx / cparams.n_seq_max;
+ uint32_t llama_context::n_ctx_seq() const {
+ return cparams.n_ctx_seq;
  }

  uint32_t llama_context::n_batch() const {
@@ -808,7 +827,7 @@ int llama_context::encode(const llama_batch & batch_inp) {

  const auto & hparams = model.hparams;

- const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd = hparams.n_embd_inp();
  const int64_t n_vocab = model.vocab.n_tokens();

  // note: during encode, we always pass the full sequence starting from pos = 0
@@ -977,7 +996,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
  const auto & hparams = model.hparams;

  const int64_t n_vocab = vocab.n_tokens();
- const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd = hparams.n_embd_inp();

  // when computing embeddings, all tokens are output
  const bool output_all = cparams.embeddings;
@@ -2135,7 +2154,7 @@ void llama_context::opt_epoch_iter(
  batch.logits [pos_batch] = true;
  }

- if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
+ if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
  LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
  return;
  }
@@ -2383,6 +2402,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
  return ctx->n_ctx();
  }

+ uint32_t llama_n_ctx_seq(const llama_context * ctx) {
+ return ctx->n_ctx_seq();
+ }
+
  uint32_t llama_n_batch(const llama_context * ctx) {
  return ctx->n_batch();
  }
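Note: the constructor now pads n_ctx up to a multiple of 256 and derives a per-sequence context size n_ctx_seq (also padded), which is exposed through the new llama_n_ctx_seq() getter added above. A small arithmetic sketch of the non-unified-KV path, using a local round-up helper in place of GGML_PAD:

    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>

    // round x up to a multiple of n (what GGML_PAD is used for in the hunk above)
    static uint32_t pad_to(uint32_t x, uint32_t n) {
        return ((x + n - 1) / n) * n;
    }

    int main() {
        uint32_t n_ctx     = 4000; // requested context size
        uint32_t n_seq_max = 3;    // parallel sequences, kv_unified == false

        n_ctx = pad_to(n_ctx, 256);                           // -> 4096
        uint32_t n_ctx_seq = pad_to(n_ctx / n_seq_max, 256);  // 1365 -> 1536
        if (n_ctx_seq == 0) {
            throw std::runtime_error("n_ctx_seq == 0");
        }
        if (n_ctx != n_ctx_seq * n_seq_max) {
            n_ctx = n_ctx_seq * n_seq_max;                    // -> 4608 (warned about in the real code)
        }
        std::printf("n_ctx = %u, n_ctx_seq = %u\n", n_ctx, n_ctx_seq);
        return 0;
    }

With these numbers the effective n_ctx reported by llama_n_ctx() grows from 4000 to 4608 so that each of the 3 sequences gets a 1536-token slice, which llama_n_ctx_seq() now reports directly.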
package/src/llama.cpp/src/llama-context.h
@@ -43,11 +43,11 @@ struct llama_context {

  ggml_backend_sched_t get_sched() const;

- uint32_t n_ctx() const;
- uint32_t n_ctx_per_seq() const;
- uint32_t n_batch() const;
- uint32_t n_ubatch() const;
- uint32_t n_seq_max() const;
+ uint32_t n_ctx() const;
+ uint32_t n_ctx_seq() const;
+ uint32_t n_batch() const;
+ uint32_t n_ubatch() const;
+ uint32_t n_seq_max() const;

  uint32_t n_threads() const;
  uint32_t n_threads_batch() const;
package/src/llama.cpp/src/llama-cparams.h
@@ -8,6 +8,7 @@

  struct llama_cparams {
  uint32_t n_ctx; // context size used during inference
+ uint32_t n_ctx_seq; // context for a single sequence
  uint32_t n_batch;
  uint32_t n_ubatch;
  uint32_t n_seq_max;