cui-llama.rn 1.4.6 → 1.5.0

This diff reflects the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (80)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +52 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1779
  14. package/cpp/chat.h +9 -1
  15. package/cpp/common.cpp +20 -522
  16. package/cpp/common.h +13 -36
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-common.h +12 -6
  19. package/cpp/ggml-cpu-aarch64.cpp +1557 -80
  20. package/cpp/ggml-cpu-impl.h +2 -21
  21. package/cpp/ggml-cpu-quants.c +904 -405
  22. package/cpp/ggml-cpu.c +909 -13237
  23. package/cpp/ggml-impl.h +50 -23
  24. package/cpp/ggml-metal-impl.h +77 -3
  25. package/cpp/ggml-metal.m +794 -580
  26. package/cpp/ggml.c +92 -3
  27. package/cpp/ggml.h +29 -5
  28. package/cpp/gguf.cpp +1 -0
  29. package/cpp/llama-adapter.cpp +55 -20
  30. package/cpp/llama-adapter.h +11 -9
  31. package/cpp/llama-arch.cpp +217 -16
  32. package/cpp/llama-arch.h +25 -0
  33. package/cpp/llama-batch.h +2 -2
  34. package/cpp/llama-chat.cpp +54 -2
  35. package/cpp/llama-chat.h +3 -0
  36. package/cpp/llama-context.cpp +2294 -1238
  37. package/cpp/llama-context.h +214 -77
  38. package/cpp/llama-cparams.h +1 -0
  39. package/cpp/llama-graph.cpp +1695 -0
  40. package/cpp/llama-graph.h +592 -0
  41. package/cpp/llama-hparams.cpp +8 -0
  42. package/cpp/llama-hparams.h +17 -0
  43. package/cpp/llama-io.cpp +15 -0
  44. package/cpp/llama-io.h +35 -0
  45. package/cpp/llama-kv-cache.cpp +965 -303
  46. package/cpp/llama-kv-cache.h +145 -151
  47. package/cpp/llama-memory.cpp +1 -0
  48. package/cpp/llama-memory.h +21 -0
  49. package/cpp/llama-mmap.cpp +1 -1
  50. package/cpp/llama-model-loader.cpp +10 -5
  51. package/cpp/llama-model-loader.h +5 -3
  52. package/cpp/llama-model.cpp +9194 -201
  53. package/cpp/llama-model.h +40 -1
  54. package/cpp/llama-sampling.cpp +5 -0
  55. package/cpp/llama-vocab.cpp +36 -5
  56. package/cpp/llama.cpp +51 -9984
  57. package/cpp/llama.h +102 -22
  58. package/cpp/log.cpp +34 -0
  59. package/cpp/minja/chat-template.hpp +15 -7
  60. package/cpp/minja/minja.hpp +120 -94
  61. package/cpp/ops.cpp +8723 -0
  62. package/cpp/ops.h +128 -0
  63. package/cpp/rn-llama.cpp +44 -53
  64. package/cpp/rn-llama.h +2 -12
  65. package/cpp/sampling.cpp +3 -0
  66. package/cpp/sgemm.cpp +533 -88
  67. package/cpp/simd-mappings.h +888 -0
  68. package/cpp/speculative.cpp +4 -4
  69. package/cpp/unary-ops.cpp +186 -0
  70. package/cpp/unary-ops.h +28 -0
  71. package/cpp/vec.cpp +258 -0
  72. package/cpp/vec.h +802 -0
  73. package/ios/CMakeLists.txt +5 -2
  74. package/ios/RNLlama.mm +2 -2
  75. package/ios/RNLlamaContext.mm +40 -24
  76. package/package.json +1 -1
  77. package/src/NativeRNLlama.ts +6 -4
  78. package/src/index.ts +3 -1
  79. package/cpp/chat-template.hpp +0 -529
  80. package/cpp/minja.hpp +0 -2915
package/cpp/llama-arch.cpp CHANGED
@@ -6,6 +6,7 @@
 
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_LLAMA4, "llama4" },
  { LLM_ARCH_DECI, "deci" },
  { LLM_ARCH_FALCON, "falcon" },
  { LLM_ARCH_GROK, "grok" },
@@ -25,6 +26,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_QWEN2, "qwen2" },
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_QWEN2VL, "qwen2vl" },
+ { LLM_ARCH_QWEN3, "qwen3" },
+ { LLM_ARCH_QWEN3MOE, "qwen3moe" },
  { LLM_ARCH_PHI2, "phi2" },
  { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PHIMOE, "phimoe" },
@@ -59,10 +62,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_EXAONE, "exaone" },
  { LLM_ARCH_RWKV6, "rwkv6" },
  { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
+ { LLM_ARCH_RWKV7, "rwkv7" },
+ { LLM_ARCH_ARWKV7, "arwkv7" },
  { LLM_ARCH_GRANITE, "granite" },
  { LLM_ARCH_GRANITE_MOE, "granitemoe" },
  { LLM_ARCH_CHAMELEON, "chameleon" },
  { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+ { LLM_ARCH_PLM, "plm" },
+ { LLM_ARCH_BAILINGMOE, "bailingmoe" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
@@ -71,6 +78,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+ { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
  { LLM_KV_GENERAL_NAME, "general.name" },
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
  { LLM_KV_GENERAL_VERSION, "general.version" },
@@ -109,23 +117,28 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
  { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
  { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
+ { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
 
- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_DECAY_LORA_RANK, "%s.attention.decay_lora_rank" },
+ { LLM_KV_ATTENTION_ICLR_LORA_RANK, "%s.attention.iclr_lora_rank" },
+ { LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, "%s.attention.value_residual_mix_lora_rank" },
+ { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -224,6 +237,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_LLAMA4,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_DECI,
  {
@@ -555,6 +597,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
  },
  },
+ {
+ LLM_ARCH_QWEN3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_QWEN3MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_PHI2,
  {
@@ -772,6 +853,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1036,6 +1118,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
  },
  },
+ {
+ LLM_ARCH_PLM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_CHATGLM,
  {
@@ -1238,6 +1336,74 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_RWKV7,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+ { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" },
+ { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+ { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+ { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" },
+ { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" },
+ { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" },
+ { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" },
+ { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" },
+ { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" },
+ { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" },
+ { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" },
+ { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" },
+ { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" },
+ { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" },
+ { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+ { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+ { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+ { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+ { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+ { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+ { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
+ { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
+ { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
+ },
+ },
+ {
+ LLM_ARCH_ARWKV7,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" },
+ { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+ { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+ { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" },
+ { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" },
+ { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" },
+ { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" },
+ { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" },
+ { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" },
+ { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" },
+ { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" },
+ { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" },
+ { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" },
+ { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" },
+ { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+ { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+ { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+ { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+ { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+ { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_GRANITE,
  {
@@ -1317,6 +1483,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
  },
  },
+ {
+ LLM_ARCH_BAILINGMOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1397,6 +1586,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_A2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_V1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_V2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_G1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_TIME_MIX_G2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_DECAY_W1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_DECAY_W2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
  {LLM_TENSOR_TIME_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
@@ -1415,6 +1610,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_CHANNEL_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_K_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_K_A, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_TIME_MIX_R_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_TIME_MIX_LERP_W, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
@@ -1422,6 +1620,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_W0, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_A0, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_V0, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_RWKV_WKV6}},
  {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
package/cpp/llama-arch.h CHANGED
@@ -10,6 +10,7 @@
 
  enum llm_arch {
  LLM_ARCH_LLAMA,
+ LLM_ARCH_LLAMA4,
  LLM_ARCH_DECI,
  LLM_ARCH_FALCON,
  LLM_ARCH_BAICHUAN,
@@ -29,6 +30,8 @@ enum llm_arch {
  LLM_ARCH_QWEN2,
  LLM_ARCH_QWEN2MOE,
  LLM_ARCH_QWEN2VL,
+ LLM_ARCH_QWEN3,
+ LLM_ARCH_QWEN3MOE,
  LLM_ARCH_PHI2,
  LLM_ARCH_PHI3,
  LLM_ARCH_PHIMOE,
@@ -63,10 +66,14 @@ enum llm_arch {
  LLM_ARCH_EXAONE,
  LLM_ARCH_RWKV6,
  LLM_ARCH_RWKV6QWEN2,
+ LLM_ARCH_RWKV7,
+ LLM_ARCH_ARWKV7,
  LLM_ARCH_GRANITE,
  LLM_ARCH_GRANITE_MOE,
  LLM_ARCH_CHAMELEON,
  LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_PLM,
+ LLM_ARCH_BAILINGMOE,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -75,6 +82,7 @@ enum llm_kv {
  LLM_KV_GENERAL_ARCHITECTURE,
  LLM_KV_GENERAL_QUANTIZATION_VERSION,
  LLM_KV_GENERAL_ALIGNMENT,
+ LLM_KV_GENERAL_FILE_TYPE,
  LLM_KV_GENERAL_NAME,
  LLM_KV_GENERAL_AUTHOR,
  LLM_KV_GENERAL_VERSION,
@@ -113,6 +121,7 @@ enum llm_kv {
  LLM_KV_RESIDUAL_SCALE,
  LLM_KV_EMBEDDING_SCALE,
  LLM_KV_TOKEN_SHIFT_COUNT,
+ LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
 
  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -127,6 +136,10 @@ enum llm_kv {
  LLM_KV_ATTENTION_CAUSAL,
  LLM_KV_ATTENTION_Q_LORA_RANK,
  LLM_KV_ATTENTION_KV_LORA_RANK,
+ LLM_KV_ATTENTION_DECAY_LORA_RANK,
+ LLM_KV_ATTENTION_ICLR_LORA_RANK,
+ LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
+ LLM_KV_ATTENTION_GATE_LORA_RANK,
  LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
  LLM_KV_ATTENTION_SLIDING_WINDOW,
  LLM_KV_ATTENTION_SCALE,
@@ -250,8 +263,20 @@ enum llm_tensor {
  LLM_TENSOR_SSM_A,
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_TIME_MIX_W0,
  LLM_TENSOR_TIME_MIX_W1,
  LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_A0,
+ LLM_TENSOR_TIME_MIX_A1,
+ LLM_TENSOR_TIME_MIX_A2,
+ LLM_TENSOR_TIME_MIX_V0,
+ LLM_TENSOR_TIME_MIX_V1,
+ LLM_TENSOR_TIME_MIX_V2,
+ LLM_TENSOR_TIME_MIX_G1,
+ LLM_TENSOR_TIME_MIX_G2,
+ LLM_TENSOR_TIME_MIX_K_K,
+ LLM_TENSOR_TIME_MIX_K_A,
+ LLM_TENSOR_TIME_MIX_R_K,
  LLM_TENSOR_TIME_MIX_LERP_X,
  LLM_TENSOR_TIME_MIX_LERP_W,
  LLM_TENSOR_TIME_MIX_LERP_K,
package/cpp/llama-batch.h CHANGED
@@ -42,9 +42,9 @@ struct llama_sbatch {
  bool logits_all; // TODO: remove once lctx.logits_all is removed too
 
  // sorted indices into the batch
- std::vector<size_t> ids;
+ std::vector<int64_t> ids;
  // batch indices of the output
- std::vector<size_t> out_ids;
+ std::vector<int64_t> out_ids;
  std::vector<llama_sbatch_seq> seq;
 
  const llama_batch * batch = nullptr;
package/cpp/llama-chat.cpp CHANGED
@@ -59,6 +59,9 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
  { "granite", LLM_CHAT_TEMPLATE_GRANITE },
  { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
  { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
+ { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
+ { "bailing", LLM_CHAT_TEMPLATE_BAILING },
+ { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
  };
 
  llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -168,6 +171,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
  return LLM_CHAT_TEMPLATE_GIGACHAT;
  } else if (tmpl_contains("<|role_start|>")) {
  return LLM_CHAT_TEMPLATE_MEGREZ;
+ } else if (tmpl_contains(" Ассистент:")) {
+ return LLM_CHAT_TEMPLATE_YANDEX;
+ } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
+ return LLM_CHAT_TEMPLATE_BAILING;
+ } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
+ return LLM_CHAT_TEMPLATE_LLAMA4;
  }
  return LLM_CHAT_TEMPLATE_UNKNOWN;
  }
@@ -567,7 +576,51 @@ int32_t llm_chat_apply_template(
  if (add_ass) {
  ss << "<|role_start|>assistant<|role_end|>";
  }
- } else {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
+ // Yandex template ("\n\n" is defined as EOT token)
+
+ ss << "<s>";
+
+ for (size_t i = 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (role == "user") {
+ ss << " Пользователь: " << chat[i]->content << "\n\n";
+ } else if (role == "assistant") {
+ ss << " Ассистент: " << chat[i]->content << "\n\n";
+ }
+ }
+
+ // Add generation prompt if needed
+ if (add_ass) {
+ ss << " Ассистент:[SEP]";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
+ // Bailing (Ling) template
+ for (auto message : chat) {
+ std::string role(message->role);
+
+ if (role == "user") {
+ role = "HUMAN";
+ } else {
+ std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+ }
+
+ ss << "<role>" << role << "</role>" << message->content;
+ }
+
+ if (add_ass) {
+ ss << "<role>ASSISTANT</role>";
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
+ // Llama 4
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
+ }
+ if (add_ass) {
+ ss << "<|header_start|>assistant<|header_end|>\n\n";
+ }
+ } else {
  // template not supported
  return -1;
  }
@@ -585,4 +638,3 @@ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
  }
  return (int32_t) LLM_CHAT_TEMPLATES.size();
  }
-
package/cpp/llama-chat.h CHANGED
@@ -38,6 +38,9 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_GRANITE,
  LLM_CHAT_TEMPLATE_GIGACHAT,
  LLM_CHAT_TEMPLATE_MEGREZ,
+ LLM_CHAT_TEMPLATE_YANDEX,
+ LLM_CHAT_TEMPLATE_BAILING,
+ LLM_CHAT_TEMPLATE_LLAMA4,
  LLM_CHAT_TEMPLATE_UNKNOWN,
  };