bigdl-core-cpp 2.7.0b20250630__py3-none-win_amd64.whl → 2.7.0b20250701__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. bigdl/cpp/convert_hf_to_gguf.py +1987 -558
  2. bigdl/cpp/convert_hf_to_gguf_update.py +131 -67
  3. bigdl/cpp/convert_lora_to_gguf.py +3 -3
  4. bigdl/cpp/gguf-py/gguf/constants.py +546 -16
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +57 -6
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +119 -7
  7. bigdl/cpp/gguf-py/gguf/lazy.py +10 -0
  8. bigdl/cpp/gguf-py/gguf/metadata.py +28 -8
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +461 -48
  10. bigdl/cpp/gguf-py/gguf/utility.py +195 -0
  11. bigdl/cpp/gguf-py/gguf/vocab.py +6 -1
  12. bigdl/cpp/libs/llama_cpp/ggml-base.dll +0 -0
  13. bigdl/cpp/libs/llama_cpp/ggml-cpu.dll +0 -0
  14. bigdl/cpp/libs/llama_cpp/ggml-sycl.dll +0 -0
  15. bigdl/cpp/libs/llama_cpp/ggml.dll +0 -0
  16. bigdl/cpp/libs/llama_cpp/llama-batched.exe +0 -0
  17. bigdl/cpp/libs/llama_cpp/llama-bench.exe +0 -0
  18. bigdl/cpp/libs/llama_cpp/llama-cli.exe +0 -0
  19. bigdl/cpp/libs/llama_cpp/llama-embedding.exe +0 -0
  20. bigdl/cpp/libs/llama_cpp/llama-gemma3-cli.exe +0 -0
  21. bigdl/cpp/libs/llama_cpp/llama-gguf.exe +0 -0
  22. bigdl/cpp/libs/llama_cpp/llama-llava-cli.exe +0 -0
  23. bigdl/cpp/libs/llama_cpp/llama-lookup.exe +0 -0
  24. bigdl/cpp/libs/llama_cpp/llama-ls-sycl-device.exe +0 -0
  25. bigdl/cpp/libs/llama_cpp/llama-minicpmv-cli.exe +0 -0
  26. bigdl/cpp/libs/llama_cpp/llama-perplexity.exe +0 -0
  27. bigdl/cpp/libs/llama_cpp/llama-quantize.exe +0 -0
  28. bigdl/cpp/libs/llama_cpp/llama-server.exe +0 -0
  29. bigdl/cpp/libs/llama_cpp/llama-simple.exe +0 -0
  30. bigdl/cpp/libs/llama_cpp/llama-speculative.exe +0 -0
  31. bigdl/cpp/libs/llama_cpp/llama-tokenize.exe +0 -0
  32. bigdl/cpp/libs/llama_cpp/llama.dll +0 -0
  33. bigdl/cpp/libs/ollama/ggml-base.dll +0 -0
  34. bigdl/cpp/libs/ollama/ggml-cpu.dll +0 -0
  35. bigdl/cpp/libs/ollama/ggml-sycl.dll +0 -0
  36. bigdl/cpp/libs/ollama/ggml.dll +0 -0
  37. bigdl/cpp/libs/ollama/llama.dll +0 -0
  38. bigdl/cpp/libs/ollama/llava_shared.dll +0 -0
  39. bigdl/cpp/libs/ollama/mtmd_shared.dll +0 -0
  40. bigdl/cpp/libs/ollama/ollama-lib.exe +0 -0
  41. bigdl/cpp/libs/ollama/ollama.exe +0 -0
  42. {bigdl_core_cpp-2.7.0b20250630.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/METADATA +1 -1
  43. bigdl_core_cpp-2.7.0b20250701.dist-info/RECORD +56 -0
  44. bigdl/cpp/libs/llama_cpp/llava_shared.dll +0 -0
  45. bigdl_core_cpp-2.7.0b20250630.dist-info/RECORD +0 -57
  46. {bigdl_core_cpp-2.7.0b20250630.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-llama-cpp.bat +0 -0
  47. {bigdl_core_cpp-2.7.0b20250630.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-llama-cpp.ps1 +0 -0
  48. {bigdl_core_cpp-2.7.0b20250630.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-ollama.bat +0 -0
  49. {bigdl_core_cpp-2.7.0b20250630.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/WHEEL +0 -0
  50. {bigdl_core_cpp-2.7.0b20250630.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/top_level.txt +0 -0
bigdl/cpp/gguf-py/gguf/tensor_mapping.py
@@ -13,7 +13,7 @@ class TensorNameMap:
  "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
  "transformer.word_embeddings", # falcon
  "word_embeddings", # bloom
- "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2
+ "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
  "tok_embeddings", # llama-pth
  "embeddings.word_embeddings", # bert nomic-bert
  "language_model.embedding.word_embeddings", # persimmon
@@ -27,7 +27,10 @@ class TensorNameMap:
  "embedding.word_embeddings", # chatglm
  "transformer.token_embeddings", # openelm
  "shared", # t5
- "rwkv.embeddings", # rwkv
+ "rwkv.embeddings", # rwkv6
+ "model.embeddings", # rwkv7
+ "model.word_embeddings", # bailingmoe
+ "language_model.model.embed_tokens", # llama4
  ),

  # Token type embeddings
@@ -42,6 +45,9 @@ class TensorNameMap:
  "emb_ln", # nomic-bert
  "transformer.norm", # openelm
  "rwkv.blocks.0.pre_ln", # rwkv
+ "rwkv.blocks.0.pre_ln", # rwkv6
+ "model.pre_ln", # rwkv7
+ "model.layers.0.pre_norm", # rwkv7
  "backbone.norm", # wavtokenizer
  ),
 
@@ -62,6 +68,7 @@ class TensorNameMap:
  "output_layer", # chatglm
  "head", # rwkv
  "head.out", # wavtokenizer
+ "lm_head", # llama4
  ),

  # Output norm
@@ -81,8 +88,10 @@ class TensorNameMap:
  "encoder.final_layernorm", # chatglm
  "transformer.norm", # openelm
  "model.norm", # nemotron
- "rwkv.ln_out", # rwkv
+ "rwkv.ln_out", # rwkv6
+ "model.ln_out", # rwkv7
  "backbone.final_layer_norm", # wavtokenizer
+ "model.norm", # llama4
  ),

  # Rope frequencies
@@ -100,6 +109,13 @@ class TensorNameMap:
 
  MODEL_TENSOR.ROPE_FACTORS_LONG: (),
  MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
+
+ MODEL_TENSOR.CONV1D: (
+ "backbone.embed", # roberta
+ ),
+
+ MODEL_TENSOR.ROPE_FACTORS_LONG: (),
+ MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
  }

  block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -125,14 +141,17 @@ class TensorNameMap:
  "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
  "encoder.layers.{bid}.input_layernorm", # chatglm
  "transformer.layers.{bid}.attn_norm", # openelm
- "rwkv.blocks.{bid}.ln1", # rwkv
+ "rwkv.blocks.{bid}.ln1", # rwkv6
+ "model.layers.{bid}.ln1", # rwkv7
+ "model.layers.{bid}.input_layernorm", # llama4
  ),

  # Attention norm 2
  MODEL_TENSOR.ATTN_NORM_2: (
  "transformer.h.{bid}.ln_attn", # falcon40b
  "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
- "rwkv.blocks.{bid}.ln2", # rwkv
+ "rwkv.blocks.{bid}.ln2", # rwkv6
+ "model.layers.{bid}.ln2", # rwkv7
  ),

  # Attention query-key-value
@@ -148,6 +167,7 @@ class TensorNameMap:
  "h.{bid}.attn.c_attn", # gpt2
  "transformer.h.{bid}.mixer.Wqkv", # phi2
  "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
+ "encoder.layers.{bid}.mixer.Wqkv", # jina
  "model.layers.{bid}.self_attn.qkv_proj", # phi3
  "encoder.layers.{bid}.self_attention.query_key_value", # chatglm
  "transformer.layers.{bid}.attn.qkv_proj", # openelm
@@ -159,11 +179,13 @@ class TensorNameMap:
  "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
  "layers.{bid}.attention.wq", # llama-pth
  "encoder.layer.{bid}.attention.self.query", # bert
+ "transformer.layer.{bid}.attention.q_lin", # distillbert
  "transformer.h.{bid}.attn.q_proj", # gpt-j
  "model.layers.layers.{bid}.self_attn.q_proj", # plamo
  "model.layers.{bid}.attention.wq", # internlm2
  "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
  "transformer.h.{bid}.attn.attention.q_proj", # exaone
+ "model.layers.{bid}.self_attn.q_proj", # llama4
  ),

  # Attention key
@@ -172,12 +194,14 @@ class TensorNameMap:
  "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
  "layers.{bid}.attention.wk", # llama-pth
  "encoder.layer.{bid}.attention.self.key", # bert
+ "transformer.layer.{bid}.attention.k_lin", # distillbert
  "transformer.h.{bid}.attn.k_proj", # gpt-j
  "transformer.h.{bid}.attn.k", # refact
  "model.layers.layers.{bid}.self_attn.k_proj", # plamo
  "model.layers.{bid}.attention.wk", # internlm2
  "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
  "transformer.h.{bid}.attn.attention.k_proj", # exaone
+ "model.layers.{bid}.self_attn.k_proj", # llama4
  ),

  # Attention value
@@ -185,12 +209,14 @@ class TensorNameMap:
  "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
  "layers.{bid}.attention.wv", # llama-pth
  "encoder.layer.{bid}.attention.self.value", # bert
+ "transformer.layer.{bid}.attention.v_lin", # distillbert
  "transformer.h.{bid}.attn.v_proj", # gpt-j
  "transformer.h.{bid}.attn.v", # refact
  "model.layers.layers.{bid}.self_attn.v_proj", # plamo
  "model.layers.{bid}.attention.wv", # internlm2
  "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
  "transformer.h.{bid}.attn.attention.v_proj", # exaone
+ "model.layers.{bid}.self_attn.v_proj", # llama4
  ),

  # Attention output
@@ -204,6 +230,7 @@ class TensorNameMap:
  "model.layers.{bid}.self_attn.linear_attn", # deci
  "layers.{bid}.attention.wo", # llama-pth
  "encoder.layer.{bid}.attention.output.dense", # bert
+ "transformer.layer.{bid}.attention.out_lin", # distillbert
  "transformer.h.{bid}.attn.out_proj", # gpt-j
  "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
  "model.layers.{bid}.self_attn.dense", # persimmon
@@ -212,23 +239,27 @@ class TensorNameMap:
  "model.layers.layers.{bid}.self_attn.o_proj", # plamo
  "model.layers.{bid}.attention.wo", # internlm2
  "encoder.layers.{bid}.attn.out_proj", # nomic-bert
+ "encoder.layers.{bid}.mixer.out_proj", # jina
  "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
  "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
  "encoder.layers.{bid}.self_attention.dense", # chatglm
  "transformer.layers.{bid}.attn.out_proj", # openelm
  "transformer.h.{bid}.attn.attention.out_proj", # exaone
+ "model.layers.{bid}.self_attn.o_proj", # llama4
  ),

  # Attention output norm
  MODEL_TENSOR.ATTN_OUT_NORM: (
  "encoder.layer.{bid}.attention.output.LayerNorm", # bert
+ "transformer.layer.{bid}.sa_layer_norm", # distillbert
  "encoder.layers.{bid}.norm1", # nomic-bert
  "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
  "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
  ),

  MODEL_TENSOR.ATTN_POST_NORM: (
- "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
+ "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
+ "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
  ),

  # Rotary embeddings
@@ -254,6 +285,7 @@ class TensorNameMap:
  "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
  "encoder.layers.{bid}.post_attention_layernorm", # chatglm
  "transformer.layers.{bid}.ffn_norm", # openelm
+ "model.layers.{bid}.post_attention_layernorm", # llama4
  ),

  # Post feed-forward norm
@@ -264,6 +296,7 @@ class TensorNameMap:
  # Post feed-forward norm
  MODEL_TENSOR.FFN_POST_NORM: (
  "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
+ "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
  ),

  MODEL_TENSOR.FFN_GATE_INP: (
@@ -273,6 +306,8 @@ class TensorNameMap:
  "transformer.decoder_layer.{bid}.router", # Grok
  "transformer.blocks.{bid}.ffn.router.layer", # dbrx
  "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
+ "model.layers.{bid}.feed_forward.router", # llama4
+ "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
  ),

  MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -280,7 +315,7 @@ class TensorNameMap:
  ),

  MODEL_TENSOR.FFN_EXP_PROBS_B: (
- "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
+ "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
  ),

  # Feed-forward up
@@ -293,6 +328,7 @@ class TensorNameMap:
  "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
  "layers.{bid}.feed_forward.w3", # llama-pth
  "encoder.layer.{bid}.intermediate.dense", # bert
+ "transformer.layer.{bid}.ffn.lin1", # distillbert
  "transformer.h.{bid}.mlp.fc_in", # gpt-j
  "transformer.h.{bid}.mlp.linear_3", # refact
  "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
@@ -301,15 +337,19 @@ class TensorNameMap:
  "h.{bid}.mlp.c_fc", # gpt2
  "transformer.h.{bid}.mlp.fc1", # phi2
  "model.layers.{bid}.mlp.fc1", # phi2
- "model.layers.{bid}.mlp.gate_up_proj", # phi3
+ "model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414
  "model.layers.layers.{bid}.mlp.up_proj", # plamo
  "model.layers.{bid}.feed_forward.w3", # internlm2
  "encoder.layers.{bid}.mlp.fc11", # nomic-bert
+ "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
  "model.layers.{bid}.mlp.c_fc", # starcoder2
- "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
+ "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used)
+ "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU)
+ "encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU)
  "model.layers.{bid}.residual_mlp.w3", # arctic
  "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
  "transformer.h.{bid}.mlp.c_fc_1", # exaone
+ "model.layers.{bid}.feed_forward.up_proj", # llama4
  ),

  MODEL_TENSOR.FFN_UP_EXP: (
@@ -318,11 +358,14 @@ class TensorNameMap:
  "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
  "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
  "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
+ "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
+ "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
  ),

  MODEL_TENSOR.FFN_UP_SHEXP: (
- "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
- "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
+ "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
+ "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
+ "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
  ),

  # AWQ-activation gate
@@ -339,23 +382,26 @@ class TensorNameMap:
  "model.layers.layers.{bid}.mlp.gate_proj", # plamo
  "model.layers.{bid}.feed_forward.w1", # internlm2
  "encoder.layers.{bid}.mlp.fc12", # nomic-bert
- "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
+ "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used)
  "transformer.h.{bid}.mlp.linear_1", # refact
  "model.layers.{bid}.residual_mlp.w1", # arctic
  "transformer.h.{bid}.mlp.c_fc_0", # exaone
+ "model.layers.{bid}.feed_forward.gate_proj", # llama4
  ),

  MODEL_TENSOR.FFN_GATE_EXP: (
- "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
- "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
- "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
- "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
- "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
+ "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
+ "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
+ "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
+ "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
+ "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
+ "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
  ),

  MODEL_TENSOR.FFN_GATE_SHEXP: (
- "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
- "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
+ "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
+ "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
+ "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
  ),

  # Feed-forward down
@@ -368,6 +414,7 @@ class TensorNameMap:
  "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
  "layers.{bid}.feed_forward.w2", # llama-pth
  "encoder.layer.{bid}.output.dense", # bert
+ "transformer.layer.{bid}.ffn.lin2", # distillbert
  "transformer.h.{bid}.mlp.fc_out", # gpt-j
  "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
  "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
@@ -384,6 +431,7 @@ class TensorNameMap:
  "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
  "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
  "model.layers.h.{bid}.mlp.c_proj", # exaone
+ "model.layers.{bid}.feed_forward.down_proj", # llama4
  ),

  MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -393,11 +441,15 @@ class TensorNameMap:
  "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
  "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
  "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
+ "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
+ "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
  ),

  MODEL_TENSOR.FFN_DOWN_SHEXP: (
- "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
- "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
+ "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
+ "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
+ "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
+ "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
  ),

  MODEL_TENSOR.ATTN_Q_NORM: (
@@ -424,6 +476,7 @@ class TensorNameMap:
 
  MODEL_TENSOR.LAYER_OUT_NORM: (
  "encoder.layer.{bid}.output.LayerNorm", # bert
+ "transformer.layer.{bid}.output_layer_norm", # distillbert
  "encoder.layers.{bid}.norm2", # nomic-bert
  "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
  "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
@@ -465,112 +518,174 @@ class TensorNameMap:
  "backbone.layers.{bid}.mixer.out_proj",
  ),

+ MODEL_TENSOR.TIME_MIX_W0: (
+ "model.layers.{bid}.attention.w0", # rwkv7
+ ),
+
  MODEL_TENSOR.TIME_MIX_W1: (
- "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6
- "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
+ "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6
+ "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
+ "model.layers.{bid}.attention.w1", # rwkv7
  ),

  MODEL_TENSOR.TIME_MIX_W2: (
- "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6
- "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
+ "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6
+ "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
+ "model.layers.{bid}.attention.w2", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_A0: (
+ "model.layers.{bid}.attention.a0", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_A1: (
+ "model.layers.{bid}.attention.a1", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_A2: (
+ "model.layers.{bid}.attention.a2", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_V0: (
+ "model.layers.{bid}.attention.v0", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_V1: (
+ "model.layers.{bid}.attention.v1", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_V2: (
+ "model.layers.{bid}.attention.v2", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_G1: (
+ "model.layers.{bid}.attention.g1", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_G2: (
+ "model.layers.{bid}.attention.g2", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_K_K: (
+ "model.layers.{bid}.attention.k_k", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_K_A: (
+ "model.layers.{bid}.attention.k_a", # rwkv7
+ ),
+
+ MODEL_TENSOR.TIME_MIX_R_K: (
+ "model.layers.{bid}.attention.r_k", # rwkv7
  ),

  MODEL_TENSOR.TIME_MIX_LERP_X: (
- "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6
  "model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_LERP_K: (
- "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6
  "model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_LERP_V: (
- "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6
  "model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_LERP_R: (
- "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6
  "model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_LERP_G: (
- "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6
  "model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_LERP_W: (
- "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6
  "model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_FIRST: (
- "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6
  ),

  MODEL_TENSOR.TIME_MIX_DECAY: (
- "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_decay", # rwkv6
  "model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_DECAY_W1: (
- "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6
  "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_DECAY_W2: (
- "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6
+ "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6
  "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_KEY: (
- "rwkv.blocks.{bid}.attention.key", # rwkv
+ "rwkv.blocks.{bid}.attention.key", # rwkv6
  "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
+ "model.layers.{bid}.attention.key", # rwkv7
+ "model.layers.{bid}.attention.k_proj", # rwkv7
  ),

  MODEL_TENSOR.TIME_MIX_VALUE: (
- "rwkv.blocks.{bid}.attention.value", # rwkv
+ "rwkv.blocks.{bid}.attention.value", # rwkv6
  "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
+ "model.layers.{bid}.attention.value", # rwkv7
+ "model.layers.{bid}.attention.v_proj", # rwkv7
  ),

  MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
- "rwkv.blocks.{bid}.attention.receptance", # rwkv
- "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
+ "rwkv.blocks.{bid}.attention.receptance", # rwkv6
+ "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
+ "model.layers.{bid}.attention.receptance", # rwkv7
+ "model.layers.{bid}.attention.r_proj", # rwkv7
  ),

  MODEL_TENSOR.TIME_MIX_GATE: (
- "rwkv.blocks.{bid}.attention.gate", # rwkv
- "model.layers.{bid}.self_attn.gate", # rwkv6qwen2
+ "rwkv.blocks.{bid}.attention.gate", # rwkv6
+ "model.layers.{bid}.self_attn.gate", # rwkv6qwen2
  ),

  MODEL_TENSOR.TIME_MIX_LN: (
- "rwkv.blocks.{bid}.attention.ln_x", # rwkv
+ "rwkv.blocks.{bid}.attention.ln_x", # rwkv6
+ "model.layers.{bid}.attention.ln_x" # rwkv7
  ),

  MODEL_TENSOR.TIME_MIX_OUTPUT: (
- "rwkv.blocks.{bid}.attention.output", # rwkv
+ "rwkv.blocks.{bid}.attention.output", # rwkv6
  "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
+ "model.layers.{bid}.attention.output", # rwkv7
+ "model.layers.{bid}.attention.o_proj", # rwkv7
  ),

  MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
- "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
+ "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
+ "model.layers.{bid}.feed_forward.x_k", # rwkv7
  ),

  MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
- "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
+ "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
  ),

  MODEL_TENSOR.CHANNEL_MIX_KEY: (
- "rwkv.blocks.{bid}.feed_forward.key", # rwkv
+ "rwkv.blocks.{bid}.feed_forward.key", # rwkv6
+ "model.layers.{bid}.feed_forward.key", # rwkv7
  ),

  MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
- "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
+ "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
  ),

  MODEL_TENSOR.CHANNEL_MIX_VALUE: (
- "rwkv.blocks.{bid}.feed_forward.value", # rwkv
+ "rwkv.blocks.{bid}.feed_forward.value", # rwkv6
+ "model.layers.{bid}.feed_forward.value", # rwkv7
  ),

  MODEL_TENSOR.ATTN_Q_A: (
@@ -589,6 +704,14 @@ class TensorNameMap:
  "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
  ),

+ MODEL_TENSOR.ATTN_K_B: (
+ "model.layers.{bid}.self_attn.k_b_proj", # deepseek2
+ ),
+
+ MODEL_TENSOR.ATTN_V_B: (
+ "model.layers.{bid}.self_attn.v_b_proj", # deepseek2
+ ),
+
  MODEL_TENSOR.ATTN_Q_A_NORM: (
  "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
  ),
@@ -724,6 +847,7 @@ class TensorNameMap:
  MODEL_TENSOR.CLS: (
  "classifier", # jina
  "classifier.dense", # roberta
+ "pre_classifier", # distillbert
  ),

  MODEL_TENSOR.CLS_OUT: (
@@ -790,6 +914,295 @@ class TensorNameMap:
  MODEL_TENSOR.POSNET_ATTN_OUT: (
  "backbone.posnet.{bid}.proj_out", # wavtokenizer
  ),
+
+ #############################################################################
+ ## Vision encoder
+
+ MODEL_TENSOR.V_MMPROJ: (
+ "multi_modal_projector.linear_{bid}",
+ "visual.merger.mlp.{bid}", # qwen2vl
+ ),
+
+ MODEL_TENSOR.V_MMPROJ_FC: (
+ "model.connector.modality_projection.proj", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_MMPROJ_MLP: (
+ "model.mm_projector.mlp.mlp.{bid}",
+ "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
+ "mlp1.{bid}", # InternVL
+ ),
+
+ MODEL_TENSOR.V_MMPROJ_PEG: (
+ "model.mm_projector.peg.peg.{bid}",
+ ),
+
+ MODEL_TENSOR.V_ENC_EMBD_CLS: (
+ "vision_tower.vision_model.embeddings.class_embedding",
+ "vision_model.class_embedding", # llama 4
+ ),
+
+ MODEL_TENSOR.V_ENC_EMBD_PATCH: (
+ "vision_tower.vision_model.embeddings.patch_embedding",
+ "vpm.embeddings.patch_embedding",
+ "model.vision_model.embeddings.patch_embedding", # SmolVLM
+ "vision_tower.patch_conv", # pixtral
+ "vision_model.patch_embedding.linear", # llama 4
+ "visual.patch_embed.proj", # qwen2vl
+ ),
+
+ MODEL_TENSOR.V_ENC_EMBD_POS: (
+ "vision_tower.vision_model.embeddings.position_embedding",
+ "vpm.embeddings.position_embedding",
+ "model.vision_model.embeddings.position_embedding", # SmolVLM
+ "vision_model.positional_embedding_vlm", # llama 4
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_Q: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+ "vpm.encoder.layers.{bid}.self_attn.q_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+ "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
+ "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
+ "visual.blocks.{bid}.attn.q", # qwen2vl, generated
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
+ "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_K: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+ "vpm.encoder.layers.{bid}.self_attn.k_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+ "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
+ "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
+ "visual.blocks.{bid}.attn.k", # qwen2vl, generated
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
+ "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_V: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+ "vpm.encoder.layers.{bid}.self_attn.v_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+ "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
+ "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
+ "visual.blocks.{bid}.attn.v", # qwen2vl, generated
+ ),
+
+ MODEL_TENSOR.V_ENC_INPUT_NORM: (
+ "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+ "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
+ "vpm.encoder.layers.{bid}.layer_norm1",
+ "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
+ "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+ "vision_model.model.layers.{bid}.input_layernorm", # llama4
+ "visual.blocks.{bid}.norm1", # qwen2vl
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_O: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+ "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
+ "vpm.encoder.layers.{bid}.self_attn.out_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+ "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
+ "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
+ "visual.blocks.{bid}.attn.proj", # qwen2vl
+ ),
+
+ MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
+ "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+ "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
+ "vpm.encoder.layers.{bid}.layer_norm2",
+ "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+ "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
+ "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
+ "visual.blocks.{bid}.norm2", # qwen2vl
+ ),
+
+ MODEL_TENSOR.V_ENC_FFN_UP: (
+ "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+ "vpm.encoder.layers.{bid}.mlp.fc1",
+ "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
+ "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+ "vision_model.model.layers.{bid}.mlp.fc1", # llama4
+ "visual.blocks.{bid}.mlp.fc1", # qwen2vl
+ "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
+ ),
+
+ MODEL_TENSOR.V_ENC_FFN_GATE: (
+ "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
+ "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
+ ),
+
+ MODEL_TENSOR.V_ENC_FFN_DOWN: (
+ "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+ "vpm.encoder.layers.{bid}.mlp.fc2",
+ "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
+ "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+ "vision_model.model.layers.{bid}.mlp.fc2", # llama4
+ "visual.blocks.{bid}.mlp.fc2", # qwen2vl
+ "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
+ ),
+
+ MODEL_TENSOR.V_LAYER_SCALE_1: (
+ "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
+ ),
+
+ MODEL_TENSOR.V_LAYER_SCALE_2: (
+ "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
+ ),
+
+ MODEL_TENSOR.V_PRE_NORM: (
+ "vision_tower.vision_model.pre_layrnorm",
+ "vision_tower.ln_pre", # pixtral
+ "vision_model.layernorm_pre", # llama4
+ ),
+
+ MODEL_TENSOR.V_POST_NORM: (
+ "vision_tower.vision_model.post_layernorm",
+ "model.vision_model.post_layernorm", # SmolVLM
+ "vision_model.layernorm_post", # llama4
+ "visual.merger.ln_q", # qwen2vl
+ ),
+
+ MODEL_TENSOR.V_MM_INP_PROJ: (
+ "multi_modal_projector.mm_input_projection",
+ ),
+
+ MODEL_TENSOR.V_MM_INP_NORM: (
+ "multi_modal_projector.norm",
+ ),
+
+ MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
+ "multi_modal_projector.mm_soft_emb_norm",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
+ "resampler.pos_embed_k",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_Q: (
+ "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_K: (
+ "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_V: (
+ "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
+ "resampler.attn.out_proj",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_KV: (
+ "resampler.kv_proj",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_POST_NORM: (
+ "resampler.ln_post",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_KV_NORM: (
+ "resampler.ln_kv",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_Q_NORM: (
+ "resampler.ln_q",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_PROJ: (
+ "resampler.proj",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_QUERY: (
+ "resampler.query",
+ ),
+
+ MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
+ "v.token_embd.img_break", # for pixtral, this is a generated vector
+ ),
+
+ MODEL_TENSOR.V_MM_PATCH_MERGER: (
+ "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
+ ),
+
+ # audio (mtmd)
+
+ MODEL_TENSOR.A_ENC_EMBD_POS: (
+ "audio_tower.embed_positions", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_CONV1D: (
+ "audio_tower.conv{bid}", # ultravox
+ ),
+
+ MODEL_TENSOR.A_PRE_NORM: (),
+
+ MODEL_TENSOR.A_POST_NORM: (
+ "audio_tower.layer_norm", # ultravox
+ "audio_tower.ln_post", # qwen2omni
+ ),
+
+ MODEL_TENSOR.A_ENC_ATTN_Q: (
+ "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_ATTN_K: (
+ "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_ATTN_V: (
+ "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_INPUT_NORM: (
+ "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_OUTPUT: (
+ "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
+ "audio_tower.layers.{bid}.final_layer_norm", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_FFN_UP: (
+ "audio_tower.layers.{bid}.fc1", # ultravox
+ ),
+
+ MODEL_TENSOR.A_ENC_FFN_GATE: (),
+
+ MODEL_TENSOR.A_ENC_FFN_DOWN: (
+ "audio_tower.layers.{bid}.fc2", # ultravox
+ ),
+
+ # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
+ # this prefix is added in the conversion code in modify_tensors()
+
+ MODEL_TENSOR.A_MMPROJ: (
+ "audio.multi_modal_projector.linear_{bid}", # ultravox
+ ),
+
+ MODEL_TENSOR.A_MMPROJ_FC: (
+ "audio.multi_modal_projector.linear", # qwen2audio
+ "audio_tower.proj", # qwen2omni
+ ),
+
+ MODEL_TENSOR.A_MM_NORM_PRE: (
+ "audio.multi_modal_projector.ln_pre", # ultravox
+ ),
+
+ MODEL_TENSOR.A_MM_NORM_MID: (
+ "audio.multi_modal_projector.ln_mid", # ultravox
+ ),
  }

  # architecture-specific block mappings
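For context on how these tuples are consumed: each entry in TensorNameMap lists the source-model (Hugging Face) tensor names that the bundled HF-to-GGUF converters translate into one canonical GGUF tensor name, and this release mainly extends those lists with new aliases (llama4, rwkv7, glm-4-0414, distillbert, nomic-bert-moe, plus the vision and audio "mtmd" tensors). A minimal lookup sketch, assuming the bundled gguf-py package is importable as `gguf`; the architecture, block count, and tensor name below are illustrative only, not taken from this diff:

import gguf

# Build the name-mapping table for one architecture and block count (illustrative values).
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, 32)

# "model.layers.0.self_attn.q_proj" is one of the llama-hf aliases listed above;
# try_suffixes lets the lookup tolerate a trailing ".weight" or ".bias".
gguf_name = tensor_map.get_name("model.layers.0.self_attn.q_proj.weight",
                                try_suffixes=(".weight", ".bias"))
print(gguf_name)  # expected: "blk.0.attn_q.weight"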