bigdl-core-cpp 2.7.0b20250629__py3-none-win_amd64.whl → 2.7.0b20250701__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert_hf_to_gguf.py +1987 -558
- bigdl/cpp/convert_hf_to_gguf_update.py +131 -67
- bigdl/cpp/convert_lora_to_gguf.py +3 -3
- bigdl/cpp/gguf-py/gguf/constants.py +546 -16
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +57 -6
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +119 -7
- bigdl/cpp/gguf-py/gguf/lazy.py +10 -0
- bigdl/cpp/gguf-py/gguf/metadata.py +28 -8
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +461 -48
- bigdl/cpp/gguf-py/gguf/utility.py +195 -0
- bigdl/cpp/gguf-py/gguf/vocab.py +6 -1
- bigdl/cpp/libs/llama_cpp/ggml-base.dll +0 -0
- bigdl/cpp/libs/llama_cpp/ggml-cpu.dll +0 -0
- bigdl/cpp/libs/llama_cpp/ggml-sycl.dll +0 -0
- bigdl/cpp/libs/llama_cpp/ggml.dll +0 -0
- bigdl/cpp/libs/llama_cpp/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-gemma3-cli.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-server.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama_cpp/llama.dll +0 -0
- bigdl/cpp/libs/ollama/ggml-base.dll +0 -0
- bigdl/cpp/libs/ollama/ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ollama/ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ollama/ggml.dll +0 -0
- bigdl/cpp/libs/ollama/llama.dll +0 -0
- bigdl/cpp/libs/ollama/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama/mtmd_shared.dll +0 -0
- bigdl/cpp/libs/ollama/ollama-lib.exe +0 -0
- bigdl/cpp/libs/ollama/ollama.exe +0 -0
- {bigdl_core_cpp-2.7.0b20250629.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-ollama.bat +1 -5
- {bigdl_core_cpp-2.7.0b20250629.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.7.0b20250701.dist-info/RECORD +56 -0
- bigdl/cpp/libs/llama_cpp/llava_shared.dll +0 -0
- bigdl_core_cpp-2.7.0b20250629.dist-info/RECORD +0 -56
- {bigdl_core_cpp-2.7.0b20250629.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.7.0b20250629.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.7.0b20250629.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.7.0b20250629.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/top_level.txt +0 -0
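Most of the non-binary churn is in bigdl/cpp/gguf-py/gguf/tensor_mapping.py (+461 -48, shown below), which extends TensorNameMap with Hugging Face tensor names for newly covered architectures (llama4, rwkv7, glm-4-0414, distillbert, nomic-bert-moe, bailingmoe, among others) plus new vision/audio (mtmd) encoder tensors. A minimal usage sketch, assuming the bundled gguf-py mirrors upstream llama.cpp's and is importable as `gguf` (the import path and `MODEL_ARCH.LLAMA4` are assumptions, not verified against this wheel):

```python
# Hedged sketch: resolve a Hugging Face tensor name to its GGUF name via the
# TensorNameMap patched in this release. Assumes the bundled gguf-py matches
# upstream llama.cpp's gguf-py and that MODEL_ARCH.LLAMA4 exists in constants.py.
import gguf

n_blocks = 48  # example value; use the checkpoint's real block count
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA4, n_blocks)

# "model.layers.{bid}.self_attn.q_proj" is one of the llama4 names added below.
hf_name = "model.layers.0.self_attn.q_proj.weight"
print(tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))
# expected: something like "blk.0.attn_q.weight" (the exact target comes from constants.TENSOR_NAMES)
```

This is the lookup path convert_hf_to_gguf.py relies on when renaming checkpoint tensors during conversion.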
bigdl/cpp/gguf-py/gguf/tensor_mapping.py
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings", # falcon
             "word_embeddings", # bloom
-            "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2
+            "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
             "tok_embeddings", # llama-pth
             "embeddings.word_embeddings", # bert nomic-bert
             "language_model.embedding.word_embeddings", # persimmon
@@ -27,7 +27,10 @@ class TensorNameMap:
             "embedding.word_embeddings", # chatglm
             "transformer.token_embeddings", # openelm
             "shared", # t5
-            "rwkv.embeddings", #
+            "rwkv.embeddings", # rwkv6
+            "model.embeddings", # rwkv7
+            "model.word_embeddings", # bailingmoe
+            "language_model.model.embed_tokens", # llama4
         ),

         # Token type embeddings
@@ -42,6 +45,9 @@ class TensorNameMap:
             "emb_ln", # nomic-bert
             "transformer.norm", # openelm
             "rwkv.blocks.0.pre_ln", # rwkv
+            "rwkv.blocks.0.pre_ln", # rwkv6
+            "model.pre_ln", # rwkv7
+            "model.layers.0.pre_norm", # rwkv7
             "backbone.norm", # wavtokenizer
         ),
@@ -62,6 +68,7 @@ class TensorNameMap:
             "output_layer", # chatglm
             "head", # rwkv
             "head.out", # wavtokenizer
+            "lm_head", # llama4
         ),

         # Output norm
@@ -81,8 +88,10 @@ class TensorNameMap:
             "encoder.final_layernorm", # chatglm
             "transformer.norm", # openelm
             "model.norm", # nemotron
-            "rwkv.ln_out", #
+            "rwkv.ln_out", # rwkv6
+            "model.ln_out", # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
+            "model.norm", # llama4
         ),

         # Rope frequencies
@@ -100,6 +109,13 @@ class TensorNameMap:

         MODEL_TENSOR.ROPE_FACTORS_LONG: (),
         MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
+
+        MODEL_TENSOR.CONV1D: (
+            "backbone.embed", # roberta
+        ),
+
+        MODEL_TENSOR.ROPE_FACTORS_LONG: (),
+        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
     }

     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -125,14 +141,17 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
             "encoder.layers.{bid}.input_layernorm", # chatglm
             "transformer.layers.{bid}.attn_norm", # openelm
-            "rwkv.blocks.{bid}.ln1", #
+            "rwkv.blocks.{bid}.ln1", # rwkv6
+            "model.layers.{bid}.ln1", # rwkv7
+            "model.layers.{bid}.input_layernorm", # llama4
         ),

         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn", # falcon40b
             "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
-            "rwkv.blocks.{bid}.ln2", #
+            "rwkv.blocks.{bid}.ln2", # rwkv6
+            "model.layers.{bid}.ln2", # rwkv7
         ),

         # Attention query-key-value
@@ -148,6 +167,7 @@ class TensorNameMap:
             "h.{bid}.attn.c_attn", # gpt2
             "transformer.h.{bid}.mixer.Wqkv", # phi2
             "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
+            "encoder.layers.{bid}.mixer.Wqkv", # jina
             "model.layers.{bid}.self_attn.qkv_proj", # phi3
             "encoder.layers.{bid}.self_attention.query_key_value", # chatglm
             "transformer.layers.{bid}.attn.qkv_proj", # openelm
@@ -159,11 +179,13 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
             "layers.{bid}.attention.wq", # llama-pth
             "encoder.layer.{bid}.attention.self.query", # bert
+            "transformer.layer.{bid}.attention.q_lin", # distillbert
             "transformer.h.{bid}.attn.q_proj", # gpt-j
             "model.layers.layers.{bid}.self_attn.q_proj", # plamo
             "model.layers.{bid}.attention.wq", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
             "transformer.h.{bid}.attn.attention.q_proj", # exaone
+            "model.layers.{bid}.self_attn.q_proj", # llama4
         ),

         # Attention key
@@ -172,12 +194,14 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
             "layers.{bid}.attention.wk", # llama-pth
             "encoder.layer.{bid}.attention.self.key", # bert
+            "transformer.layer.{bid}.attention.k_lin", # distillbert
             "transformer.h.{bid}.attn.k_proj", # gpt-j
             "transformer.h.{bid}.attn.k", # refact
             "model.layers.layers.{bid}.self_attn.k_proj", # plamo
             "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
             "transformer.h.{bid}.attn.attention.k_proj", # exaone
+            "model.layers.{bid}.self_attn.k_proj", # llama4
         ),

         # Attention value
@@ -185,12 +209,14 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
             "layers.{bid}.attention.wv", # llama-pth
             "encoder.layer.{bid}.attention.self.value", # bert
+            "transformer.layer.{bid}.attention.v_lin", # distillbert
             "transformer.h.{bid}.attn.v_proj", # gpt-j
             "transformer.h.{bid}.attn.v", # refact
             "model.layers.layers.{bid}.self_attn.v_proj", # plamo
             "model.layers.{bid}.attention.wv", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
             "transformer.h.{bid}.attn.attention.v_proj", # exaone
+            "model.layers.{bid}.self_attn.v_proj", # llama4
         ),

         # Attention output
@@ -204,6 +230,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.linear_attn", # deci
             "layers.{bid}.attention.wo", # llama-pth
             "encoder.layer.{bid}.attention.output.dense", # bert
+            "transformer.layer.{bid}.attention.out_lin", # distillbert
             "transformer.h.{bid}.attn.out_proj", # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
             "model.layers.{bid}.self_attn.dense", # persimmon
@@ -212,23 +239,27 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.o_proj", # plamo
             "model.layers.{bid}.attention.wo", # internlm2
             "encoder.layers.{bid}.attn.out_proj", # nomic-bert
+            "encoder.layers.{bid}.mixer.out_proj", # jina
             "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
             "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
             "encoder.layers.{bid}.self_attention.dense", # chatglm
             "transformer.layers.{bid}.attn.out_proj", # openelm
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
+            "model.layers.{bid}.self_attn.o_proj", # llama4
         ),

         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm", # bert
+            "transformer.layer.{bid}.sa_layer_norm", # distillbert
             "encoder.layers.{bid}.norm1", # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
+            "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
+            "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
         ),

         # Rotary embeddings
@@ -254,6 +285,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
             "encoder.layers.{bid}.post_attention_layernorm", # chatglm
             "transformer.layers.{bid}.ffn_norm", # openelm
+            "model.layers.{bid}.post_attention_layernorm", # llama4
         ),

         # Post feed-forward norm
@@ -264,6 +296,7 @@ class TensorNameMap:
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
             "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
+            "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
@@ -273,6 +306,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.router", # Grok
             "transformer.blocks.{bid}.ffn.router.layer", # dbrx
             "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
+            "model.layers.{bid}.feed_forward.router", # llama4
+            "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -280,7 +315,7 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_EXP_PROBS_B: (
-            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
+            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
         ),

         # Feed-forward up
@@ -293,6 +328,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3", # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
+            "transformer.layer.{bid}.ffn.lin1", # distillbert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
             "transformer.h.{bid}.mlp.linear_3", # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
@@ -301,15 +337,19 @@ class TensorNameMap:
             "h.{bid}.mlp.c_fc", # gpt2
             "transformer.h.{bid}.mlp.fc1", # phi2
             "model.layers.{bid}.mlp.fc1", # phi2
-            "model.layers.{bid}.mlp.gate_up_proj", # phi3
+            "model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414
             "model.layers.layers.{bid}.mlp.up_proj", # plamo
             "model.layers.{bid}.feed_forward.w3", # internlm2
             "encoder.layers.{bid}.mlp.fc11", # nomic-bert
+            "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
             "model.layers.{bid}.mlp.c_fc", # starcoder2
-            "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used)
+            "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU)
+            "encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU)
             "model.layers.{bid}.residual_mlp.w3", # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
             "transformer.h.{bid}.mlp.c_fc_1", # exaone
+            "model.layers.{bid}.feed_forward.up_proj", # llama4
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -318,11 +358,14 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
             "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
+            "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_UP_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.up_proj",
-            "model.layers.{bid}.mlp.shared_experts.up_proj",
+            "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
         ),

         # AWQ-activation gate
@@ -339,23 +382,26 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj", # plamo
             "model.layers.{bid}.feed_forward.w1", # internlm2
             "encoder.layers.{bid}.mlp.fc12", # nomic-bert
-            "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used)
             "transformer.h.{bid}.mlp.linear_1", # refact
             "model.layers.{bid}.residual_mlp.w1", # arctic
             "transformer.h.{bid}.mlp.c_fc_0", # exaone
+            "model.layers.{bid}.feed_forward.gate_proj", # llama4
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",
-            "transformer.decoder_layer.{bid}.moe.linear",
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",
-            "model.layers.{bid}.mlp.experts.gate_proj",
-            "model.layers.{bid}.block_sparse_moe.experts.w1",
+            "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
         ),

         MODEL_TENSOR.FFN_GATE_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.gate_proj",
-            "model.layers.{bid}.mlp.shared_experts.gate_proj",
+            "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
         ),

         # Feed-forward down
@@ -368,6 +414,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2", # llama-pth
             "encoder.layer.{bid}.output.dense", # bert
+            "transformer.layer.{bid}.ffn.lin2", # distillbert
             "transformer.h.{bid}.mlp.fc_out", # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
             "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
@@ -384,6 +431,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
             "model.layers.h.{bid}.mlp.c_proj", # exaone
+            "model.layers.{bid}.feed_forward.down_proj", # llama4
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -393,11 +441,15 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
             "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
+            "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.down_proj",
-            "model.layers.{bid}.mlp.shared_experts.down_proj",
+            "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
+            "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
@@ -424,6 +476,7 @@ class TensorNameMap:

         MODEL_TENSOR.LAYER_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm", # bert
+            "transformer.layer.{bid}.output_layer_norm", # distillbert
             "encoder.layers.{bid}.norm2", # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
             "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
@@ -465,112 +518,174 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.out_proj",
         ),

+        MODEL_TENSOR.TIME_MIX_W0: (
+            "model.layers.{bid}.attention.w0", # rwkv7
+        ),
+
         MODEL_TENSOR.TIME_MIX_W1: (
-            "rwkv.blocks.{bid}.attention.time_maa_w1",
-            "model.layers.{bid}.self_attn.time_maa_w1",
+            "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
+            "model.layers.{bid}.attention.w1", # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_W2: (
-            "rwkv.blocks.{bid}.attention.time_maa_w2",
-            "model.layers.{bid}.self_attn.time_maa_w2",
+            "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
+            "model.layers.{bid}.attention.w2", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_A0: (
+            "model.layers.{bid}.attention.a0", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_A1: (
+            "model.layers.{bid}.attention.a1", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_A2: (
+            "model.layers.{bid}.attention.a2", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_V0: (
+            "model.layers.{bid}.attention.v0", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_V1: (
+            "model.layers.{bid}.attention.v1", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_V2: (
+            "model.layers.{bid}.attention.v2", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_G1: (
+            "model.layers.{bid}.attention.g1", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_G2: (
+            "model.layers.{bid}.attention.g2", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_K_K: (
+            "model.layers.{bid}.attention.k_k", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_K_A: (
+            "model.layers.{bid}.attention.k_a", # rwkv7
+        ),
+
+        MODEL_TENSOR.TIME_MIX_R_K: (
+            "model.layers.{bid}.attention.r_k", # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_LERP_X: (
-            "rwkv.blocks.{bid}.attention.time_maa_x", #
+            "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_LERP_K: (
-            "rwkv.blocks.{bid}.attention.time_maa_k", #
+            "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_LERP_V: (
-            "rwkv.blocks.{bid}.attention.time_maa_v", #
+            "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_LERP_R: (
-            "rwkv.blocks.{bid}.attention.time_maa_r", #
+            "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_LERP_G: (
-            "rwkv.blocks.{bid}.attention.time_maa_g", #
+            "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_LERP_W: (
-            "rwkv.blocks.{bid}.attention.time_maa_w", #
+            "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6
             "model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_FIRST: (
-            "rwkv.blocks.{bid}.attention.time_faaaa", #
+            "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6
         ),

         MODEL_TENSOR.TIME_MIX_DECAY: (
-            "rwkv.blocks.{bid}.attention.time_decay", #
+            "rwkv.blocks.{bid}.attention.time_decay", # rwkv6
             "model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_DECAY_W1: (
-            "rwkv.blocks.{bid}.attention.time_decay_w1", #
+            "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6
             "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_DECAY_W2: (
-            "rwkv.blocks.{bid}.attention.time_decay_w2", #
+            "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6
             "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_KEY: (
-            "rwkv.blocks.{bid}.attention.key", #
+            "rwkv.blocks.{bid}.attention.key", # rwkv6
             "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.key", # rwkv7
+            "model.layers.{bid}.attention.k_proj", # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_VALUE: (
-            "rwkv.blocks.{bid}.attention.value", #
+            "rwkv.blocks.{bid}.attention.value", # rwkv6
             "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.value", # rwkv7
+            "model.layers.{bid}.attention.v_proj", # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
-            "rwkv.blocks.{bid}.attention.receptance",
-            "model.layers.{bid}.self_attn.q_proj",
+            "rwkv.blocks.{bid}.attention.receptance", # rwkv6
+            "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.receptance", # rwkv7
+            "model.layers.{bid}.attention.r_proj", # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_GATE: (
-            "rwkv.blocks.{bid}.attention.gate",
-            "model.layers.{bid}.self_attn.gate",
+            "rwkv.blocks.{bid}.attention.gate", # rwkv6
+            "model.layers.{bid}.self_attn.gate", # rwkv6qwen2
         ),

         MODEL_TENSOR.TIME_MIX_LN: (
-            "rwkv.blocks.{bid}.attention.ln_x", #
+            "rwkv.blocks.{bid}.attention.ln_x", # rwkv6
+            "model.layers.{bid}.attention.ln_x" # rwkv7
         ),

         MODEL_TENSOR.TIME_MIX_OUTPUT: (
-            "rwkv.blocks.{bid}.attention.output", #
+            "rwkv.blocks.{bid}.attention.output", # rwkv6
             "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
+            "model.layers.{bid}.attention.output", # rwkv7
+            "model.layers.{bid}.attention.o_proj", # rwkv7
         ),

         MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
-            "rwkv.blocks.{bid}.feed_forward.time_maa_k", #
+            "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
+            "model.layers.{bid}.feed_forward.x_k", # rwkv7
         ),

         MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
-            "rwkv.blocks.{bid}.feed_forward.time_maa_r", #
+            "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
         ),

         MODEL_TENSOR.CHANNEL_MIX_KEY: (
-            "rwkv.blocks.{bid}.feed_forward.key",
+            "rwkv.blocks.{bid}.feed_forward.key", # rwkv6
+            "model.layers.{bid}.feed_forward.key", # rwkv7
         ),

         MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
-            "rwkv.blocks.{bid}.feed_forward.receptance", #
+            "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
         ),

         MODEL_TENSOR.CHANNEL_MIX_VALUE: (
-            "rwkv.blocks.{bid}.feed_forward.value",
+            "rwkv.blocks.{bid}.feed_forward.value", # rwkv6
+            "model.layers.{bid}.feed_forward.value", # rwkv7
         ),

         MODEL_TENSOR.ATTN_Q_A: (
@@ -589,6 +704,14 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
         ),

+        MODEL_TENSOR.ATTN_K_B: (
+            "model.layers.{bid}.self_attn.k_b_proj", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_V_B: (
+            "model.layers.{bid}.self_attn.v_b_proj", # deepseek2
+        ),
+
         MODEL_TENSOR.ATTN_Q_A_NORM: (
             "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
         ),
@@ -724,6 +847,7 @@ class TensorNameMap:
         MODEL_TENSOR.CLS: (
             "classifier", # jina
             "classifier.dense", # roberta
+            "pre_classifier", # distillbert
         ),

         MODEL_TENSOR.CLS_OUT: (
@@ -790,6 +914,295 @@ class TensorNameMap:
         MODEL_TENSOR.POSNET_ATTN_OUT: (
             "backbone.posnet.{bid}.proj_out", # wavtokenizer
         ),
+
+        #############################################################################
+        ## Vision encoder
+
+        MODEL_TENSOR.V_MMPROJ: (
+            "multi_modal_projector.linear_{bid}",
+            "visual.merger.mlp.{bid}", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_FC: (
+            "model.connector.modality_projection.proj", # SmolVLM
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_MLP: (
+            "model.mm_projector.mlp.mlp.{bid}",
+            "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
+            "mlp1.{bid}", # InternVL
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_PEG: (
+            "model.mm_projector.peg.peg.{bid}",
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_CLS: (
+            "vision_tower.vision_model.embeddings.class_embedding",
+            "vision_model.class_embedding", # llama 4
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
+            "vision_tower.vision_model.embeddings.patch_embedding",
+            "vpm.embeddings.patch_embedding",
+            "model.vision_model.embeddings.patch_embedding", # SmolVLM
+            "vision_tower.patch_conv", # pixtral
+            "vision_model.patch_embedding.linear", # llama 4
+            "visual.patch_embed.proj", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_POS: (
+            "vision_tower.vision_model.embeddings.position_embedding",
+            "vpm.embeddings.position_embedding",
+            "model.vision_model.embeddings.position_embedding", # SmolVLM
+            "vision_model.positional_embedding_vlm", # llama 4
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_Q: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+            "vpm.encoder.layers.{bid}.self_attn.q_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
+            "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
+            "visual.blocks.{bid}.attn.q", # qwen2vl, generated
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_K: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+            "vpm.encoder.layers.{bid}.self_attn.k_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
+            "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
+            "visual.blocks.{bid}.attn.k", # qwen2vl, generated
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_V: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+            "vpm.encoder.layers.{bid}.self_attn.v_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
+            "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
+            "visual.blocks.{bid}.attn.v", # qwen2vl, generated
+        ),
+
+        MODEL_TENSOR.V_ENC_INPUT_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+            "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
+            "vpm.encoder.layers.{bid}.layer_norm1",
+            "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
+            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "vision_model.model.layers.{bid}.input_layernorm", # llama4
+            "visual.blocks.{bid}.norm1", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_O: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
+            "vpm.encoder.layers.{bid}.self_attn.out_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
+            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
+            "visual.blocks.{bid}.attn.proj", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+            "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
+            "vpm.encoder.layers.{bid}.layer_norm2",
+            "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+            "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
+            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
+            "visual.blocks.{bid}.norm2", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_UP: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+            "vpm.encoder.layers.{bid}.mlp.fc1",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
+            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc1", # llama4
+            "visual.blocks.{bid}.mlp.fc1", # qwen2vl
+            "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_GATE: (
+            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
+            "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_DOWN: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+            "vpm.encoder.layers.{bid}.mlp.fc2",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
+            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc2", # llama4
+            "visual.blocks.{bid}.mlp.fc2", # qwen2vl
+            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
+        ),
+
+        MODEL_TENSOR.V_LAYER_SCALE_1: (
+            "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
+        ),
+
+        MODEL_TENSOR.V_LAYER_SCALE_2: (
+            "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
+        ),
+
+        MODEL_TENSOR.V_PRE_NORM: (
+            "vision_tower.vision_model.pre_layrnorm",
+            "vision_tower.ln_pre", # pixtral
+            "vision_model.layernorm_pre", # llama4
+        ),
+
+        MODEL_TENSOR.V_POST_NORM: (
+            "vision_tower.vision_model.post_layernorm",
+            "model.vision_model.post_layernorm", # SmolVLM
+            "vision_model.layernorm_post", # llama4
+            "visual.merger.ln_q", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_MM_INP_PROJ: (
+            "multi_modal_projector.mm_input_projection",
+        ),
+
+        MODEL_TENSOR.V_MM_INP_NORM: (
+            "multi_modal_projector.norm",
+        ),
+
+        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
+            "multi_modal_projector.mm_soft_emb_norm",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
+            "resampler.pos_embed_k",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_ATTN_Q: (
+            "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
+        ),
+
+        MODEL_TENSOR.V_RESMPL_ATTN_K: (
+            "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
+        ),
+
+        MODEL_TENSOR.V_RESMPL_ATTN_V: (
+            "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
+        ),
+
+        MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
+            "resampler.attn.out_proj",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_KV: (
+            "resampler.kv_proj",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_POST_NORM: (
+            "resampler.ln_post",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_KV_NORM: (
+            "resampler.ln_kv",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_Q_NORM: (
+            "resampler.ln_q",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_PROJ: (
+            "resampler.proj",
+        ),
+
+        MODEL_TENSOR.V_RESMPL_QUERY: (
+            "resampler.query",
+        ),
+
+        MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
+            "v.token_embd.img_break", # for pixtral, this is a generated vector
+        ),
+
+        MODEL_TENSOR.V_MM_PATCH_MERGER: (
+            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
+        ),
+
+        # audio (mtmd)
+
+        MODEL_TENSOR.A_ENC_EMBD_POS: (
+            "audio_tower.embed_positions", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV1D: (
+            "audio_tower.conv{bid}", # ultravox
+        ),
+
+        MODEL_TENSOR.A_PRE_NORM: (),
+
+        MODEL_TENSOR.A_POST_NORM: (
+            "audio_tower.layer_norm", # ultravox
+            "audio_tower.ln_post", # qwen2omni
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_Q: (
+            "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_K: (
+            "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_V: (
+            "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_INPUT_NORM: (
+            "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_OUTPUT: (
+            "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
+            "audio_tower.layers.{bid}.final_layer_norm", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_UP: (
+            "audio_tower.layers.{bid}.fc1", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_GATE: (),
+
+        MODEL_TENSOR.A_ENC_FFN_DOWN: (
+            "audio_tower.layers.{bid}.fc2", # ultravox
+        ),
+
+        # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
+        # this prefix is added in the conversion code in modify_tensors()
+
+        MODEL_TENSOR.A_MMPROJ: (
+            "audio.multi_modal_projector.linear_{bid}", # ultravox
+        ),
+
+        MODEL_TENSOR.A_MMPROJ_FC: (
+            "audio.multi_modal_projector.linear", # qwen2audio
+            "audio_tower.proj", # qwen2omni
+        ),
+
+        MODEL_TENSOR.A_MM_NORM_PRE: (
+            "audio.multi_modal_projector.ln_pre", # ultravox
+        ),
+
+        MODEL_TENSOR.A_MM_NORM_MID: (
+            "audio.multi_modal_projector.ln_mid", # ultravox
+        ),
     }

     # architecture-specific block mappings
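The block-level entries above are `{bid}` templates; when the map is built they are expanded once per block index, which is how a concrete checkpoint name such as `model.layers.3.self_attn.q_proj` becomes a key in the lookup table. A rough, self-contained illustration of that expansion, using a hypothetical two-entry table rather than the real `block_mappings_cfg`:

```python
# Illustrative sketch only: expand {bid}-templated HF names into a flat lookup,
# roughly the way TensorNameMap builds its mapping. The table is a hypothetical
# subset; the real class also stores the per-architecture GGUF target name.
from enum import Enum, auto

class Tensor(Enum):
    ATTN_Q = auto()
    FFN_UP = auto()

BLOCK_MAPPINGS: dict[Tensor, tuple[str, ...]] = {
    Tensor.ATTN_Q: ("model.layers.{bid}.self_attn.q_proj",),      # llama4-style name from the diff
    Tensor.FFN_UP: ("model.layers.{bid}.feed_forward.up_proj",),  # llama4-style name from the diff
}

def build_lookup(n_blocks: int) -> dict[str, tuple[Tensor, int]]:
    """Expand each template once per block index into a flat name -> (tensor, bid) map."""
    lookup: dict[str, tuple[Tensor, int]] = {}
    for tensor, templates in BLOCK_MAPPINGS.items():
        for bid in range(n_blocks):
            for template in templates:
                lookup[template.format(bid=bid)] = (tensor, bid)
    return lookup

if __name__ == "__main__":
    lookup = build_lookup(n_blocks=4)
    print(lookup["model.layers.3.self_attn.q_proj"])  # (<Tensor.ATTN_Q: 1>, 3)
```

In the real class, if it follows upstream gguf-py, the stored value is the (MODEL_TENSOR, formatted GGUF name) pair taken from constants.TENSOR_NAMES rather than just the enum.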