bigdl-core-cpp 2.1.0b20240820.post1__py3-none-win_amd64.whl → 2.2.0b20250217.post0__py3-none-win_amd64.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +908 -140
- bigdl/cpp/convert_hf_to_gguf_update.py +376 -0
- bigdl/cpp/convert_llama_ggml_to_gguf.py +450 -0
- bigdl/cpp/convert_lora_to_gguf.py +433 -0
- bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
- bigdl/cpp/gguf-py/gguf/constants.py +414 -89
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +77 -14
- bigdl/cpp/gguf-py/gguf/lazy.py +3 -1
- bigdl/cpp/gguf-py/gguf/metadata.py +195 -76
- bigdl/cpp/gguf-py/gguf/quants.py +1210 -64
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +156 -34
- bigdl/cpp/gguf-py/gguf/utility.py +1 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +325 -3
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/ggml-base.dll +0 -0
- bigdl/cpp/libs/ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ggml.dll +0 -0
- bigdl/cpp/libs/libc++.dll +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ollama-lib.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/ollama_ggml.dll +0 -0
- bigdl/cpp/libs/ollama_llama.dll +0 -0
- bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.bat +7 -2
- bigdl_core_cpp-2.2.0b20250217.post0.data/scripts/init-ollama.bat +16 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/METADATA +9 -5
- bigdl_core_cpp-2.2.0b20250217.post0.dist-info/RECORD +56 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/WHEEL +1 -1
- bigdl/cpp/convert.py +0 -1714
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- bigdl_core_cpp-2.1.0b20240820.post1.data/scripts/init-ollama.bat +0 -13
- bigdl_core_cpp-2.1.0b20240820.post1.dist-info/RECORD +0 -63
- {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/top_level.txt +0 -0
--- a/bigdl/cpp/gguf-py/gguf/tensor_mapping.py
+++ b/bigdl/cpp/gguf-py/gguf/tensor_mapping.py
@@ -10,10 +10,10 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in", # gptneox
-            "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
+            "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings", # falcon
             "word_embeddings", # bloom
-            "model.embed_tokens", # llama-hf
+            "model.embed_tokens", # llama-hf nemotron olmoe olmo2
             "tok_embeddings", # llama-pth
             "embeddings.word_embeddings", # bert nomic-bert
             "language_model.embedding.word_embeddings", # persimmon
@@ -27,6 +27,7 @@ class TensorNameMap:
             "embedding.word_embeddings", # chatglm
             "transformer.token_embeddings", # openelm
             "shared", # t5
+            "rwkv.embeddings", # rwkv
         ),

         # Token type embeddings
@@ -40,6 +41,7 @@ class TensorNameMap:
             "embeddings.LayerNorm", # bert
             "emb_ln", # nomic-bert
             "transformer.norm", # openelm
+            "rwkv.blocks.0.pre_ln", # rwkv
         ),

         # Position embeddings
@@ -52,18 +54,19 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out", # gptneox
-            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
+            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
             "output", # llama-pth bloom internlm2
             "word_embeddings_for_head", # persimmon
             "lm_head.linear", # phi2
             "output_layer", # chatglm
+            "head", # rwkv
         ),

         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm", # gptneox
-            "transformer.ln_f", # gpt2 gpt-j falcon jais
-            "model.norm", # llama-hf baichuan internlm2
+            "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
+            "model.norm", # llama-hf baichuan internlm2 olmoe olmo2
             "norm", # llama-pth
             "transformer.norm_f", # mpt dbrx
             "ln_f", # refact bloom qwen gpt2
@@ -75,6 +78,8 @@ class TensorNameMap:
             "transformer.rms_norm", # Grok
             "encoder.final_layernorm", # chatglm
             "transformer.norm", # openelm
+            "model.norm", # nemotron
+            "rwkv.ln_out", # rwkv
         ),

         # Rope frequencies
@@ -82,18 +87,21 @@ class TensorNameMap:
             "rope.freqs", # llama-pth
             "rotary_pos_emb.inv_freq", # chatglm
         ),
+
+        MODEL_TENSOR.ROPE_FACTORS_LONG: (),
+        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
     }

     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm", # gptneox
-            "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais
+            "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone
             "transformer.blocks.{bid}.norm_1", # mpt
             "transformer.h.{bid}.input_layernorm", # falcon7b
             "h.{bid}.input_layernorm", # bloom
             "transformer.h.{bid}.ln_mlp", # falcon40b
-            "model.layers.{bid}.input_layernorm", # llama-hf
+            "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe
             "layers.{bid}.attention_norm", # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
             "model.layers.{bid}.ln1", # yi
@@ -107,12 +115,14 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
             "encoder.layers.{bid}.input_layernorm", # chatglm
             "transformer.layers.{bid}.attn_norm", # openelm
+            "rwkv.blocks.{bid}.ln1", # rwkv
         ),

         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
-            "transformer.h.{bid}.ln_attn",
+            "transformer.h.{bid}.ln_attn", # falcon40b
             "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
+            "rwkv.blocks.{bid}.ln2", # rwkv
         ),

         # Attention query-key-value
@@ -135,18 +145,21 @@ class TensorNameMap:

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj", # llama-hf
+            "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
+            "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
             "layers.{bid}.attention.wq", # llama-pth
             "encoder.layer.{bid}.attention.self.query", # bert
             "transformer.h.{bid}.attn.q_proj", # gpt-j
             "model.layers.layers.{bid}.self_attn.q_proj", # plamo
             "model.layers.{bid}.attention.wq", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
+            "transformer.h.{bid}.attn.attention.q_proj", # exaone
         ),

         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj", # llama-hf
+            "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
+            "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
             "layers.{bid}.attention.wk", # llama-pth
             "encoder.layer.{bid}.attention.self.key", # bert
             "transformer.h.{bid}.attn.k_proj", # gpt-j
@@ -154,18 +167,20 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.k_proj", # plamo
             "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
+            "transformer.h.{bid}.attn.attention.k_proj", # exaone
         ),

         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj", # llama-hf
+            "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wv", # llama-pth
             "encoder.layer.{bid}.attention.self.value", # bert
             "transformer.h.{bid}.attn.v_proj", # gpt-j
             "transformer.h.{bid}.attn.v", # refact
             "model.layers.layers.{bid}.self_attn.v_proj", # plamo
             "model.layers.{bid}.attention.wv", # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.value"
+            "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
+            "transformer.h.{bid}.attn.attention.v_proj", # exaone
         ),

         # Attention output
@@ -175,7 +190,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj", # mpt
             "transformer.h.{bid}.self_attention.dense", # falcon
             "h.{bid}.self_attention.dense", # bloom
-            "model.layers.{bid}.self_attn.o_proj", # llama-hf
+            "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wo", # llama-pth
             "encoder.layer.{bid}.attention.output.dense", # bert
             "transformer.h.{bid}.attn.out_proj", # gpt-j
@@ -190,6 +205,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
             "encoder.layers.{bid}.self_attention.dense", # chatglm
             "transformer.layers.{bid}.attn.out_proj", # openelm
+            "transformer.h.{bid}.attn.attention.out_proj", # exaone
         ),

         # Attention output norm
@@ -201,7 +217,7 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm", # gemma2
+            "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
         ),

         # Rotary embeddings
@@ -215,10 +231,10 @@ class TensorNameMap:
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
-            "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
+            "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
             "h.{bid}.post_attention_layernorm", # bloom
             "transformer.blocks.{bid}.norm_2", # mpt
-            "model.layers.{bid}.post_attention_layernorm", # llama-hf
+            "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe
             "layers.{bid}.ffn_norm", # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
             "model.layers.{bid}.ln2", # yi
@@ -236,15 +252,16 @@ class TensorNameMap:

         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
-            "model.layers.{bid}.post_feedforward_layernorm", # gemma2
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",
-            "model.layers.{bid}.block_sparse_moe.gate",
-            "model.layers.{bid}.mlp.gate",
-            "transformer.decoder_layer.{bid}.router",
-            "transformer.blocks.{bid}.ffn.router.layer",
+            "layers.{bid}.feed_forward.gate", # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
+            "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
+            "transformer.decoder_layer.{bid}.router", # Grok
+            "transformer.blocks.{bid}.ffn.router.layer", # dbrx
+            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),

         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -258,7 +275,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj", # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
             "h.{bid}.mlp.dense_h_to_4h", # bloom
-            "model.layers.{bid}.mlp.up_proj", # llama-hf refact
+            "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3", # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
@@ -277,13 +294,14 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
             "model.layers.{bid}.residual_mlp.w3", # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
+            "transformer.h.{bid}.mlp.c_fc_1", # exaone
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
-            "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
         ),

         MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -298,7 +316,7 @@ class TensorNameMap:

         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
+            "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
             "layers.{bid}.feed_forward.w1", # llama-pth
             "transformer.h.{bid}.mlp.w2", # qwen
             "transformer.h.{bid}.mlp.c_fc2", # jais
@@ -308,13 +326,14 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
             "transformer.h.{bid}.mlp.linear_1", # refact
             "model.layers.{bid}.residual_mlp.w1", # arctic
+            "transformer.h.{bid}.mlp.c_fc_0", # exaone
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
         ),

         MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -329,7 +348,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj", # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
             "h.{bid}.mlp.dense_4h_to_h", # bloom
-            "model.layers.{bid}.mlp.down_proj", # llama-hf
+            "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2", # llama-pth
             "encoder.layer.{bid}.output.dense", # bert
             "transformer.h.{bid}.mlp.fc_out", # gpt-j
@@ -347,13 +366,15 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w2", # arctic
             "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
+            "model.layers.h.{bid}.mlp.c_proj", # exaone
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",
-            "transformer.decoder_layer.{bid}.moe.linear_1",
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2",
-            "model.layers.{bid}.mlp.experts.down_proj",
+            "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
@@ -364,7 +385,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm", # persimmon
-            "model.layers.{bid}.self_attn.q_norm", # cohere
+            "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.q_ln", # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm", # openelm
@@ -373,7 +394,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm", # persimmon
-            "model.layers.{bid}.self_attn.k_norm", # cohere
+            "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.k_ln", # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm", # openelm
@@ -426,6 +447,98 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.out_proj",
         ),

+        MODEL_TENSOR.TIME_MIX_W1: (
+            "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_W2: (
+            "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_X: (
+            "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_K: (
+            "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_V: (
+            "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_R: (
+            "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_G: (
+            "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_W: (
+            "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_FIRST: (
+            "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_DECAY: (
+            "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_DECAY_W1: (
+            "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_DECAY_W2: (
+            "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_KEY: (
+            "rwkv.blocks.{bid}.attention.key", # rwkv
+        ),
+
+        MODEL_TENSOR.TIME_MIX_VALUE: (
+            "rwkv.blocks.{bid}.attention.value", # rwkv
+        ),
+
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
+            "rwkv.blocks.{bid}.attention.receptance", # rwkv
+        ),
+
+        MODEL_TENSOR.TIME_MIX_GATE: (
+            "rwkv.blocks.{bid}.attention.gate", # rwkv
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LN: (
+            "rwkv.blocks.{bid}.attention.ln_x", # rwkv
+        ),
+
+        MODEL_TENSOR.TIME_MIX_OUTPUT: (
+            "rwkv.blocks.{bid}.attention.output", # rwkv
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
+            "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
+            "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_KEY: (
+            "rwkv.blocks.{bid}.feed_forward.key", # rwkv
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
+            "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_VALUE: (
+            "rwkv.blocks.{bid}.feed_forward.value", # rwkv
+        ),
+
         MODEL_TENSOR.ATTN_Q_A: (
             "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
         ),
@@ -571,6 +684,15 @@ class TensorNameMap:
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm", # t5
         ),
+
+        MODEL_TENSOR.CLS: (
+            "classifier", # jina
+            "classifier.dense", # roberta
+        ),
+
+        MODEL_TENSOR.CLS_OUT: (
+            "classifier.out_proj", # roberta
+        ),
     }

     # architecture-specific block mappings
@@ -646,4 +768,4 @@ class TensorNameMap:


 def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
-    return TensorNameMap(arch, n_blocks)
+    return TensorNameMap(arch, n_blocks)
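Most of the `tensor_mapping.py` changes above extend the Hugging Face → GGUF tensor-name tables with additional architectures (exaone, nemotron, olmoe, olmo2, granitemoe, chameleon) and the RWKV v6 time-mix/channel-mix tensors. Below is a minimal sketch of how these tables are consumed during conversion, assuming the vendored gguf-py directory is on `sys.path` so that `import gguf` resolves (the bundled conversion scripts arrange this themselves); `MODEL_ARCH.LLAMA`, the 32-block count, and the tensor name are illustrative values, not taken from the diff.

```python
import gguf

# Build the name map for an architecture and block count (illustrative values).
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, 32)

# get_name() strips a known suffix, looks the base name up in the mapping
# tables shown above, and re-appends the suffix; unmapped names return None.
hf_name = "model.layers.0.self_attn.q_proj.weight"
gguf_name = tensor_map.get_name(hf_name, try_suffixes=(".weight", ".bias"))
print(gguf_name)  # expected: "blk.0.attn_q.weight"
```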
--- a/bigdl/cpp/gguf-py/gguf/utility.py
+++ b/bigdl/cpp/gguf-py/gguf/utility.py
@@ -66,4 +66,4 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st

     kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""

-    return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
+    return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
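The `utility.py` hunk only touches the trailing newline of the `naming_convention()` helper, which assembles the standard GGUF file name from model metadata. A hedged sketch of a call follows; only `model_name` and `base_name` are visible in the hunk header, so the remaining keyword names and the printed result assume the vendored copy matches upstream gguf-py.

```python
from gguf.utility import naming_convention

# Hedged sketch: parameter names other than model_name/base_name are assumed
# to follow upstream gguf-py (finetune_string, version_string, size_label,
# output_type); adjust if the vendored signature differs.
name = naming_convention(
    model_name="Mixtral",
    base_name=None,
    finetune_string="Instruct",
    version_string="v0.1",
    size_label="8x7B",
    output_type="Q8_0",
)
print(name)  # e.g. "Mixtral-8x7B-Instruct-v0.1-Q8_0" under these assumptions
```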