bigdl-core-cpp 2.1.0b20240820.post1__py3-none-win_amd64.whl → 2.2.0b20250217.post0__py3-none-win_amd64.whl

This diff compares publicly released versions of the package as they appear in their public registries and is provided for informational purposes only.
Files changed (91)
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +908 -140
  2. bigdl/cpp/convert_hf_to_gguf_update.py +376 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +450 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +433 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +414 -89
  7. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  8. bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
  9. bigdl/cpp/gguf-py/gguf/gguf_writer.py +77 -14
  10. bigdl/cpp/gguf-py/gguf/lazy.py +3 -1
  11. bigdl/cpp/gguf-py/gguf/metadata.py +195 -76
  12. bigdl/cpp/gguf-py/gguf/quants.py +1210 -64
  13. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +156 -34
  14. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  15. bigdl/cpp/gguf-py/gguf/vocab.py +325 -3
  16. bigdl/cpp/libs/common.lib +0 -0
  17. bigdl/cpp/libs/ggml-base.dll +0 -0
  18. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  19. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  20. bigdl/cpp/libs/ggml.dll +0 -0
  21. bigdl/cpp/libs/libc++.dll +0 -0
  22. bigdl/cpp/libs/llama-batched.exe +0 -0
  23. bigdl/cpp/libs/llama-bench.exe +0 -0
  24. bigdl/cpp/libs/llama-cli.exe +0 -0
  25. bigdl/cpp/libs/llama-embedding.exe +0 -0
  26. bigdl/cpp/libs/llama-gguf.exe +0 -0
  27. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  28. bigdl/cpp/libs/llama-lookup.exe +0 -0
  29. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  30. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  31. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  32. bigdl/cpp/libs/llama-quantize.exe +0 -0
  33. bigdl/cpp/libs/llama-server.exe +0 -0
  34. bigdl/cpp/libs/llama-simple.exe +0 -0
  35. bigdl/cpp/libs/llama-speculative.exe +0 -0
  36. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  37. bigdl/cpp/libs/llama.dll +0 -0
  38. bigdl/cpp/libs/llava_shared.dll +0 -0
  39. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  40. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  41. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  42. bigdl/cpp/libs/ollama-lib.exe +0 -0
  43. bigdl/cpp/libs/ollama.exe +0 -0
  44. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  45. bigdl/cpp/libs/ollama_llama.dll +0 -0
  46. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  47. {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.bat +7 -2
  48. bigdl_core_cpp-2.2.0b20250217.post0.data/scripts/init-ollama.bat +16 -0
  49. {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/METADATA +9 -5
  50. bigdl_core_cpp-2.2.0b20250217.post0.dist-info/RECORD +56 -0
  51. {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/WHEEL +1 -1
  52. bigdl/cpp/convert.py +0 -1714
  53. bigdl/cpp/libs/baby-llama.exe +0 -0
  54. bigdl/cpp/libs/batched-bench.exe +0 -0
  55. bigdl/cpp/libs/batched.exe +0 -0
  56. bigdl/cpp/libs/beam-search.exe +0 -0
  57. bigdl/cpp/libs/benchmark.exe +0 -0
  58. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  59. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  60. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  61. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  62. bigdl/cpp/libs/embedding.exe +0 -0
  63. bigdl/cpp/libs/export-lora.exe +0 -0
  64. bigdl/cpp/libs/finetune.exe +0 -0
  65. bigdl/cpp/libs/ggml_shared.dll +0 -0
  66. bigdl/cpp/libs/gguf.exe +0 -0
  67. bigdl/cpp/libs/gritlm.exe +0 -0
  68. bigdl/cpp/libs/imatrix.exe +0 -0
  69. bigdl/cpp/libs/infill.exe +0 -0
  70. bigdl/cpp/libs/llava-cli.exe +0 -0
  71. bigdl/cpp/libs/lookahead.exe +0 -0
  72. bigdl/cpp/libs/lookup.exe +0 -0
  73. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  74. bigdl/cpp/libs/main.exe +0 -0
  75. bigdl/cpp/libs/parallel.exe +0 -0
  76. bigdl/cpp/libs/passkey.exe +0 -0
  77. bigdl/cpp/libs/perplexity.exe +0 -0
  78. bigdl/cpp/libs/q8dot.exe +0 -0
  79. bigdl/cpp/libs/quantize-stats.exe +0 -0
  80. bigdl/cpp/libs/quantize.exe +0 -0
  81. bigdl/cpp/libs/save-load-state.exe +0 -0
  82. bigdl/cpp/libs/server.exe +0 -0
  83. bigdl/cpp/libs/simple.exe +0 -0
  84. bigdl/cpp/libs/speculative.exe +0 -0
  85. bigdl/cpp/libs/tokenize.exe +0 -0
  86. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  87. bigdl/cpp/libs/vdot.exe +0 -0
  88. bigdl_core_cpp-2.1.0b20240820.post1.data/scripts/init-ollama.bat +0 -13
  89. bigdl_core_cpp-2.1.0b20240820.post1.dist-info/RECORD +0 -63
  90. {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.ps1 +0 -0
  91. {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/top_level.txt +0 -0
bigdl/cpp/gguf-py/gguf/tensor_mapping.py

@@ -10,10 +10,10 @@ class TensorNameMap:
  # Token embeddings
  MODEL_TENSOR.TOKEN_EMBD: (
  "gpt_neox.embed_in", # gptneox
- "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
+ "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
  "transformer.word_embeddings", # falcon
  "word_embeddings", # bloom
- "model.embed_tokens", # llama-hf
+ "model.embed_tokens", # llama-hf nemotron olmoe olmo2
  "tok_embeddings", # llama-pth
  "embeddings.word_embeddings", # bert nomic-bert
  "language_model.embedding.word_embeddings", # persimmon
@@ -27,6 +27,7 @@ class TensorNameMap:
  "embedding.word_embeddings", # chatglm
  "transformer.token_embeddings", # openelm
  "shared", # t5
+ "rwkv.embeddings", # rwkv
  ),

  # Token type embeddings
@@ -40,6 +41,7 @@ class TensorNameMap:
  "embeddings.LayerNorm", # bert
  "emb_ln", # nomic-bert
  "transformer.norm", # openelm
+ "rwkv.blocks.0.pre_ln", # rwkv
  ),

  # Position embeddings
@@ -52,18 +54,19 @@ class TensorNameMap:
  # Output
  MODEL_TENSOR.OUTPUT: (
  "embed_out", # gptneox
- "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
+ "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
  "output", # llama-pth bloom internlm2
  "word_embeddings_for_head", # persimmon
  "lm_head.linear", # phi2
  "output_layer", # chatglm
+ "head", # rwkv
  ),

  # Output norm
  MODEL_TENSOR.OUTPUT_NORM: (
  "gpt_neox.final_layer_norm", # gptneox
- "transformer.ln_f", # gpt2 gpt-j falcon jais
- "model.norm", # llama-hf baichuan internlm2
+ "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
+ "model.norm", # llama-hf baichuan internlm2 olmoe olmo2
  "norm", # llama-pth
  "transformer.norm_f", # mpt dbrx
  "ln_f", # refact bloom qwen gpt2
@@ -75,6 +78,8 @@ class TensorNameMap:
  "transformer.rms_norm", # Grok
  "encoder.final_layernorm", # chatglm
  "transformer.norm", # openelm
+ "model.norm", # nemotron
+ "rwkv.ln_out", # rwkv
  ),

  # Rope frequencies
@@ -82,18 +87,21 @@ class TensorNameMap:
  "rope.freqs", # llama-pth
  "rotary_pos_emb.inv_freq", # chatglm
  ),
+
+ MODEL_TENSOR.ROPE_FACTORS_LONG: (),
+ MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
  }

  block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
  # Attention norm
  MODEL_TENSOR.ATTN_NORM: (
  "gpt_neox.layers.{bid}.input_layernorm", # gptneox
- "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais
+ "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone
  "transformer.blocks.{bid}.norm_1", # mpt
  "transformer.h.{bid}.input_layernorm", # falcon7b
  "h.{bid}.input_layernorm", # bloom
  "transformer.h.{bid}.ln_mlp", # falcon40b
- "model.layers.{bid}.input_layernorm", # llama-hf
+ "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe
  "layers.{bid}.attention_norm", # llama-pth
  "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
  "model.layers.{bid}.ln1", # yi
@@ -107,12 +115,14 @@ class TensorNameMap:
  "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
  "encoder.layers.{bid}.input_layernorm", # chatglm
  "transformer.layers.{bid}.attn_norm", # openelm
+ "rwkv.blocks.{bid}.ln1", # rwkv
  ),

  # Attention norm 2
  MODEL_TENSOR.ATTN_NORM_2: (
- "transformer.h.{bid}.ln_attn", # falcon40b
+ "transformer.h.{bid}.ln_attn", # falcon40b
  "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
+ "rwkv.blocks.{bid}.ln2", # rwkv
  ),

  # Attention query-key-value
@@ -135,18 +145,21 @@ class TensorNameMap:

  # Attention query
  MODEL_TENSOR.ATTN_Q: (
- "model.layers.{bid}.self_attn.q_proj", # llama-hf
+ "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
+ "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
  "layers.{bid}.attention.wq", # llama-pth
  "encoder.layer.{bid}.attention.self.query", # bert
  "transformer.h.{bid}.attn.q_proj", # gpt-j
  "model.layers.layers.{bid}.self_attn.q_proj", # plamo
  "model.layers.{bid}.attention.wq", # internlm2
  "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
+ "transformer.h.{bid}.attn.attention.q_proj", # exaone
  ),

  # Attention key
  MODEL_TENSOR.ATTN_K: (
- "model.layers.{bid}.self_attn.k_proj", # llama-hf
+ "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
+ "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
  "layers.{bid}.attention.wk", # llama-pth
  "encoder.layer.{bid}.attention.self.key", # bert
  "transformer.h.{bid}.attn.k_proj", # gpt-j
@@ -154,18 +167,20 @@ class TensorNameMap:
  "model.layers.layers.{bid}.self_attn.k_proj", # plamo
  "model.layers.{bid}.attention.wk", # internlm2
  "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
+ "transformer.h.{bid}.attn.attention.k_proj", # exaone
  ),

  # Attention value
  MODEL_TENSOR.ATTN_V: (
- "model.layers.{bid}.self_attn.v_proj", # llama-hf
+ "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
  "layers.{bid}.attention.wv", # llama-pth
  "encoder.layer.{bid}.attention.self.value", # bert
  "transformer.h.{bid}.attn.v_proj", # gpt-j
  "transformer.h.{bid}.attn.v", # refact
  "model.layers.layers.{bid}.self_attn.v_proj", # plamo
  "model.layers.{bid}.attention.wv", # internlm2
- "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
+ "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
+ "transformer.h.{bid}.attn.attention.v_proj", # exaone
  ),

  # Attention output
@@ -175,7 +190,7 @@ class TensorNameMap:
  "transformer.blocks.{bid}.attn.out_proj", # mpt
  "transformer.h.{bid}.self_attention.dense", # falcon
  "h.{bid}.self_attention.dense", # bloom
- "model.layers.{bid}.self_attn.o_proj", # llama-hf
+ "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
  "layers.{bid}.attention.wo", # llama-pth
  "encoder.layer.{bid}.attention.output.dense", # bert
  "transformer.h.{bid}.attn.out_proj", # gpt-j
@@ -190,6 +205,7 @@ class TensorNameMap:
  "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
  "encoder.layers.{bid}.self_attention.dense", # chatglm
  "transformer.layers.{bid}.attn.out_proj", # openelm
+ "transformer.h.{bid}.attn.attention.out_proj", # exaone
  ),

  # Attention output norm
@@ -201,7 +217,7 @@ class TensorNameMap:
  ),

  MODEL_TENSOR.ATTN_POST_NORM: (
- "model.layers.{bid}.post_attention_layernorm", # gemma2
+ "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
  ),

  # Rotary embeddings
@@ -215,10 +231,10 @@ class TensorNameMap:
  # Feed-forward norm
  MODEL_TENSOR.FFN_NORM: (
  "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
- "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
+ "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
  "h.{bid}.post_attention_layernorm", # bloom
  "transformer.blocks.{bid}.norm_2", # mpt
- "model.layers.{bid}.post_attention_layernorm", # llama-hf
+ "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe
  "layers.{bid}.ffn_norm", # llama-pth
  "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
  "model.layers.{bid}.ln2", # yi
@@ -236,15 +252,16 @@ class TensorNameMap:

  # Post feed-forward norm
  MODEL_TENSOR.FFN_POST_NORM: (
- "model.layers.{bid}.post_feedforward_layernorm", # gemma2
+ "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
  ),

  MODEL_TENSOR.FFN_GATE_INP: (
- "layers.{bid}.feed_forward.gate", # mixtral
- "model.layers.{bid}.block_sparse_moe.gate", # mixtral
- "model.layers.{bid}.mlp.gate", # qwen2moe
- "transformer.decoder_layer.{bid}.router", # Grok
- "transformer.blocks.{bid}.ffn.router.layer", # dbrx
+ "layers.{bid}.feed_forward.gate", # mixtral
+ "model.layers.{bid}.block_sparse_moe.gate", # mixtral
+ "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
+ "transformer.decoder_layer.{bid}.router", # Grok
+ "transformer.blocks.{bid}.ffn.router.layer", # dbrx
+ "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
  ),

  MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -258,7 +275,7 @@ class TensorNameMap:
  "transformer.blocks.{bid}.ffn.up_proj", # mpt
  "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
  "h.{bid}.mlp.dense_h_to_4h", # bloom
- "model.layers.{bid}.mlp.up_proj", # llama-hf refact
+ "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
  "layers.{bid}.feed_forward.w3", # llama-pth
  "encoder.layer.{bid}.intermediate.dense", # bert
  "transformer.h.{bid}.mlp.fc_in", # gpt-j
@@ -277,13 +294,14 @@ class TensorNameMap:
  "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
  "model.layers.{bid}.residual_mlp.w3", # arctic
  "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
+ "transformer.h.{bid}.mlp.c_fc_1", # exaone
  ),

  MODEL_TENSOR.FFN_UP_EXP: (
  "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
  "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
  "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
- "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
+ "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
  ),

  MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -298,7 +316,7 @@ class TensorNameMap:

  # Feed-forward gate
  MODEL_TENSOR.FFN_GATE: (
- "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
+ "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
  "layers.{bid}.feed_forward.w1", # llama-pth
  "transformer.h.{bid}.mlp.w2", # qwen
  "transformer.h.{bid}.mlp.c_fc2", # jais
@@ -308,13 +326,14 @@ class TensorNameMap:
  "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
  "transformer.h.{bid}.mlp.linear_1", # refact
  "model.layers.{bid}.residual_mlp.w1", # arctic
+ "transformer.h.{bid}.mlp.c_fc_0", # exaone
  ),

  MODEL_TENSOR.FFN_GATE_EXP: (
  "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
  "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
  "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
- "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
+ "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
  ),

  MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -329,7 +348,7 @@ class TensorNameMap:
  "transformer.blocks.{bid}.ffn.down_proj", # mpt
  "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
  "h.{bid}.mlp.dense_4h_to_h", # bloom
- "model.layers.{bid}.mlp.down_proj", # llama-hf
+ "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
  "layers.{bid}.feed_forward.w2", # llama-pth
  "encoder.layer.{bid}.output.dense", # bert
  "transformer.h.{bid}.mlp.fc_out", # gpt-j
@@ -347,13 +366,15 @@ class TensorNameMap:
  "model.layers.{bid}.residual_mlp.w2", # arctic
  "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
  "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
+ "model.layers.h.{bid}.mlp.c_proj", # exaone
  ),

  MODEL_TENSOR.FFN_DOWN_EXP: (
- "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
- "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
- "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
- "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
+ "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
+ "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
+ "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
+ "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
+ "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
  ),

  MODEL_TENSOR.FFN_DOWN_SHEXP: (
@@ -364,7 +385,7 @@ class TensorNameMap:
  MODEL_TENSOR.ATTN_Q_NORM: (
  "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
  "model.layers.{bid}.self_attn.q_layernorm", # persimmon
- "model.layers.{bid}.self_attn.q_norm", # cohere
+ "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
  "transformer.blocks.{bid}.attn.q_ln", # sea-lion
  "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
  "transformer.layers.{bid}.attn.q_norm", # openelm
@@ -373,7 +394,7 @@ class TensorNameMap:
  MODEL_TENSOR.ATTN_K_NORM: (
  "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
  "model.layers.{bid}.self_attn.k_layernorm", # persimmon
- "model.layers.{bid}.self_attn.k_norm", # cohere
+ "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
  "transformer.blocks.{bid}.attn.k_ln", # sea-lion
  "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
  "transformer.layers.{bid}.attn.k_norm", # openelm
@@ -426,6 +447,98 @@ class TensorNameMap:
  "backbone.layers.{bid}.mixer.out_proj",
  ),

+ MODEL_TENSOR.TIME_MIX_W1: (
+ "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_W2: (
+ "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_LERP_X: (
+ "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_LERP_K: (
+ "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_LERP_V: (
+ "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_LERP_R: (
+ "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_LERP_G: (
+ "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_LERP_W: (
+ "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_FIRST: (
+ "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_DECAY: (
+ "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_DECAY_W1: (
+ "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_DECAY_W2: (
+ "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6
+ ),
+
+ MODEL_TENSOR.TIME_MIX_KEY: (
+ "rwkv.blocks.{bid}.attention.key", # rwkv
+ ),
+
+ MODEL_TENSOR.TIME_MIX_VALUE: (
+ "rwkv.blocks.{bid}.attention.value", # rwkv
+ ),
+
+ MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
+ "rwkv.blocks.{bid}.attention.receptance", # rwkv
+ ),
+
+ MODEL_TENSOR.TIME_MIX_GATE: (
+ "rwkv.blocks.{bid}.attention.gate", # rwkv
+ ),
+
+ MODEL_TENSOR.TIME_MIX_LN: (
+ "rwkv.blocks.{bid}.attention.ln_x", # rwkv
+ ),
+
+ MODEL_TENSOR.TIME_MIX_OUTPUT: (
+ "rwkv.blocks.{bid}.attention.output", # rwkv
+ ),
+
+ MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
+ "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
+ ),
+
+ MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
+ "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
+ ),
+
+ MODEL_TENSOR.CHANNEL_MIX_KEY: (
+ "rwkv.blocks.{bid}.feed_forward.key", # rwkv
+ ),
+
+ MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
+ "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
+ ),
+
+ MODEL_TENSOR.CHANNEL_MIX_VALUE: (
+ "rwkv.blocks.{bid}.feed_forward.value", # rwkv
+ ),
+
  MODEL_TENSOR.ATTN_Q_A: (
  "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
  ),
@@ -571,6 +684,15 @@ class TensorNameMap:
  MODEL_TENSOR.ENC_OUTPUT_NORM: (
  "encoder.final_layer_norm", # t5
  ),
+
+ MODEL_TENSOR.CLS: (
+ "classifier", # jina
+ "classifier.dense", # roberta
+ ),
+
+ MODEL_TENSOR.CLS_OUT: (
+ "classifier.out_proj", # roberta
+ ),
  }

  # architecture-specific block mappings
@@ -646,4 +768,4 @@ class TensorNameMap:


  def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
- return TensorNameMap(arch, n_blocks)
+ return TensorNameMap(arch, n_blocks)
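The practical effect of the expanded TensorNameMap is that checkpoint tensor names from additional architectures (nemotron, olmoe, olmo2, exaone, granitemoe, rwkv v6, and the roberta classifier heads) now resolve to canonical GGUF tensor types during conversion. The sketch below shows how such a lookup behaves; it assumes the gguf-py package is importable as gguf and that the vendored copy in this wheel exposes the same get_name/try_suffixes helpers as upstream gguf-py, so treat it as illustrative rather than a guaranteed interface.

# Sketch: resolving Hugging Face tensor names to GGUF names via gguf-py's
# TensorNameMap. Assumes `gguf` (gguf-py) is on the import path; inside this
# wheel it ships under bigdl/cpp/gguf-py, so the path may need adjusting.
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

# Build the mapping for a LLaMA-style architecture with 32 transformer blocks.
name_map = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks=32)

for hf_name in (
    "model.embed_tokens.weight",               # token embeddings
    "model.layers.0.self_attn.q_proj.weight",  # per-block attention query
    "lm_head.weight",                          # output head
):
    # try_suffixes lets the lookup strip ".weight"/".bias" before matching.
    gguf_name = name_map.get_name(hf_name, try_suffixes=(".weight", ".bias"))
    print(f"{hf_name} -> {gguf_name}")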
bigdl/cpp/gguf-py/gguf/utility.py

@@ -66,4 +66,4 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st

  kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""

- return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
+ return f"{name}{parameters}{finetune}{version}{encoding}{kind}"