micronnx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
micronnx-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 <axiol>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,23 @@
1
+ Metadata-Version: 2.4
2
+ Name: micronnx
3
+ Version: 0.1.0
4
+ Summary: micronnx — runtime de inferencia puro NumPy sin dependencias pesadas.
5
+ License: MIT
6
+ Project-URL: Repository, https://github.com/tuusuario/micronnx
7
+ Keywords: llm,inference,numpy,gguf,safetensors,hdf5,mobilenet,activation-extraction,model-fusion,quantization
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: numpy>=1.24
22
+ Requires-Dist: pyfive
23
+ Dynamic: license-file
@@ -0,0 +1,83 @@
1
+ """
2
+ micronnx — runtime de inferencia puro NumPy.
3
+
4
+ import micronnx as nx
5
+ nx.ModelRunner(...)
6
+ nx.GGUFLoader(...)
7
+ nx.export_to_npz(...)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+
13
+ # ─────────────────────────────────────────
14
+ # LAZY LOADER
15
+ # ─────────────────────────────────────────
16
+
17
# Maps every lazily exported public name to the dotted path of the module
# that defines it. The module-level __getattr__ looks names up here, and
# __all__ is built from these keys.
_PUBLIC = {
    # Loaders
    "SafeTensorsLoader": "micronnx.loaders.safetensors",
    "GGUFLoader": "micronnx.loaders.gguf",
    "H5Loader": "micronnx.loaders.h5",
    "NpyLoader": "micronnx.loaders.npy",
    # Ops
    "softmax": "micronnx.executor.ops",
    "sigmoid": "micronnx.executor.ops",
    "relu": "micronnx.executor.ops",
    "gelu": "micronnx.executor.ops",
    "silu": "micronnx.executor.ops",
    "layernorm": "micronnx.executor.ops",
    "rmsnorm": "micronnx.executor.ops",
    "batchnorm": "micronnx.executor.ops",
    "linear": "micronnx.executor.ops",
    "embedding": "micronnx.executor.ops",
    "swiglu": "micronnx.executor.ops",
    "swiglu_fused": "micronnx.executor.ops",
    "geglu": "micronnx.executor.ops",
    "ffn_gelu": "micronnx.executor.ops",
    "attention": "micronnx.executor.ops",
    "rope": "micronnx.executor.ops",
    "conv2d": "micronnx.executor.ops",
    "depthwise_conv2d": "micronnx.executor.ops",
    "global_avg_pool": "micronnx.executor.ops",
    "max_pool2d": "micronnx.executor.ops",
    # Runtime
    "ModelRunner": "micronnx.executor.forward",
    "RoPECache": "micronnx.executor.forward",
    "KVCache": "micronnx.executor.forward",
    "WeightCache": "micronnx.executor.forward",
    "SCHEMAS": "micronnx.executor.forward",
    "detect_schema_gguf": "micronnx.executor.forward",
    "detect_schema_hf": "micronnx.executor.forward",
    "detect_schema_safetensors": "micronnx.executor.forward",
    # Extractors and CNN
    "ActivationExtractor": "micronnx.executor.extractor",
    "CNNRunner": "micronnx.executor.imag",
    "CNNActivationExtractor": "micronnx.executor.imag",
    "CanonicalLoader": "micronnx.executor.imag",
    # Canonical mapping
    "CanonicalTensor": "micronnx.executor.canonical",
    "map_tensors": "micronnx.executor.canonical",
    "map_tensors_full": "micronnx.executor.canonical",
    "find_unmapped": "micronnx.executor.canonical",
    "resolve_tied_embeddings": "micronnx.executor.canonical",
    "detect_format": "micronnx.executor.canonical",
    # Exporter
    "export_to_npz": "micronnx.executor.exporter",
    "load_index": "micronnx.executor.exporter",
    "inspect_npz": "micronnx.executor.exporter",
}
70
+
71
+
72
def __getattr__(name: str):
    """Resolve a lazily exported attribute on first access.

    Names listed in ``_PUBLIC`` are imported from their defining module on
    demand. The resolved object is then stored in this module's globals so
    later lookups find it directly instead of calling this hook again.

    Args:
        name: Attribute requested on the package.

    Returns:
        The object called ``name`` in the module registered for it.

    Raises:
        AttributeError: If ``name`` is not a key of ``_PUBLIC``.
    """
    if name not in _PUBLIC:
        raise AttributeError(f"micronnx no tiene '{name}'")

    from importlib import import_module

    resolved = getattr(import_module(_PUBLIC[name]), name)
    # Cache on the module so the next access is a plain global lookup.
    globals()[name] = resolved
    return resolved
81
+
82
+
83
# The public API is exactly the set of lazily resolvable names.
__all__ = [public_name for public_name in _PUBLIC]
@@ -0,0 +1,3 @@
1
+ from . import forward, ops, exporter
2
+
3
+ __all__ = ["forward", "ops", "exporter"]
@@ -0,0 +1,456 @@
1
+ """
2
+ canonical.py — mapeo de nombres de tensores de distintos formatos/arquitecturas
3
+ a un esquema canónico común.
4
+
5
+ Cobertura añadida respecto a la versión anterior:
6
+ - Familias nuevas: GPT-2/GPT-NeoX, Falcon, BLOOM, MPT, Phi-3
7
+ (qkv_proj/gate_up_proj fusionados), Baichuan2 (W_pack),
8
+ Gemma2/3 (pre/post_feedforward_layernorm).
9
+ - Biases de atención y MLP, q_norm/k_norm (Qwen3, OLMo2).
10
+ - MoE: Mixtral (block_sparse_moe.experts.{w1,w2,w3}), Qwen2/3-MoE y
11
+ DeepSeek-MoE (mlp.experts + shared_expert(s) + router), y su
12
+ equivalente GGUF (ffn_gate_inp / *_exps / *_shexp / exp_probs_b).
13
+ - H5: reglas genéricas de Keras (Conv2D/BatchNorm/Dense) además de las
14
+ específicas de MobileNet.
15
+ - CanonicalTensor.optional: marca tensores "secundarios" (bias, normas
16
+ extra, buffers) cuya ausencia en un checkpoint es normal y esperada.
17
+ - find_unmapped() / map_tensors_full(): detectan tensores del checkpoint
18
+ que ninguna regla pudo mapear — útil para no perder pesos en silencio
19
+ antes de fusionar.
20
+ - resolve_tied_embeddings(): genera 'head.weight' a partir de
21
+ 'embed.weight' cuando el modelo usa pesos atados (tied embeddings,
22
+ común en modelos chicos tipo SmolLM/GPT-2).
23
+ - detect_format(): ahora vota sobre toda la lista de tensores (hasta
24
+ sample_size) en vez de mirar solo los primeros 5.
25
+
26
+ NOTA — Gemma2/3 en GGUF:
27
+ blk.N.ffn_norm se usa en llama.cpp como "norma antes de la FFN", pero en
28
+ Gemma2 esa norma corresponde a pre_feedforward_layernorm, NO a
29
+ post_attention_layernorm (como sí ocurre en Llama). Los nombres GGUF
30
+ exactos de las dos normas "sandwich" extra de Gemma2
31
+ (post_attention_layernorm / post_feedforward_layernorm) no están
32
+ confirmados aquí. Si trabajas con un GGUF de Gemma2/3, verifica con
33
+ gguf_dump.py antes de asumir ese mapeo — es exactamente el tipo de
34
+ desalineación semántica que afecta la fusión entre arquitecturas.
35
+ """
36
+
37
+ import re
38
+ from dataclasses import dataclass
39
+
40
+
41
@dataclass
class CanonicalTensor:
    """Record linking one checkpoint tensor name to its canonical name."""

    # Tensor name exactly as it appears in the source checkpoint.
    original_name: str
    # Name in the shared canonical scheme (the rule template with the
    # regex captures filled in).
    canonical_name: str
    # First regex capture converted to int, or None when the matching rule
    # has no capture group.
    layer_idx: int | None
    # Role label carried by the rule that matched.
    role: str
    optional: bool = False  # bias / extra norm / buffer: its absence is normal
48
+
49
+
50
def _r(pattern: str, role: str, template: str) -> tuple:
    """Build a rule for a "primary" weight.

    The result is the 4-tuple ``(pattern, role, template, optional)`` with
    ``optional`` fixed to ``False``. If a tensor exists but matches no
    primary rule, a rule is probably missing.
    """
    is_optional = False
    return pattern, role, template, is_optional
54
+
55
+
56
def _opt(pattern: str, role: str, template: str) -> tuple:
    """Build a rule for a "secondary" tensor.

    Covers biases, extra norms, buffers, shared-expert routers and similar
    tensors whose absence from a checkpoint is normal. The result is the
    4-tuple ``(pattern, role, template, optional)`` with ``optional`` fixed
    to ``True``.
    """
    is_optional = True
    return pattern, role, template, is_optional
60
+
61
+
62
+ # ════════════════════════════════════════════════════════════════
63
+ # SAFETENSORS
64
+ # ════════════════════════════════════════════════════════════════
65
+
66
+ # ── Familia LLaMA: Llama/Mistral/Yi/Qwen2/Qwen2.5/Qwen3-denso/
67
+ # Gemma/Gemma2/Gemma3/StableLM2/InternLM2/etc. (prefijo model.layers.N) ──
68
# Each entry is a (pattern, role, template, optional) tuple built by _r/_opt.
# Capture groups of the pattern fill the "{}" slots of the template.
_SAFETENSORS_LLAMA = [
    _r(r"model\.embed_tokens\.weight", "embed", "embed.weight"),
    _r(r"lm_head\.weight", "head", "head.weight"),
    _r(r"model\.norm\.weight", "norm_final", "norm_final.weight"),
    _opt(r"model\.norm\.bias", "norm_final_bias", "norm_final.bias"),

    # Attention projections
    _r(r"model\.layers\.(\d+)\.self_attn\.q_proj\.weight", "attn_q", "layers.{}.attn.q.weight"),
    _r(r"model\.layers\.(\d+)\.self_attn\.k_proj\.weight", "attn_k", "layers.{}.attn.k.weight"),
    _r(r"model\.layers\.(\d+)\.self_attn\.v_proj\.weight", "attn_v", "layers.{}.attn.v.weight"),
    _r(r"model\.layers\.(\d+)\.self_attn\.o_proj\.weight", "attn_o", "layers.{}.attn.o.weight"),

    # Attention biases (Qwen2/2.5, StableLM2, etc.)
    _opt(r"model\.layers\.(\d+)\.self_attn\.q_proj\.bias", "attn_q_bias","layers.{}.attn.q.bias"),
    _opt(r"model\.layers\.(\d+)\.self_attn\.k_proj\.bias", "attn_k_bias","layers.{}.attn.k.bias"),
    _opt(r"model\.layers\.(\d+)\.self_attn\.v_proj\.bias", "attn_v_bias","layers.{}.attn.v.bias"),
    _opt(r"model\.layers\.(\d+)\.self_attn\.o_proj\.bias", "attn_o_bias","layers.{}.attn.o.bias"),

    # Q/K norm (Qwen3, OLMo2)
    _opt(r"model\.layers\.(\d+)\.self_attn\.q_norm\.weight", "attn_q_norm","layers.{}.attn.q_norm.weight"),
    _opt(r"model\.layers\.(\d+)\.self_attn\.k_norm\.weight", "attn_k_norm","layers.{}.attn.k_norm.weight"),

    # Non-trainable buffer, sometimes present in old/custom checkpoints
    _opt(r"model\.layers\.(\d+)\.self_attn\.rotary_emb\.inv_freq", "rope_buffer","layers.{}.attn.rope_inv_freq"),

    # Phi-3/3.5: fused qkv and gate_up
    _r(r"model\.layers\.(\d+)\.self_attn\.qkv_proj\.weight", "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
    _r(r"model\.layers\.(\d+)\.mlp\.gate_up_proj\.weight", "mlp_gate_up_fused", "layers.{}.mlp.gate_up.weight"),

    # Baichuan2: q/k/v fused into a single W_pack
    _r(r"model\.layers\.(\d+)\.self_attn\.W_pack\.weight", "attn_qkv_fused", "layers.{}.attn.qkv.weight"),

    # Dense MLP (SwiGLU/GeGLU)
    _r(r"model\.layers\.(\d+)\.mlp\.gate_proj\.weight", "mlp_gate", "layers.{}.mlp.gate.weight"),
    _r(r"model\.layers\.(\d+)\.mlp\.up_proj\.weight", "mlp_up", "layers.{}.mlp.up.weight"),
    _r(r"model\.layers\.(\d+)\.mlp\.down_proj\.weight", "mlp_down", "layers.{}.mlp.down.weight"),
    _opt(r"model\.layers\.(\d+)\.mlp\.gate_proj\.bias", "mlp_gate_bias","layers.{}.mlp.gate.bias"),
    _opt(r"model\.layers\.(\d+)\.mlp\.up_proj\.bias", "mlp_up_bias","layers.{}.mlp.up.bias"),
    _opt(r"model\.layers\.(\d+)\.mlp\.down_proj\.bias", "mlp_down_bias","layers.{}.mlp.down.bias"),

    # Pre/post-attention normalizations
    _r(r"model\.layers\.(\d+)\.input_layernorm\.weight", "norm_pre", "layers.{}.norm_pre.weight"),
    _opt(r"model\.layers\.(\d+)\.input_layernorm\.bias", "norm_pre_bias","layers.{}.norm_pre.bias"),
    _r(r"model\.layers\.(\d+)\.post_attention_layernorm\.weight", "norm_post", "layers.{}.norm_post.weight"),
    _opt(r"model\.layers\.(\d+)\.post_attention_layernorm\.bias", "norm_post_bias","layers.{}.norm_post.bias"),

    # Gemma2/3: two extra norms around the FFN (no equivalent in base Llama)
    _opt(r"model\.layers\.(\d+)\.pre_feedforward_layernorm\.weight", "norm_pre_ffn", "layers.{}.norm_pre_ffn.weight"),
    _opt(r"model\.layers\.(\d+)\.post_feedforward_layernorm\.weight","norm_post_ffn", "layers.{}.norm_post_ffn.weight"),
]
118
+
119
+
120
+ # ── MoE (HF safetensors): Mixtral, Qwen2/3-MoE, DeepSeek-MoE ──
121
# Per-expert rules capture two numbers (layer, expert); both fill the
# template, in that order.
_SAFETENSORS_MOE = [
    # Mixtral: block_sparse_moe.gate + experts.{e}.{w1,w2,w3}
    # w1=gate_proj (SiLU), w3=up_proj, w2=down_proj
    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.gate\.weight", "moe_router", "layers.{}.mlp.moe_router.weight"),
    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.w1\.weight","moe_expert_gate", "layers.{}.mlp.moe_experts.{}.gate.weight"),
    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.w3\.weight","moe_expert_up", "layers.{}.mlp.moe_experts.{}.up.weight"),
    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.w2\.weight","moe_expert_down", "layers.{}.mlp.moe_experts.{}.down.weight"),

    # Qwen2/3-MoE, DeepSeek-MoE: mlp.gate (router) + mlp.experts.{e}.*
    _r(r"model\.layers\.(\d+)\.mlp\.gate\.weight", "moe_router", "layers.{}.mlp.moe_router.weight"),
    _r(r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.gate_proj\.weight", "moe_expert_gate", "layers.{}.mlp.moe_experts.{}.gate.weight"),
    _r(r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.up_proj\.weight", "moe_expert_up", "layers.{}.mlp.moe_experts.{}.up.weight"),
    _r(r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.down_proj\.weight", "moe_expert_down", "layers.{}.mlp.moe_experts.{}.down.weight"),

    # Shared expert — Qwen2-MoE uses the singular "shared_expert"
    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert\.gate_proj\.weight","moe_shared_gate", "layers.{}.mlp.moe_shared_gate.weight"),
    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert\.up_proj\.weight", "moe_shared_up", "layers.{}.mlp.moe_shared_up.weight"),
    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert\.down_proj\.weight","moe_shared_down", "layers.{}.mlp.moe_shared_down.weight"),
    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert_gate\.weight", "moe_shared_router","layers.{}.mlp.moe_shared_router.weight"),

    # DeepSeek-MoE uses the plural "shared_experts"
    _opt(r"model\.layers\.(\d+)\.mlp\.shared_experts\.gate_proj\.weight","moe_shared_gate", "layers.{}.mlp.moe_shared_gate.weight"),
    _opt(r"model\.layers\.(\d+)\.mlp\.shared_experts\.up_proj\.weight", "moe_shared_up", "layers.{}.mlp.moe_shared_up.weight"),
    _opt(r"model\.layers\.(\d+)\.mlp\.shared_experts\.down_proj\.weight","moe_shared_down", "layers.{}.mlp.moe_shared_down.weight"),
]
146
+
147
+
148
+ # ── GPT-2 (prefijo transformer.h.N, atención fusionada en c_attn) ──
149
# Rules for GPT-2 style names (transformer.h.N prefix, fused c_attn).
_SAFETENSORS_GPT2 = [
    _r(r"transformer\.wte\.weight", "embed", "embed.weight"),
    _opt(r"transformer\.wpe\.weight", "pos_embed", "pos_embed.weight"),
    _r(r"transformer\.ln_f\.weight", "norm_final", "norm_final.weight"),
    _opt(r"transformer\.ln_f\.bias", "norm_final_bias", "norm_final.bias"),

    _r(r"transformer\.h\.(\d+)\.attn\.c_attn\.weight", "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
    _opt(r"transformer\.h\.(\d+)\.attn\.c_attn\.bias", "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),
    _r(r"transformer\.h\.(\d+)\.attn\.c_proj\.weight", "attn_o", "layers.{}.attn.o.weight"),
    _opt(r"transformer\.h\.(\d+)\.attn\.c_proj\.bias", "attn_o_bias","layers.{}.attn.o.bias"),

    # GPT-2 MLP: c_fc = up-proj, c_proj = down-proj, GELU activation (no gate)
    _r(r"transformer\.h\.(\d+)\.mlp\.c_fc\.weight", "mlp_up", "layers.{}.mlp.up.weight"),
    _opt(r"transformer\.h\.(\d+)\.mlp\.c_fc\.bias", "mlp_up_bias","layers.{}.mlp.up.bias"),
    _r(r"transformer\.h\.(\d+)\.mlp\.c_proj\.weight", "mlp_down", "layers.{}.mlp.down.weight"),
    _opt(r"transformer\.h\.(\d+)\.mlp\.c_proj\.bias", "mlp_down_bias","layers.{}.mlp.down.bias"),

    _r(r"transformer\.h\.(\d+)\.ln_1\.weight", "norm_pre", "layers.{}.norm_pre.weight"),
    _opt(r"transformer\.h\.(\d+)\.ln_1\.bias", "norm_pre_bias","layers.{}.norm_pre.bias"),
    _r(r"transformer\.h\.(\d+)\.ln_2\.weight", "norm_post", "layers.{}.norm_post.weight"),
    _opt(r"transformer\.h\.(\d+)\.ln_2\.bias", "norm_post_bias","layers.{}.norm_post.bias"),
]
171
+
172
+
173
+ # ── GPT-NeoX / Pythia (prefijo gpt_neox.layers.N) ──
174
# Rules for GPT-NeoX / Pythia style names (gpt_neox.layers.N prefix).
_SAFETENSORS_GPT_NEOX = [
    _r(r"gpt_neox\.embed_in\.weight", "embed", "embed.weight"),
    _r(r"embed_out\.weight", "head", "head.weight"),
    _r(r"gpt_neox\.final_layer_norm\.weight", "norm_final", "norm_final.weight"),
    _opt(r"gpt_neox\.final_layer_norm\.bias", "norm_final_bias", "norm_final.bias"),

    # Fused QKV plus the attention output projection
    _r(r"gpt_neox\.layers\.(\d+)\.attention\.query_key_value\.weight", "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
    _opt(r"gpt_neox\.layers\.(\d+)\.attention\.query_key_value\.bias", "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),
    _r(r"gpt_neox\.layers\.(\d+)\.attention\.dense\.weight", "attn_o", "layers.{}.attn.o.weight"),
    _opt(r"gpt_neox\.layers\.(\d+)\.attention\.dense\.bias", "attn_o_bias","layers.{}.attn.o.bias"),

    # MLP: dense_h_to_4h is mapped as the up projection, dense_4h_to_h as down
    _r(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_h_to_4h\.weight", "mlp_up", "layers.{}.mlp.up.weight"),
    _opt(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_h_to_4h\.bias", "mlp_up_bias","layers.{}.mlp.up.bias"),
    _r(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_4h_to_h\.weight", "mlp_down", "layers.{}.mlp.down.weight"),
    _opt(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_4h_to_h\.bias", "mlp_down_bias","layers.{}.mlp.down.bias"),

    # Per-layer normalizations
    _r(r"gpt_neox\.layers\.(\d+)\.input_layernorm\.weight", "norm_pre", "layers.{}.norm_pre.weight"),
    _opt(r"gpt_neox\.layers\.(\d+)\.input_layernorm\.bias", "norm_pre_bias","layers.{}.norm_pre.bias"),
    _r(r"gpt_neox\.layers\.(\d+)\.post_attention_layernorm\.weight", "norm_post", "layers.{}.norm_post.weight"),
    _opt(r"gpt_neox\.layers\.(\d+)\.post_attention_layernorm\.bias", "norm_post_bias","layers.{}.norm_post.bias"),
]
195
+
196
+
197
+ # ── Falcon / BLOOM (prefijo transformer.h.N, qkv fusionada) ──
198
# Rules for Falcon / BLOOM style names (transformer.h.N prefix, fused qkv).
_SAFETENSORS_FALCON_BLOOM = [
    _r(r"transformer\.word_embeddings\.weight", "embed", "embed.weight"),

    _r(r"transformer\.h\.(\d+)\.self_attention\.query_key_value\.weight", "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
    _opt(r"transformer\.h\.(\d+)\.self_attention\.query_key_value\.bias", "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),
    _r(r"transformer\.h\.(\d+)\.self_attention\.dense\.weight", "attn_o", "layers.{}.attn.o.weight"),
    _opt(r"transformer\.h\.(\d+)\.self_attention\.dense\.bias", "attn_o_bias","layers.{}.attn.o.bias"),

    _r(r"transformer\.h\.(\d+)\.mlp\.dense_h_to_4h\.weight", "mlp_up", "layers.{}.mlp.up.weight"),
    _opt(r"transformer\.h\.(\d+)\.mlp\.dense_h_to_4h\.bias", "mlp_up_bias","layers.{}.mlp.up.bias"),
    _r(r"transformer\.h\.(\d+)\.mlp\.dense_4h_to_h\.weight", "mlp_down", "layers.{}.mlp.down.weight"),
    _opt(r"transformer\.h\.(\d+)\.mlp\.dense_4h_to_h\.bias", "mlp_down_bias","layers.{}.mlp.down.bias"),

    # BLOOM: standard pre/post norm (same names as Llama, already covered
    # above for model.layers — here with the transformer.h prefix)
    _r(r"transformer\.h\.(\d+)\.input_layernorm\.weight", "norm_pre", "layers.{}.norm_pre.weight"),
    _opt(r"transformer\.h\.(\d+)\.input_layernorm\.bias", "norm_pre_bias","layers.{}.norm_pre.bias"),
    _r(r"transformer\.h\.(\d+)\.post_attention_layernorm\.weight", "norm_post", "layers.{}.norm_post.weight"),
    _opt(r"transformer\.h\.(\d+)\.post_attention_layernorm\.bias", "norm_post_bias","layers.{}.norm_post.bias"),

    # Falcon-40B: parallel attention with two independent norms.
    # NOTE(review): ln_attn/ln_mlp share canonical names with
    # input_layernorm/post_attention_layernorm above, so a checkpoint holding
    # both would collide on the same canonical key — confirm that never occurs.
    _opt(r"transformer\.h\.(\d+)\.ln_attn\.weight", "norm_pre", "layers.{}.norm_pre.weight"),
    _opt(r"transformer\.h\.(\d+)\.ln_attn\.bias", "norm_pre_bias", "layers.{}.norm_pre.bias"),
    _opt(r"transformer\.h\.(\d+)\.ln_mlp\.weight", "norm_post_parallel", "layers.{}.norm_post.weight"),
    _opt(r"transformer\.h\.(\d+)\.ln_mlp\.bias", "norm_post_parallel_bias","layers.{}.norm_post.bias"),
]
224
+
225
+
226
+ # ── MPT (prefijo transformer.blocks.N, qkv fusionada en Wqkv) ──
227
# Rules for MPT style names (transformer.blocks.N prefix, fused Wqkv).
# This list has no embedding rule and no bias rules of its own.
_SAFETENSORS_MPT = [
    _r(r"transformer\.norm_f\.weight", "norm_final", "norm_final.weight"),

    _r(r"transformer\.blocks\.(\d+)\.attn\.Wqkv\.weight", "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
    _r(r"transformer\.blocks\.(\d+)\.attn\.out_proj\.weight", "attn_o", "layers.{}.attn.o.weight"),
    _r(r"transformer\.blocks\.(\d+)\.ffn\.up_proj\.weight", "mlp_up", "layers.{}.mlp.up.weight"),
    _r(r"transformer\.blocks\.(\d+)\.ffn\.down_proj\.weight", "mlp_down", "layers.{}.mlp.down.weight"),
    _r(r"transformer\.blocks\.(\d+)\.norm_1\.weight", "norm_pre", "layers.{}.norm_pre.weight"),
    _r(r"transformer\.blocks\.(\d+)\.norm_2\.weight", "norm_post", "layers.{}.norm_post.weight"),
]
237
+
238
+
239
# Every safetensors rule family merged into one new list. The order is kept
# because map_tensors stops at the first rule that matches a tensor name.
_RULES_SAFETENSORS = [
    *_SAFETENSORS_LLAMA,
    *_SAFETENSORS_MOE,
    *_SAFETENSORS_GPT2,
    *_SAFETENSORS_GPT_NEOX,
    *_SAFETENSORS_FALCON_BLOOM,
    *_SAFETENSORS_MPT,
]
247
+
248
+
249
+ # ════════════════════════════════════════════════════════════════
250
+ # GGUF
251
+ # ════════════════════════════════════════════════════════════════
252
+
253
# Rules for dense GGUF models (blk.N prefix). Each entry is a
# (pattern, role, template, optional) tuple built by _r/_opt.
_GGUF_BASE = [
    _r(r"token_embd\.weight", "embed", "embed.weight"),
    _opt(r"position_embd\.weight", "pos_embed", "pos_embed.weight"),
    _r(r"output\.weight", "head", "head.weight"),
    _r(r"output_norm\.weight", "norm_final", "norm_final.weight"),
    _opt(r"output_norm\.bias", "norm_final_bias", "norm_final.bias"),

    _r(r"blk\.(\d+)\.attn_q\.weight", "attn_q", "layers.{}.attn.q.weight"),
    _r(r"blk\.(\d+)\.attn_k\.weight", "attn_k", "layers.{}.attn.k.weight"),
    _r(r"blk\.(\d+)\.attn_v\.weight", "attn_v", "layers.{}.attn.v.weight"),
    _r(r"blk\.(\d+)\.attn_output\.weight", "attn_o", "layers.{}.attn.o.weight"),

    # Attention biases (Qwen and others)
    _opt(r"blk\.(\d+)\.attn_q\.bias", "attn_q_bias","layers.{}.attn.q.bias"),
    _opt(r"blk\.(\d+)\.attn_k\.bias", "attn_k_bias","layers.{}.attn.k.bias"),
    _opt(r"blk\.(\d+)\.attn_v\.bias", "attn_v_bias","layers.{}.attn.v.bias"),
    _opt(r"blk\.(\d+)\.attn_output\.bias", "attn_o_bias","layers.{}.attn.o.bias"),

    # Q/K norm (Qwen3)
    _opt(r"blk\.(\d+)\.attn_q_norm\.weight", "attn_q_norm","layers.{}.attn.q_norm.weight"),
    _opt(r"blk\.(\d+)\.attn_k_norm\.weight", "attn_k_norm","layers.{}.attn.k_norm.weight"),

    # Fused QKV (GPT-2/Falcon/MPT/Phi-3/Baichuan models exported to GGUF)
    _r(r"blk\.(\d+)\.attn_qkv\.weight", "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
    _opt(r"blk\.(\d+)\.attn_qkv\.bias", "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),

    # Dense MLP
    _r(r"blk\.(\d+)\.ffn_gate\.weight", "mlp_gate", "layers.{}.mlp.gate.weight"),
    _r(r"blk\.(\d+)\.ffn_up\.weight", "mlp_up", "layers.{}.mlp.up.weight"),
    _r(r"blk\.(\d+)\.ffn_down\.weight", "mlp_down", "layers.{}.mlp.down.weight"),
    _opt(r"blk\.(\d+)\.ffn_gate\.bias", "mlp_gate_bias","layers.{}.mlp.gate.bias"),
    _opt(r"blk\.(\d+)\.ffn_up\.bias", "mlp_up_bias","layers.{}.mlp.up.bias"),
    _opt(r"blk\.(\d+)\.ffn_down\.bias", "mlp_down_bias","layers.{}.mlp.down.bias"),

    # Normalizations
    _r(r"blk\.(\d+)\.attn_norm\.weight", "norm_pre", "layers.{}.norm_pre.weight"),
    _opt(r"blk\.(\d+)\.attn_norm\.bias", "norm_pre_bias","layers.{}.norm_pre.bias"),
    _r(r"blk\.(\d+)\.ffn_norm\.weight", "norm_post", "layers.{}.norm_post.weight"),
    _opt(r"blk\.(\d+)\.ffn_norm\.bias", "norm_post_bias","layers.{}.norm_post.bias"),

    # Falcon-40B: second norm for parallel attention
    _opt(r"blk\.(\d+)\.attn_norm_2\.weight", "norm_post_parallel", "layers.{}.norm_post.weight"),
    _opt(r"blk\.(\d+)\.attn_norm_2\.bias", "norm_post_parallel_bias", "layers.{}.norm_post.bias"),
]
297
+
298
+
299
+ # ── MoE en GGUF: router, expertos enrutados (3D, stackeados) y expertos compartidos ──
300
# MoE in GGUF: router, routed experts (3D, stacked) and shared experts.
# Unlike the safetensors MoE rules, these capture only the layer number.
_GGUF_MOE = [
    # Router weight and optional router bias
    _r(r"blk\.(\d+)\.ffn_gate_inp\.weight", "moe_router", "layers.{}.mlp.moe_router.weight"),
    _opt(r"blk\.(\d+)\.exp_probs_b\.bias", "moe_router_bias", "layers.{}.mlp.moe_router.bias"),

    # Routed experts
    _r(r"blk\.(\d+)\.ffn_gate_exps\.weight", "moe_gate_exps", "layers.{}.mlp.moe_gate_exps.weight"),
    _r(r"blk\.(\d+)\.ffn_up_exps\.weight", "moe_up_exps", "layers.{}.mlp.moe_up_exps.weight"),
    _r(r"blk\.(\d+)\.ffn_down_exps\.weight", "moe_down_exps", "layers.{}.mlp.moe_down_exps.weight"),

    # Shared experts
    _opt(r"blk\.(\d+)\.ffn_gate_shexp\.weight","moe_shared_gate", "layers.{}.mlp.moe_shared_gate.weight"),
    _opt(r"blk\.(\d+)\.ffn_up_shexp\.weight", "moe_shared_up", "layers.{}.mlp.moe_shared_up.weight"),
    _opt(r"blk\.(\d+)\.ffn_down_shexp\.weight","moe_shared_down", "layers.{}.mlp.moe_shared_down.weight"),
]
312
+
313
+
314
# Full GGUF rule list: dense rules first, then the MoE rules.
_RULES_GGUF = [*_GGUF_BASE, *_GGUF_MOE]
315
+
316
+
317
+ # ════════════════════════════════════════════════════════════════
318
+ # H5 (Keras / TensorFlow)
319
+ # ════════════════════════════════════════════════════════════════
320
+
321
# MobileNet-specific H5 rules. The "\1" back-reference requires the layer
# number to repeat identically in both path segments.
_RULES_H5_CNN = [
    # MobileNet: doubled path conv1/conv1/kernel:0
    _r(r"conv1/conv1/kernel:0", "conv", "stem.conv.weight"),
    _r(r"conv1_bn/conv1_bn/gamma:0", "bn_gamma", "stem.bn.gamma"),
    _r(r"conv1_bn/conv1_bn/beta:0", "bn_beta", "stem.bn.beta"),
    _r(r"conv1_bn/conv1_bn/moving_mean:0", "bn_mean", "stem.bn.mean"),
    _r(r"conv1_bn/conv1_bn/moving_variance:0", "bn_var", "stem.bn.var"),
    _r(r"conv_dw_(\d+)/conv_dw_\1/depthwise_kernel:0", "conv_dw", "layers.{}.conv_dw.weight"),
    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/gamma:0", "bn_gamma", "layers.{}.bn_dw.gamma"),
    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/beta:0", "bn_beta", "layers.{}.bn_dw.beta"),
    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/moving_mean:0", "bn_mean", "layers.{}.bn_dw.mean"),
    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/moving_variance:0","bn_var", "layers.{}.bn_dw.var"),
    _r(r"conv_pw_(\d+)/conv_pw_\1/kernel:0", "conv_pw", "layers.{}.conv_pw.weight"),
    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/gamma:0", "bn_gamma", "layers.{}.bn_pw.gamma"),
    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/beta:0", "bn_beta", "layers.{}.bn_pw.beta"),
    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/moving_mean:0", "bn_mean", "layers.{}.bn_pw.mean"),
    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/moving_variance:0","bn_var", "layers.{}.bn_pw.var"),
    _r(r"conv_preds/conv_preds/kernel:0", "head_conv", "head.conv.weight"),
    _r(r"conv_preds/conv_preds/bias:0", "head_bias", "head.conv.bias"),
]
341
+
342
+ # Reglas genéricas de Keras — útiles para CNNs simples (no-MobileNet) que
343
+ # usan Conv2D/BatchNormalization/Dense con la nomenclatura por defecto.
344
# Generic Keras rules. The un-numbered layer name (conv2d,
# batch_normalization, dense) maps to a fixed stem/head slot; numbered
# layers (name_N) map to layers.N.
_RULES_H5_GENERIC = [
    _r(r"conv2d/conv2d/kernel:0", "conv", "stem.conv.weight"),
    _opt(r"conv2d/conv2d/bias:0", "conv_bias", "stem.conv.bias"),
    _r(r"conv2d_(\d+)/conv2d_\1/kernel:0", "conv", "layers.{}.conv.weight"),
    _opt(r"conv2d_(\d+)/conv2d_\1/bias:0", "conv_bias", "layers.{}.conv.bias"),

    _opt(r"batch_normalization/batch_normalization/gamma:0", "bn_gamma","stem.bn.gamma"),
    _opt(r"batch_normalization/batch_normalization/beta:0", "bn_beta", "stem.bn.beta"),
    _opt(r"batch_normalization/batch_normalization/moving_mean:0", "bn_mean", "stem.bn.mean"),
    _opt(r"batch_normalization/batch_normalization/moving_variance:0", "bn_var", "stem.bn.var"),
    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/gamma:0", "bn_gamma","layers.{}.bn.gamma"),
    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/beta:0", "bn_beta", "layers.{}.bn.beta"),
    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/moving_mean:0", "bn_mean", "layers.{}.bn.mean"),
    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/moving_variance:0", "bn_var", "layers.{}.bn.var"),

    _r(r"dense/dense/kernel:0", "head_fc", "head.fc.weight"),
    _opt(r"dense/dense/bias:0", "head_fc_bias", "head.fc.bias"),
    _r(r"dense_(\d+)/dense_\1/kernel:0", "fc", "layers.{}.fc.weight"),
    _opt(r"dense_(\d+)/dense_\1/bias:0", "fc_bias", "layers.{}.fc.bias"),
]
364
+
365
# Full H5 rule list: MobileNet-specific rules are tried before generic ones.
_RULES_H5 = [*_RULES_H5_CNN, *_RULES_H5_GENERIC]
366
+
367
+
368
# Rule list per checkpoint format. "npy" has an empty list, so nothing is
# ever mapped for that format.
_FORMAT_RULES = dict(
    safetensors=_RULES_SAFETENSORS,
    gguf=_RULES_GGUF,
    h5=_RULES_H5,
    npy=[],
)
374
+
375
+
376
def map_tensors(tensors: dict, fmt: str) -> dict[str, CanonicalTensor]:
    """Map checkpoint tensor names to canonical names using the rules of ``fmt``.

    Each name is tested against the format's rules in order, and the first
    rule whose pattern matches the whole name wins. Names that match no rule
    are left out of the result.

    Args:
        tensors: Mapping whose keys are the original tensor names. The
            values are not read.
        fmt: Format key into ``_FORMAT_RULES``. An unknown format has no
            rules and therefore yields an empty result.

    Returns:
        Dict from canonical name to its ``CanonicalTensor``. If two original
        names produce the same canonical name, the later one replaces the
        earlier one.
    """
    rule_set = _FORMAT_RULES.get(fmt, [])
    mapping: dict[str, CanonicalTensor] = {}

    for source_name in tensors:
        attempts = (
            (re.fullmatch(regex, source_name), kind, tmpl, is_optional)
            for regex, kind, tmpl, is_optional in rule_set
        )
        # Lazy generator: rules after the first hit are never evaluated.
        winner = next((attempt for attempt in attempts if attempt[0]), None)
        if winner is None:
            continue

        match, kind, tmpl, is_optional = winner
        captured = match.groups()
        # The first capture is the layer index; rules without captures
        # describe model-level tensors.
        layer = int(captured[0]) if captured else None
        target = tmpl.format(*captured)
        mapping[target] = CanonicalTensor(
            original_name=source_name,
            canonical_name=target,
            layer_idx=layer,
            role=kind,
            optional=is_optional,
        )

    return mapping
395
+
396
+
397
def find_unmapped(tensors: dict, fmt: str,
                  mapped: dict[str, CanonicalTensor]) -> list[str]:
    """Return the original tensor names that no rule mapped.

    Args:
        tensors: Mapping whose keys are the original tensor names.
        fmt: Checkpoint format; accepted but not used by this function.
        mapped: Canonical mapping whose values expose ``original_name``.

    Returns:
        The names from ``tensors``, in their original order, that are not
        the ``original_name`` of any entry in ``mapped``.
    """
    # An insertion-ordered dict works as an ordered set of candidates.
    leftovers = dict.fromkeys(tensors)
    for entry in mapped.values():
        leftovers.pop(entry.original_name, None)
    return list(leftovers)
404
+
405
+
406
def map_tensors_full(tensors: dict, fmt: str) -> dict[str, CanonicalTensor]:
    """Entry point for the full mapping flow.

    Currently a thin wrapper: it calls ``map_tensors`` with the same
    arguments and returns its result unchanged. Every entry keeps the
    original tensor name, which gives traceability back to the checkpoint.

    Args:
        tensors: Mapping whose keys are the original tensor names.
        fmt: Checkpoint format key.

    Returns:
        The dict produced by ``map_tensors(tensors, fmt)``.
    """
    full_mapping = map_tensors(tensors, fmt)
    return full_mapping
413
+
414
+
415
def resolve_tied_embeddings(mapped: dict[str, CanonicalTensor]) -> dict[str, CanonicalTensor]:
    """Add a ``head.weight`` alias when only ``embed.weight`` is present.

    The alias points at the same original tensor as ``embed.weight`` and is
    marked optional. The mapping is modified in place.

    Args:
        mapped: Canonical mapping, mutated by this call.

    Returns:
        The same ``mapped`` object that was passed in.
    """
    # Nothing to do when the head already exists or there is no embedding.
    if "embed.weight" not in mapped or "head.weight" in mapped:
        return mapped

    embedding = mapped["embed.weight"]
    alias = CanonicalTensor(
        original_name=embedding.original_name,
        canonical_name="head.weight",
        layer_idx=None,
        role="head",
        optional=True,
    )
    mapped["head.weight"] = alias
    return mapped
430
+
431
+
432
def detect_format(tensors: dict, sample_size: int = 32) -> str:
    """Detect the checkpoint format by voting over a sample of tensor names.

    Each sampled name casts at most one vote. The tests run from most to
    least specific, because the generic ".weight"/".bias" test also matches
    GGUF names ("blk.0.attn_q.weight") and npy file names ("x.weight.npy").
    Letting one name vote for several formats made "safetensors" win every
    such tie, so GGUF checkpoints could never be detected.

    Args:
        tensors: Mapping whose keys are the tensor names. The values are
            not read.
        sample_size: Maximum number of names inspected, taken from the start
            of the mapping.

    Returns:
        One of "safetensors", "gguf", "h5" or "npy". Ties, including the
        case where no name matched anything (e.g. an empty mapping), resolve
        to the earliest of those in that order, so "safetensors" is the
        fallback.
    """
    gguf_prefixes = ("blk.", "token_embd.", "output_norm.")

    # Insertion order doubles as the tie-break order for max() below.
    scores = {
        "safetensors": 0,
        "gguf": 0,
        "h5": 0,
        "npy": 0,
    }

    for name in list(tensors.keys())[:sample_size]:
        if name.startswith(gguf_prefixes):
            scores["gguf"] += 1
        elif "/" in name and ":0" in name:
            # Keras/H5 variable paths such as "dense/dense/kernel:0".
            scores["h5"] += 1
        elif name.endswith(".npy"):
            scores["npy"] += 1
        elif ".weight" in name or ".bias" in name:
            scores["safetensors"] += 1

    return max(scores, key=scores.get)