PyPI - micronnx - Versions diffs - 0.1.0__tar.gz - Mend

micronnx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

micronnx-0.1.0/LICENSE +21 -0
micronnx-0.1.0/PKG-INFO +23 -0
micronnx-0.1.0/micronnx/__init__.py +83 -0
micronnx-0.1.0/micronnx/executor/__init__.py +3 -0
micronnx-0.1.0/micronnx/executor/canonical.py +456 -0
micronnx-0.1.0/micronnx/executor/exporter.py +426 -0
micronnx-0.1.0/micronnx/executor/extractor.py +236 -0
micronnx-0.1.0/micronnx/executor/forward.py +1426 -0
micronnx-0.1.0/micronnx/executor/imag.py +147 -0
micronnx-0.1.0/micronnx/executor/ops.py +281 -0
micronnx-0.1.0/micronnx/loaders/__init__.py +3 -0
micronnx-0.1.0/micronnx/loaders/exporter.py +50 -0
micronnx-0.1.0/micronnx/loaders/gguf.py +418 -0
micronnx-0.1.0/micronnx/loaders/h5.py +61 -0
micronnx-0.1.0/micronnx/loaders/npy.py +41 -0
micronnx-0.1.0/micronnx/loaders/safetensors.py +103 -0
micronnx-0.1.0/micronnx.egg-info/PKG-INFO +23 -0
micronnx-0.1.0/micronnx.egg-info/SOURCES.txt +35 -0
micronnx-0.1.0/micronnx.egg-info/dependency_links.txt +1 -0
micronnx-0.1.0/micronnx.egg-info/requires.txt +2 -0
micronnx-0.1.0/micronnx.egg-info/top_level.txt +3 -0
micronnx-0.1.0/pyproject.toml +65 -0
micronnx-0.1.0/setup.cfg +4 -0

micronnx-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 <axiol>
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

micronnx-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,23 @@
+Metadata-Version: 2.4
+Name: micronnx
+Version: 0.1.0
+Summary: micronnx — runtime de inferencia puro NumPy sin dependencias pesadas.
+License: MIT
+Project-URL: Repository, https://github.com/tuusuario/micronnx
+Keywords: llm,inference,numpy,gguf,safetensors,hdf5,mobilenet,activation-extraction,model-fusion,quantization
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.24
+Requires-Dist: pyfive
+Dynamic: license-file

micronnx-0.1.0/micronnx/__init__.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""
+micronnx — pure-NumPy inference runtime.
+Public names are resolved lazily on first attribute access, e.g.:
+    import micronnx as nx
+    nx.ModelRunner(...)
+    nx.GGUFLoader(...)
+    nx.export_to_npz(...)
+"""
+from __future__ import annotations
+# ─────────────────────────────────────────
+# LAZY LOADER
+# ─────────────────────────────────────────
+# Maps each public attribute name to the dotted path of the module that
+# defines it. The module-level __getattr__ below imports that module the
+# first time the name is requested.
+_PUBLIC = {
+    # Loaders
+    "SafeTensorsLoader": "micronnx.loaders.safetensors",
+    "GGUFLoader":        "micronnx.loaders.gguf",
+    "H5Loader":          "micronnx.loaders.h5",
+    "NpyLoader":         "micronnx.loaders.npy",
+    # Ops
+    "softmax":           "micronnx.executor.ops",
+    "sigmoid":           "micronnx.executor.ops",
+    "relu":              "micronnx.executor.ops",
+    "gelu":              "micronnx.executor.ops",
+    "silu":              "micronnx.executor.ops",
+    "layernorm":         "micronnx.executor.ops",
+    "rmsnorm":           "micronnx.executor.ops",
+    "batchnorm":         "micronnx.executor.ops",
+    "linear":            "micronnx.executor.ops",
+    "embedding":         "micronnx.executor.ops",
+    "swiglu":            "micronnx.executor.ops",
+    "swiglu_fused":      "micronnx.executor.ops",
+    "geglu":             "micronnx.executor.ops",
+    "ffn_gelu":          "micronnx.executor.ops",
+    "attention":         "micronnx.executor.ops",
+    "rope":              "micronnx.executor.ops",
+    "conv2d":            "micronnx.executor.ops",
+    "depthwise_conv2d":  "micronnx.executor.ops",
+    "global_avg_pool":   "micronnx.executor.ops",
+    "max_pool2d":        "micronnx.executor.ops",
+    # Runtime
+    "ModelRunner":               "micronnx.executor.forward",
+    "RoPECache":                 "micronnx.executor.forward",
+    "KVCache":                   "micronnx.executor.forward",
+    "WeightCache":               "micronnx.executor.forward",
+    "SCHEMAS":                   "micronnx.executor.forward",
+    "detect_schema_gguf":        "micronnx.executor.forward",
+    "detect_schema_hf":          "micronnx.executor.forward",
+    "detect_schema_safetensors": "micronnx.executor.forward",
+    # Extractors and CNN
+    "ActivationExtractor":    "micronnx.executor.extractor",
+    "CNNRunner":              "micronnx.executor.imag",
+    "CNNActivationExtractor": "micronnx.executor.imag",
+    "CanonicalLoader":        "micronnx.executor.imag",
+    # Canonical naming
+    "CanonicalTensor":         "micronnx.executor.canonical",
+    "map_tensors":             "micronnx.executor.canonical",
+    "map_tensors_full":        "micronnx.executor.canonical",
+    "find_unmapped":           "micronnx.executor.canonical",
+    "resolve_tied_embeddings": "micronnx.executor.canonical",
+    "detect_format":           "micronnx.executor.canonical",
+    # Exporter
+    "export_to_npz": "micronnx.executor.exporter",
+    "load_index":    "micronnx.executor.exporter",
+    "inspect_npz":   "micronnx.executor.exporter",
+}
+def __getattr__(name: str):
+    """Resolve a public name lazily (module-level ``__getattr__``).
+    Looks ``name`` up in ``_PUBLIC``, imports the module registered for it,
+    fetches the attribute of the same name from that module, stores it in
+    this module's globals and returns it.
+    Raises:
+        AttributeError: if ``name`` is not listed in ``_PUBLIC``.
+    """
+    module_path = _PUBLIC.get(name)
+    if module_path is None:
+        raise AttributeError(f"micronnx no tiene '{name}'")
+    from importlib import import_module
+    resolved = getattr(import_module(module_path), name)
+    # Store the object in the module namespace so later lookups find it
+    # directly and never reach __getattr__ again.
+    globals()[name] = resolved
+    return resolved
+__all__ = list(_PUBLIC)

micronnx-0.1.0/micronnx/executor/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from . import forward, ops, exporter
+__all__ = ["forward", "ops", "exporter"]

micronnx-0.1.0/micronnx/executor/canonical.py ADDED Viewed

@@ -0,0 +1,456 @@
+"""
+canonical.py — maps tensor names from different formats/architectures
+to a common canonical scheme.
+Coverage added relative to the previous version:
+  - New families: GPT-2/GPT-NeoX, Falcon, BLOOM, MPT, Phi-3
+    (fused qkv_proj/gate_up_proj), Baichuan2 (W_pack),
+    Gemma2/3 (pre/post_feedforward_layernorm).
+  - Attention and MLP biases, q_norm/k_norm (Qwen3, OLMo2).
+  - MoE: Mixtral (block_sparse_moe.experts.{w1,w2,w3}), Qwen2/3-MoE and
+    DeepSeek-MoE (mlp.experts + shared_expert(s) + router), and their
+    GGUF equivalents (ffn_gate_inp / *_exps / *_shexp / exp_probs_b).
+  - H5: generic Keras rules (Conv2D/BatchNorm/Dense) in addition to the
+    MobileNet-specific ones.
+  - CanonicalTensor.optional: marks "secondary" tensors (bias, extra
+    norms, buffers) whose absence from a checkpoint is normal and expected.
+  - find_unmapped(): lists the checkpoint tensors that no rule could
+    map — useful to avoid silently losing weights before fusing.
+    map_tensors_full() currently just delegates to map_tensors().
+  - resolve_tied_embeddings(): generates 'head.weight' from
+    'embed.weight' when the model uses tied embeddings (common in small
+    models such as SmolLM/GPT-2).
+  - detect_format(): now votes over the tensor list (up to
+    sample_size) instead of looking only at the first 5.
+NOTE — Gemma2/3 in GGUF:
+  blk.N.ffn_norm is used in llama.cpp as "the norm before the FFN", but in
+  Gemma2 that norm corresponds to pre_feedforward_layernorm, NOT to
+  post_attention_layernorm (as it does in Llama). The exact GGUF names of
+  Gemma2's two extra "sandwich" norms
+  (post_attention_layernorm / post_feedforward_layernorm) are not
+  confirmed here. If you work with a Gemma2/3 GGUF, check with
+  gguf_dump.py before assuming that mapping — it is exactly the kind of
+  semantic misalignment that affects fusion across architectures.
+"""
+import re
+from dataclasses import dataclass
+@dataclass
+class CanonicalTensor:
+    """Record linking one checkpoint tensor name to its canonical name."""
+    # Tensor name exactly as it appears in the source checkpoint.
+    original_name: str
+    # Name produced by filling the matching rule's template.
+    canonical_name: str
+    # As populated by map_tensors(): the first captured regex group as an int,
+    # or None when the matching rule captures nothing.
+    layer_idx: int | None
+    # Role label copied from the matching rule.
+    role: str
+    optional: bool = False  # bias / extra norm / buffer: its absence is normal
+def _r(pattern: str, role: str, template: str) -> tuple:
+    """Build the rule tuple for a "primary" weight.
+    Returns ``(pattern, role, template, False)``; the trailing flag marks
+    the rule as non-optional. If a tensor exists but matches no rule,
+    a rule is probably missing.
+    """
+    is_optional = False
+    return pattern, role, template, is_optional
+def _opt(pattern: str, role: str, template: str) -> tuple:
+    """Build the rule tuple for a "secondary" tensor.
+    Returns ``(pattern, role, template, True)``; the trailing flag marks
+    the rule as optional (bias, extra norm, buffer, shared-expert router,
+    etc.), i.e. its absence from a checkpoint is normal.
+    """
+    is_optional = True
+    return pattern, role, template, is_optional
+# ════════════════════════════════════════════════════════════════
+# SAFETENSORS
+# ════════════════════════════════════════════════════════════════
+# Every rule is a (pattern, role, template, optional) tuple built by _r()
+# (optional=False) or _opt() (optional=True). map_tensors() applies the
+# pattern with re.fullmatch and fills the template's "{}" slots with the
+# captured groups, in order.
+# ── LLaMA family: Llama/Mistral/Yi/Qwen2/Qwen2.5/Qwen3-dense/
+#    Gemma/Gemma2/Gemma3/StableLM2/InternLM2/etc. (model.layers.N prefix) ──
+_SAFETENSORS_LLAMA = [
+    _r(r"model\.embed_tokens\.weight",                              "embed",      "embed.weight"),
+    _r(r"lm_head\.weight",                                          "head",       "head.weight"),
+    _r(r"model\.norm\.weight",                                      "norm_final", "norm_final.weight"),
+    _opt(r"model\.norm\.bias",                                      "norm_final_bias", "norm_final.bias"),
+    # Attention projections
+    _r(r"model\.layers\.(\d+)\.self_attn\.q_proj\.weight",         "attn_q",     "layers.{}.attn.q.weight"),
+    _r(r"model\.layers\.(\d+)\.self_attn\.k_proj\.weight",         "attn_k",     "layers.{}.attn.k.weight"),
+    _r(r"model\.layers\.(\d+)\.self_attn\.v_proj\.weight",         "attn_v",     "layers.{}.attn.v.weight"),
+    _r(r"model\.layers\.(\d+)\.self_attn\.o_proj\.weight",         "attn_o",     "layers.{}.attn.o.weight"),
+    # Attention biases (Qwen2/2.5, StableLM2, etc.)
+    _opt(r"model\.layers\.(\d+)\.self_attn\.q_proj\.bias",         "attn_q_bias","layers.{}.attn.q.bias"),
+    _opt(r"model\.layers\.(\d+)\.self_attn\.k_proj\.bias",         "attn_k_bias","layers.{}.attn.k.bias"),
+    _opt(r"model\.layers\.(\d+)\.self_attn\.v_proj\.bias",         "attn_v_bias","layers.{}.attn.v.bias"),
+    _opt(r"model\.layers\.(\d+)\.self_attn\.o_proj\.bias",         "attn_o_bias","layers.{}.attn.o.bias"),
+    # Q/K norm (Qwen3, OLMo2)
+    _opt(r"model\.layers\.(\d+)\.self_attn\.q_norm\.weight",       "attn_q_norm","layers.{}.attn.q_norm.weight"),
+    _opt(r"model\.layers\.(\d+)\.self_attn\.k_norm\.weight",       "attn_k_norm","layers.{}.attn.k_norm.weight"),
+    # Non-trainable buffer, sometimes present in old/custom checkpoints
+    _opt(r"model\.layers\.(\d+)\.self_attn\.rotary_emb\.inv_freq", "rope_buffer","layers.{}.attn.rope_inv_freq"),
+    # Phi-3/3.5: fused qkv and gate_up
+    _r(r"model\.layers\.(\d+)\.self_attn\.qkv_proj\.weight",       "attn_qkv_fused",    "layers.{}.attn.qkv.weight"),
+    _r(r"model\.layers\.(\d+)\.mlp\.gate_up_proj\.weight",         "mlp_gate_up_fused", "layers.{}.mlp.gate_up.weight"),
+    # Baichuan2: q/k/v fused into a single W_pack
+    # (same canonical name as the Phi-3 qkv_proj rule above)
+    _r(r"model\.layers\.(\d+)\.self_attn\.W_pack\.weight",         "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
+    # Dense MLP (SwiGLU/GeGLU)
+    _r(r"model\.layers\.(\d+)\.mlp\.gate_proj\.weight",            "mlp_gate",   "layers.{}.mlp.gate.weight"),
+    _r(r"model\.layers\.(\d+)\.mlp\.up_proj\.weight",              "mlp_up",     "layers.{}.mlp.up.weight"),
+    _r(r"model\.layers\.(\d+)\.mlp\.down_proj\.weight",            "mlp_down",   "layers.{}.mlp.down.weight"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.gate_proj\.bias",            "mlp_gate_bias","layers.{}.mlp.gate.bias"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.up_proj\.bias",              "mlp_up_bias","layers.{}.mlp.up.bias"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.down_proj\.bias",            "mlp_down_bias","layers.{}.mlp.down.bias"),
+    # Pre/post-attention normalizations
+    _r(r"model\.layers\.(\d+)\.input_layernorm\.weight",           "norm_pre",   "layers.{}.norm_pre.weight"),
+    _opt(r"model\.layers\.(\d+)\.input_layernorm\.bias",           "norm_pre_bias","layers.{}.norm_pre.bias"),
+    _r(r"model\.layers\.(\d+)\.post_attention_layernorm\.weight",  "norm_post",  "layers.{}.norm_post.weight"),
+    _opt(r"model\.layers\.(\d+)\.post_attention_layernorm\.bias",  "norm_post_bias","layers.{}.norm_post.bias"),
+    # Gemma2/3: two extra norms around the FFN (no equivalent in base Llama)
+    _opt(r"model\.layers\.(\d+)\.pre_feedforward_layernorm\.weight", "norm_pre_ffn",  "layers.{}.norm_pre_ffn.weight"),
+    _opt(r"model\.layers\.(\d+)\.post_feedforward_layernorm\.weight","norm_post_ffn", "layers.{}.norm_post_ffn.weight"),
+]
+# ── MoE (HF safetensors): Mixtral, Qwen2/3-MoE, DeepSeek-MoE ──
+# Per-expert rules capture two groups (layer index, expert index); both are
+# substituted, in that order, into the two "{}" slots of the template.
+_SAFETENSORS_MOE = [
+    # Mixtral: block_sparse_moe.gate + experts.{e}.{w1,w2,w3}
+    # w1=gate_proj (SiLU), w3=up_proj, w2=down_proj
+    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.gate\.weight",              "moe_router",      "layers.{}.mlp.moe_router.weight"),
+    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.w1\.weight","moe_expert_gate", "layers.{}.mlp.moe_experts.{}.gate.weight"),
+    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.w3\.weight","moe_expert_up",   "layers.{}.mlp.moe_experts.{}.up.weight"),
+    _r(r"model\.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.w2\.weight","moe_expert_down", "layers.{}.mlp.moe_experts.{}.down.weight"),
+    # Qwen2/3-MoE, DeepSeek-MoE: mlp.gate (router) + mlp.experts.{e}.*
+    _r(r"model\.layers\.(\d+)\.mlp\.gate\.weight",                       "moe_router",      "layers.{}.mlp.moe_router.weight"),
+    _r(r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.gate_proj\.weight",  "moe_expert_gate", "layers.{}.mlp.moe_experts.{}.gate.weight"),
+    _r(r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.up_proj\.weight",    "moe_expert_up",   "layers.{}.mlp.moe_experts.{}.up.weight"),
+    _r(r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.down_proj\.weight",  "moe_expert_down", "layers.{}.mlp.moe_experts.{}.down.weight"),
+    # Shared expert — Qwen2-MoE uses the singular "shared_expert"
+    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert\.gate_proj\.weight","moe_shared_gate", "layers.{}.mlp.moe_shared_gate.weight"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert\.up_proj\.weight",  "moe_shared_up",   "layers.{}.mlp.moe_shared_up.weight"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert\.down_proj\.weight","moe_shared_down", "layers.{}.mlp.moe_shared_down.weight"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.shared_expert_gate\.weight",      "moe_shared_router","layers.{}.mlp.moe_shared_router.weight"),
+    # DeepSeek-MoE uses the plural "shared_experts"
+    _opt(r"model\.layers\.(\d+)\.mlp\.shared_experts\.gate_proj\.weight","moe_shared_gate", "layers.{}.mlp.moe_shared_gate.weight"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.shared_experts\.up_proj\.weight",  "moe_shared_up",   "layers.{}.mlp.moe_shared_up.weight"),
+    _opt(r"model\.layers\.(\d+)\.mlp\.shared_experts\.down_proj\.weight","moe_shared_down", "layers.{}.mlp.moe_shared_down.weight"),
+]
+# ── GPT-2 (transformer.h.N prefix, attention fused in c_attn) ──
+# No "head" rule is listed for this family.
+_SAFETENSORS_GPT2 = [
+    _r(r"transformer\.wte\.weight",            "embed",      "embed.weight"),
+    _opt(r"transformer\.wpe\.weight",          "pos_embed",  "pos_embed.weight"),
+    _r(r"transformer\.ln_f\.weight",           "norm_final", "norm_final.weight"),
+    _opt(r"transformer\.ln_f\.bias",           "norm_final_bias", "norm_final.bias"),
+    _r(r"transformer\.h\.(\d+)\.attn\.c_attn\.weight",  "attn_qkv_fused",      "layers.{}.attn.qkv.weight"),
+    _opt(r"transformer\.h\.(\d+)\.attn\.c_attn\.bias",  "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),
+    _r(r"transformer\.h\.(\d+)\.attn\.c_proj\.weight",  "attn_o",     "layers.{}.attn.o.weight"),
+    _opt(r"transformer\.h\.(\d+)\.attn\.c_proj\.bias",  "attn_o_bias","layers.{}.attn.o.bias"),
+    # GPT-2 MLP: c_fc = up-proj, c_proj = down-proj, GELU activation (no gate)
+    _r(r"transformer\.h\.(\d+)\.mlp\.c_fc\.weight",     "mlp_up",     "layers.{}.mlp.up.weight"),
+    _opt(r"transformer\.h\.(\d+)\.mlp\.c_fc\.bias",     "mlp_up_bias","layers.{}.mlp.up.bias"),
+    _r(r"transformer\.h\.(\d+)\.mlp\.c_proj\.weight",   "mlp_down",   "layers.{}.mlp.down.weight"),
+    _opt(r"transformer\.h\.(\d+)\.mlp\.c_proj\.bias",   "mlp_down_bias","layers.{}.mlp.down.bias"),
+    _r(r"transformer\.h\.(\d+)\.ln_1\.weight",          "norm_pre",   "layers.{}.norm_pre.weight"),
+    _opt(r"transformer\.h\.(\d+)\.ln_1\.bias",          "norm_pre_bias","layers.{}.norm_pre.bias"),
+    _r(r"transformer\.h\.(\d+)\.ln_2\.weight",          "norm_post",  "layers.{}.norm_post.weight"),
+    _opt(r"transformer\.h\.(\d+)\.ln_2\.bias",          "norm_post_bias","layers.{}.norm_post.bias"),
+]
+# ── GPT-NeoX / Pythia (gpt_neox.layers.N prefix) ──
+# Attention uses a single fused query_key_value tensor; the MLP has up/down
+# projections only (no gate rule).
+_SAFETENSORS_GPT_NEOX = [
+    _r(r"gpt_neox\.embed_in\.weight",                   "embed",      "embed.weight"),
+    _r(r"embed_out\.weight",                            "head",       "head.weight"),
+    _r(r"gpt_neox\.final_layer_norm\.weight",           "norm_final", "norm_final.weight"),
+    _opt(r"gpt_neox\.final_layer_norm\.bias",           "norm_final_bias", "norm_final.bias"),
+    _r(r"gpt_neox\.layers\.(\d+)\.attention\.query_key_value\.weight", "attn_qkv_fused",      "layers.{}.attn.qkv.weight"),
+    _opt(r"gpt_neox\.layers\.(\d+)\.attention\.query_key_value\.bias", "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),
+    _r(r"gpt_neox\.layers\.(\d+)\.attention\.dense\.weight",           "attn_o",     "layers.{}.attn.o.weight"),
+    _opt(r"gpt_neox\.layers\.(\d+)\.attention\.dense\.bias",           "attn_o_bias","layers.{}.attn.o.bias"),
+    _r(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_h_to_4h\.weight",         "mlp_up",     "layers.{}.mlp.up.weight"),
+    _opt(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_h_to_4h\.bias",         "mlp_up_bias","layers.{}.mlp.up.bias"),
+    _r(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_4h_to_h\.weight",         "mlp_down",   "layers.{}.mlp.down.weight"),
+    _opt(r"gpt_neox\.layers\.(\d+)\.mlp\.dense_4h_to_h\.bias",         "mlp_down_bias","layers.{}.mlp.down.bias"),
+    _r(r"gpt_neox\.layers\.(\d+)\.input_layernorm\.weight",            "norm_pre",   "layers.{}.norm_pre.weight"),
+    _opt(r"gpt_neox\.layers\.(\d+)\.input_layernorm\.bias",            "norm_pre_bias","layers.{}.norm_pre.bias"),
+    _r(r"gpt_neox\.layers\.(\d+)\.post_attention_layernorm\.weight",   "norm_post",  "layers.{}.norm_post.weight"),
+    _opt(r"gpt_neox\.layers\.(\d+)\.post_attention_layernorm\.bias",   "norm_post_bias","layers.{}.norm_post.bias"),
+]
+# ── Falcon / BLOOM (transformer.h.N prefix, fused qkv) ──
+# No "head" or "norm_final" rule is listed for these families.
+_SAFETENSORS_FALCON_BLOOM = [
+    _r(r"transformer\.word_embeddings\.weight",         "embed",      "embed.weight"),
+    _r(r"transformer\.h\.(\d+)\.self_attention\.query_key_value\.weight", "attn_qkv_fused",      "layers.{}.attn.qkv.weight"),
+    _opt(r"transformer\.h\.(\d+)\.self_attention\.query_key_value\.bias", "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),
+    _r(r"transformer\.h\.(\d+)\.self_attention\.dense\.weight",           "attn_o",     "layers.{}.attn.o.weight"),
+    _opt(r"transformer\.h\.(\d+)\.self_attention\.dense\.bias",           "attn_o_bias","layers.{}.attn.o.bias"),
+    _r(r"transformer\.h\.(\d+)\.mlp\.dense_h_to_4h\.weight",   "mlp_up",     "layers.{}.mlp.up.weight"),
+    _opt(r"transformer\.h\.(\d+)\.mlp\.dense_h_to_4h\.bias",   "mlp_up_bias","layers.{}.mlp.up.bias"),
+    _r(r"transformer\.h\.(\d+)\.mlp\.dense_4h_to_h\.weight",   "mlp_down",   "layers.{}.mlp.down.weight"),
+    _opt(r"transformer\.h\.(\d+)\.mlp\.dense_4h_to_h\.bias",   "mlp_down_bias","layers.{}.mlp.down.bias"),
+    # BLOOM: standard pre/post norm (same names as Llama, already covered
+    # above for model.layers — here with the transformer.h prefix)
+    _r(r"transformer\.h\.(\d+)\.input_layernorm\.weight",            "norm_pre",   "layers.{}.norm_pre.weight"),
+    _opt(r"transformer\.h\.(\d+)\.input_layernorm\.bias",            "norm_pre_bias","layers.{}.norm_pre.bias"),
+    _r(r"transformer\.h\.(\d+)\.post_attention_layernorm\.weight",   "norm_post",  "layers.{}.norm_post.weight"),
+    _opt(r"transformer\.h\.(\d+)\.post_attention_layernorm\.bias",   "norm_post_bias","layers.{}.norm_post.bias"),
+    # Falcon-40B: parallel attention with two independent norms.
+    # NOTE(review): ln_attn/ln_mlp reuse the canonical names of
+    # input_layernorm/post_attention_layernorm. map_tensors() keys its result
+    # by canonical name, so if one checkpoint contained both variants the
+    # tensor visited later would replace the earlier entry.
+    _opt(r"transformer\.h\.(\d+)\.ln_attn\.weight",   "norm_pre",            "layers.{}.norm_pre.weight"),
+    _opt(r"transformer\.h\.(\d+)\.ln_attn\.bias",     "norm_pre_bias",       "layers.{}.norm_pre.bias"),
+    _opt(r"transformer\.h\.(\d+)\.ln_mlp\.weight",    "norm_post_parallel",  "layers.{}.norm_post.weight"),
+    _opt(r"transformer\.h\.(\d+)\.ln_mlp\.bias",      "norm_post_parallel_bias","layers.{}.norm_post.bias"),
+]
+# ── MPT (transformer.blocks.N prefix, qkv fused in Wqkv) ──
+# Only weight tensors are listed: no embedding, head or bias rules appear
+# for this family.
+_SAFETENSORS_MPT = [
+    _r(r"transformer\.norm_f\.weight",                  "norm_final", "norm_final.weight"),
+    _r(r"transformer\.blocks\.(\d+)\.attn\.Wqkv\.weight",      "attn_qkv_fused", "layers.{}.attn.qkv.weight"),
+    _r(r"transformer\.blocks\.(\d+)\.attn\.out_proj\.weight",  "attn_o",     "layers.{}.attn.o.weight"),
+    _r(r"transformer\.blocks\.(\d+)\.ffn\.up_proj\.weight",    "mlp_up",     "layers.{}.mlp.up.weight"),
+    _r(r"transformer\.blocks\.(\d+)\.ffn\.down_proj\.weight",  "mlp_down",   "layers.{}.mlp.down.weight"),
+    _r(r"transformer\.blocks\.(\d+)\.norm_1\.weight",          "norm_pre",   "layers.{}.norm_pre.weight"),
+    _r(r"transformer\.blocks\.(\d+)\.norm_2\.weight",          "norm_post",  "layers.{}.norm_post.weight"),
+]
+# Full safetensors rule list: the per-family tables concatenated.
+# map_tensors() stops at the first rule that matches a tensor name, so a
+# table listed earlier wins if two patterns could match the same name.
+_RULES_SAFETENSORS = (
+    _SAFETENSORS_LLAMA
+    + _SAFETENSORS_MOE
+    + _SAFETENSORS_GPT2
+    + _SAFETENSORS_GPT_NEOX
+    + _SAFETENSORS_FALCON_BLOOM
+    + _SAFETENSORS_MPT
+)
+# ════════════════════════════════════════════════════════════════
+# GGUF
+# ════════════════════════════════════════════════════════════════
+# Same (pattern, role, template, optional) rule layout as the safetensors
+# tables; layers are addressed with the blk.N prefix.
+_GGUF_BASE = [
+    _r(r"token_embd\.weight",                "embed",      "embed.weight"),
+    _opt(r"position_embd\.weight",           "pos_embed",  "pos_embed.weight"),
+    _r(r"output\.weight",                    "head",       "head.weight"),
+    _r(r"output_norm\.weight",               "norm_final", "norm_final.weight"),
+    _opt(r"output_norm\.bias",               "norm_final_bias", "norm_final.bias"),
+    _r(r"blk\.(\d+)\.attn_q\.weight",       "attn_q",     "layers.{}.attn.q.weight"),
+    _r(r"blk\.(\d+)\.attn_k\.weight",       "attn_k",     "layers.{}.attn.k.weight"),
+    _r(r"blk\.(\d+)\.attn_v\.weight",       "attn_v",     "layers.{}.attn.v.weight"),
+    _r(r"blk\.(\d+)\.attn_output\.weight",  "attn_o",     "layers.{}.attn.o.weight"),
+    # Attention biases (Qwen and others)
+    _opt(r"blk\.(\d+)\.attn_q\.bias",         "attn_q_bias","layers.{}.attn.q.bias"),
+    _opt(r"blk\.(\d+)\.attn_k\.bias",         "attn_k_bias","layers.{}.attn.k.bias"),
+    _opt(r"blk\.(\d+)\.attn_v\.bias",         "attn_v_bias","layers.{}.attn.v.bias"),
+    _opt(r"blk\.(\d+)\.attn_output\.bias",    "attn_o_bias","layers.{}.attn.o.bias"),
+    # Q/K norm (Qwen3)
+    _opt(r"blk\.(\d+)\.attn_q_norm\.weight",  "attn_q_norm","layers.{}.attn.q_norm.weight"),
+    _opt(r"blk\.(\d+)\.attn_k_norm\.weight",  "attn_k_norm","layers.{}.attn.k_norm.weight"),
+    # Fused QKV (GPT-2/Falcon/MPT/Phi-3/Baichuan models exported to GGUF)
+    _r(r"blk\.(\d+)\.attn_qkv\.weight",       "attn_qkv_fused",      "layers.{}.attn.qkv.weight"),
+    _opt(r"blk\.(\d+)\.attn_qkv\.bias",       "attn_qkv_fused_bias", "layers.{}.attn.qkv.bias"),
+    # Dense MLP
+    _r(r"blk\.(\d+)\.ffn_gate\.weight",     "mlp_gate",   "layers.{}.mlp.gate.weight"),
+    _r(r"blk\.(\d+)\.ffn_up\.weight",       "mlp_up",     "layers.{}.mlp.up.weight"),
+    _r(r"blk\.(\d+)\.ffn_down\.weight",     "mlp_down",   "layers.{}.mlp.down.weight"),
+    _opt(r"blk\.(\d+)\.ffn_gate\.bias",     "mlp_gate_bias","layers.{}.mlp.gate.bias"),
+    _opt(r"blk\.(\d+)\.ffn_up\.bias",       "mlp_up_bias","layers.{}.mlp.up.bias"),
+    _opt(r"blk\.(\d+)\.ffn_down\.bias",     "mlp_down_bias","layers.{}.mlp.down.bias"),
+    # Normalizations
+    # NOTE(review): ffn_norm is mapped to norm_post here. Per this module's
+    # docstring, in Gemma2/3 GGUF files that tensor may instead correspond to
+    # pre_feedforward_layernorm — verify before relying on this mapping.
+    _r(r"blk\.(\d+)\.attn_norm\.weight",    "norm_pre",   "layers.{}.norm_pre.weight"),
+    _opt(r"blk\.(\d+)\.attn_norm\.bias",    "norm_pre_bias","layers.{}.norm_pre.bias"),
+    _r(r"blk\.(\d+)\.ffn_norm\.weight",     "norm_post",  "layers.{}.norm_post.weight"),
+    _opt(r"blk\.(\d+)\.ffn_norm\.bias",     "norm_post_bias","layers.{}.norm_post.bias"),
+    # Falcon-40B: second norm for parallel attention
+    # (shares the norm_post canonical names with ffn_norm above)
+    _opt(r"blk\.(\d+)\.attn_norm_2\.weight", "norm_post_parallel",      "layers.{}.norm_post.weight"),
+    _opt(r"blk\.(\d+)\.attn_norm_2\.bias",   "norm_post_parallel_bias", "layers.{}.norm_post.bias"),
+]
+# ── MoE in GGUF: router, routed experts (3D, stacked) and shared experts ──
+# Unlike the safetensors MoE rules, these patterns capture only the layer
+# index: there is one *_exps tensor per layer rather than one per expert.
+_GGUF_MOE = [
+    _r(r"blk\.(\d+)\.ffn_gate_inp\.weight",   "moe_router",       "layers.{}.mlp.moe_router.weight"),
+    _opt(r"blk\.(\d+)\.exp_probs_b\.bias",    "moe_router_bias",  "layers.{}.mlp.moe_router.bias"),
+    _r(r"blk\.(\d+)\.ffn_gate_exps\.weight",  "moe_gate_exps",    "layers.{}.mlp.moe_gate_exps.weight"),
+    _r(r"blk\.(\d+)\.ffn_up_exps\.weight",    "moe_up_exps",      "layers.{}.mlp.moe_up_exps.weight"),
+    _r(r"blk\.(\d+)\.ffn_down_exps\.weight",  "moe_down_exps",    "layers.{}.mlp.moe_down_exps.weight"),
+    _opt(r"blk\.(\d+)\.ffn_gate_shexp\.weight","moe_shared_gate", "layers.{}.mlp.moe_shared_gate.weight"),
+    _opt(r"blk\.(\d+)\.ffn_up_shexp\.weight",  "moe_shared_up",   "layers.{}.mlp.moe_shared_up.weight"),
+    _opt(r"blk\.(\d+)\.ffn_down_shexp\.weight","moe_shared_down", "layers.{}.mlp.moe_shared_down.weight"),
+]
+# Full GGUF rule list: base rules first, then MoE rules.
+_RULES_GGUF = _GGUF_BASE + _GGUF_MOE
+# ════════════════════════════════════════════════════════════════
+# H5 (Keras / TensorFlow)
+# ════════════════════════════════════════════════════════════════
+# H5 names repeat the layer name twice ("<layer>/<layer>/<var>:0"). The \1
+# backreference forces the second occurrence to carry the same index as the
+# first, so each pattern still has a single capture group.
+_RULES_H5_CNN = [
+    # MobileNet: doubled path conv1/conv1/kernel:0
+    _r(r"conv1/conv1/kernel:0",                            "conv",      "stem.conv.weight"),
+    _r(r"conv1_bn/conv1_bn/gamma:0",                       "bn_gamma",  "stem.bn.gamma"),
+    _r(r"conv1_bn/conv1_bn/beta:0",                        "bn_beta",   "stem.bn.beta"),
+    _r(r"conv1_bn/conv1_bn/moving_mean:0",                 "bn_mean",   "stem.bn.mean"),
+    _r(r"conv1_bn/conv1_bn/moving_variance:0",             "bn_var",    "stem.bn.var"),
+    _r(r"conv_dw_(\d+)/conv_dw_\1/depthwise_kernel:0",     "conv_dw",   "layers.{}.conv_dw.weight"),
+    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/gamma:0",          "bn_gamma",  "layers.{}.bn_dw.gamma"),
+    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/beta:0",           "bn_beta",   "layers.{}.bn_dw.beta"),
+    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/moving_mean:0",    "bn_mean",   "layers.{}.bn_dw.mean"),
+    _r(r"conv_dw_(\d+)_bn/conv_dw_\1_bn/moving_variance:0","bn_var",    "layers.{}.bn_dw.var"),
+    _r(r"conv_pw_(\d+)/conv_pw_\1/kernel:0",               "conv_pw",   "layers.{}.conv_pw.weight"),
+    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/gamma:0",          "bn_gamma",  "layers.{}.bn_pw.gamma"),
+    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/beta:0",           "bn_beta",   "layers.{}.bn_pw.beta"),
+    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/moving_mean:0",    "bn_mean",   "layers.{}.bn_pw.mean"),
+    _r(r"conv_pw_(\d+)_bn/conv_pw_\1_bn/moving_variance:0","bn_var",    "layers.{}.bn_pw.var"),
+    _r(r"conv_preds/conv_preds/kernel:0",                  "head_conv", "head.conv.weight"),
+    _r(r"conv_preds/conv_preds/bias:0",                    "head_bias", "head.conv.bias"),
+]
+# Generic Keras rules — useful for simple (non-MobileNet) CNNs that use
+# Conv2D/BatchNormalization/Dense with the default naming scheme.
+# The un-suffixed layer (e.g. "conv2d") maps to a stem/head name, while the
+# numbered ones ("conv2d_N") map to layers.N.*.
+_RULES_H5_GENERIC = [
+    _r(r"conv2d/conv2d/kernel:0",                                   "conv",      "stem.conv.weight"),
+    _opt(r"conv2d/conv2d/bias:0",                                   "conv_bias", "stem.conv.bias"),
+    _r(r"conv2d_(\d+)/conv2d_\1/kernel:0",                          "conv",      "layers.{}.conv.weight"),
+    _opt(r"conv2d_(\d+)/conv2d_\1/bias:0",                          "conv_bias", "layers.{}.conv.bias"),
+    _opt(r"batch_normalization/batch_normalization/gamma:0",           "bn_gamma","stem.bn.gamma"),
+    _opt(r"batch_normalization/batch_normalization/beta:0",            "bn_beta", "stem.bn.beta"),
+    _opt(r"batch_normalization/batch_normalization/moving_mean:0",     "bn_mean", "stem.bn.mean"),
+    _opt(r"batch_normalization/batch_normalization/moving_variance:0", "bn_var",  "stem.bn.var"),
+    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/gamma:0",           "bn_gamma","layers.{}.bn.gamma"),
+    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/beta:0",            "bn_beta", "layers.{}.bn.beta"),
+    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/moving_mean:0",     "bn_mean", "layers.{}.bn.mean"),
+    _opt(r"batch_normalization_(\d+)/batch_normalization_\1/moving_variance:0", "bn_var",  "layers.{}.bn.var"),
+    _r(r"dense/dense/kernel:0",            "head_fc",      "head.fc.weight"),
+    _opt(r"dense/dense/bias:0",            "head_fc_bias", "head.fc.bias"),
+    _r(r"dense_(\d+)/dense_\1/kernel:0",   "fc",           "layers.{}.fc.weight"),
+    _opt(r"dense_(\d+)/dense_\1/bias:0",   "fc_bias",      "layers.{}.fc.bias"),
+]
+# Full H5 rule list: MobileNet-specific rules first, then the generic ones.
+_RULES_H5 = _RULES_H5_CNN + _RULES_H5_GENERIC
+# Rule list per checkpoint format, looked up by map_tensors().
+# "npy" has no rules, so map_tensors() returns an empty mapping for it.
+_FORMAT_RULES = {
+    "safetensors": _RULES_SAFETENSORS,
+    "gguf":        _RULES_GGUF,
+    "h5":          _RULES_H5,
+    "npy":         [],
+}
+def map_tensors(tensors: dict, fmt: str) -> dict[str, CanonicalTensor]:
+    """Map checkpoint tensor names to canonical names using the rules of ``fmt``.
+    Args:
+        tensors: mapping whose keys are the original tensor names; the
+            values are not read.
+        fmt: key into ``_FORMAT_RULES``; an unknown format yields no rules
+            and therefore an empty result.
+    Returns:
+        Dict keyed by canonical name. Tensor names that match no rule are
+        left out, and a later tensor producing an already-used canonical
+        name replaces the earlier entry.
+    """
+    ruleset = _FORMAT_RULES.get(fmt, [])
+    canonical_map: dict[str, CanonicalTensor] = {}
+    for tensor_name in tensors:
+        for regex, role, name_template, is_optional in ruleset:
+            match = re.fullmatch(regex, tensor_name)
+            if match is None:
+                continue
+            captured = match.groups()
+            canonical_name = name_template.format(*captured)
+            canonical_map[canonical_name] = CanonicalTensor(
+                original_name=tensor_name,
+                canonical_name=canonical_name,
+                # The first capture group is always the layer index.
+                layer_idx=int(captured[0]) if captured else None,
+                role=role,
+                optional=is_optional,
+            )
+            break  # the first matching rule wins
+    return canonical_map
+def find_unmapped(tensors: dict, fmt: str,
+                  mapped: dict[str, CanonicalTensor]) -> list[str]:
+    """Return the original tensor names that are absent from ``mapped``.
+    Args:
+        tensors: mapping whose keys are the original tensor names.
+        fmt: accepted for signature symmetry with map_tensors(); not used.
+        mapped: canonical mapping whose values expose ``original_name``.
+    Returns:
+        Names from ``tensors``, in iteration order, that no entry of
+        ``mapped`` refers to.
+    """
+    covered = set()
+    for entry in mapped.values():
+        covered.add(entry.original_name)
+    leftovers = []
+    for name in tensors.keys():
+        if name in covered:
+            continue
+        leftovers.append(name)
+    return leftovers
+def map_tensors_full(tensors: dict, fmt: str) -> dict[str, CanonicalTensor]:
+    """Entry point intended for the full mapping flow.
+    Currently a thin wrapper: it forwards both arguments to map_tensors()
+    and returns its result unchanged. Each CanonicalTensor in that result
+    keeps the original tensor name for traceability.
+    """
+    mapping = map_tensors(tensors, fmt)
+    return mapping
+def resolve_tied_embeddings(mapped: dict[str, CanonicalTensor]) -> dict[str, CanonicalTensor]:
+    """Alias ``head.weight`` to the embedding tensor when the head is missing.
+    If ``mapped`` contains ``embed.weight`` but not ``head.weight``, a
+    ``head.weight`` entry is added that points to the same original tensor
+    name and is flagged optional. ``mapped`` is modified in place and the
+    same dict is returned.
+    """
+    has_embed = "embed.weight" in mapped
+    has_head = "head.weight" in mapped
+    if not has_embed or has_head:
+        return mapped
+    embed_entry = mapped["embed.weight"]
+    head_alias = CanonicalTensor(
+        original_name=embed_entry.original_name,
+        canonical_name="head.weight",
+        layer_idx=None,
+        role="head",
+        optional=True,
+    )
+    mapped["head.weight"] = head_alias
+    return mapped
+def detect_format(tensors: dict, sample_size: int = 32) -> str:
+    """Guess the checkpoint format by voting over a sample of tensor names.
+    Each of the first ``sample_size`` names casts at most one vote, for the
+    most specific signature it matches:
+      - "gguf":        starts with "blk.", "token_embd." or "output_norm."
+      - "h5":          contains "/" and ":0"
+      - "npy":         ends with ".npy"
+      - "safetensors": contains ".weight" or ".bias"
+    The checks are mutually exclusive on purpose: GGUF and .npy names also
+    contain ".weight"/".bias", so counting them for "safetensors" as well
+    produced ties that always resolved to "safetensors".
+    Args:
+        tensors: mapping whose keys are tensor names; the values are not read.
+        sample_size: number of leading names to inspect.
+    Returns:
+        The format with the most votes. Ties, including the all-zero case
+        of an empty input, resolve to the first of "safetensors", "gguf",
+        "h5", "npy".
+    """
+    gguf_prefixes = ("blk.", "token_embd.", "output_norm.")
+    scores = {
+        "safetensors": 0,
+        "gguf": 0,
+        "h5": 0,
+        "npy": 0,
+    }
+    for name in list(tensors.keys())[:sample_size]:
+        if name.startswith(gguf_prefixes):
+            scores["gguf"] += 1
+        elif "/" in name and ":0" in name:
+            scores["h5"] += 1
+        elif name.endswith(".npy"):
+            scores["npy"] += 1
+        elif ".weight" in name or ".bias" in name:
+            scores["safetensors"] += 1
+    return max(scores, key=scores.get)