PyPI - model-unfolder - Versions diffs - 0.2.6__tar.gz → 0.2.8__tar.gz - Mend

model-unfolder 0.2.6__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

{model_unfolder-0.2.6 → model_unfolder-0.2.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: model-unfolder
-Version: 0.2.6
+Version: 0.2.8
 Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
 Author: model-unfolder contributors
 License: Apache-2.0
@@ -95,7 +95,7 @@ No extra config in `model_unfolder` itself.
 ```python
 diagram = unfold(cfg)
 diagram.save("model.html")   # standalone interactive HTML
-diagram.save("model.json")   # IR (no rendering)
+diagram.save("model.json")   # expanded architecture JSON (no rendering)
 diagram.param_count()        # {"total": ..., "active": ..., "per_layer": [...]}
 diagram.to_ir()              # full IR dict
 ```

{model_unfolder-0.2.6 → model_unfolder-0.2.8}/README.md RENAMED Viewed

@@ -71,7 +71,7 @@ No extra config in `model_unfolder` itself.
 ```python
 diagram = unfold(cfg)
 diagram.save("model.html")   # standalone interactive HTML
-diagram.save("model.json")   # IR (no rendering)
+diagram.save("model.json")   # expanded architecture JSON (no rendering)
 diagram.param_count()        # {"total": ..., "active": ..., "per_layer": [...]}
 diagram.to_ir()              # full IR dict
 ```

{model_unfolder-0.2.6 → model_unfolder-0.2.8}/model_unfolder/__init__.py RENAMED Viewed

@@ -11,6 +11,7 @@ Outside Jupyter::
     diagram.save("kimi_k2.html")
 """
 from .diagram import Diagram
+from .evidence import inspect_model_code
 from .parser import config_to_ir
 from .ir import ModelIR, LayerSpec, AttentionSpec, FFNSpec, CrossLayerEdge
 from .params import estimate_params
@@ -27,11 +28,19 @@ __all__ = [
     "FFNSpec",
     "CrossLayerEdge",
     "config_to_ir",
+    "inspect_model_code",
     "estimate_params",
 ]
-def unfold(cfg_or_id, token=None) -> Diagram:
+def unfold(
+    cfg_or_id,
+    token=None,
+    *,
+    inspect_code: bool = False,
+    code_source: str = "local",
+    return_json: bool = False,
+):
     """Unfold a transformer into a renderable architecture diagram.
     Parameters
@@ -44,14 +53,31 @@ def unfold(cfg_or_id, token=None) -> Diagram:
         Optional Hugging Face token used only when ``cfg_or_id`` is a model ID.
         If omitted, ``HF_TOKEN`` and legacy Hugging Face token env vars are used
         when present.
+    inspect_code
+        If True, attach static source-code evidence to the IR. The code scanner
+        parses modeling files as text/AST and does not execute model code.
+    code_source
+        Source for code inspection: ``"local"`` (installed transformers),
+        ``"path"``, ``"hub"``, ``"auto"``, or a local file/directory path.
+    return_json
+        If True, return the expanded architecture JSON dict instead of the
+        renderable ``Diagram``.  The JSON uses stable structural fields for
+        dimensions, projections, layer groups, operation graphs, cache behavior,
+        and trace paths instead of renderer labels/descriptions.
     Returns
     -------
-    Diagram
-        Renders inline in Jupyter; otherwise call ``.save()`` or ``.to_html()``.
+    Diagram | dict
+        ``Diagram`` by default; ``dict`` when ``return_json=True``.
     """
-    ir = config_to_ir(cfg_or_id, token=token)
-    return Diagram(ir)
+    ir = config_to_ir(
+        cfg_or_id,
+        token=token,
+        inspect_code=inspect_code,
+        code_source=code_source,
+    )
+    diagram = Diagram(ir)
+    return diagram.to_json() if return_json else diagram
 # friendly alias

model_unfolder-0.2.8/model_unfolder/adapters/transformer/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Transformer-LLM adapter.
+There is exactly one parser (``parser.py``); see its module docstring for
+the principle (config-driven, no per-family code).
+"""
+from . import parser
+ADAPTERS = [parser]

{model_unfolder-0.2.6 → model_unfolder-0.2.8}/model_unfolder/adapters/transformer/blocks/attention.py RENAMED Viewed

@@ -26,6 +26,18 @@ def _sdpa_child_blocks(attention: AttentionSpec, hidden_size: int) -> list[dict]
     q_out = _fmt(num_heads * head_dim) if (num_heads and head_dim) else hidden
     kv_out = _fmt(num_kv_heads * head_dim) if (num_kv_heads and head_dim) else hidden
     d_k = _fmt(head_dim) if head_dim else "d_k"
+    if attention.kind in {"mha", "gqa", "mqa"}:
+        return _sdpa_detailed_child_blocks(
+            attention.kind,
+            hidden,
+            q_out,
+            kv_out,
+            num_heads,
+            num_kv_heads,
+            d_k,
+            q_per_group,
+        )
     attention_title, attention_desc = _sdpa_operation_meta(attention, num_heads, num_kv_heads, d_k, q_per_group)
     return [
         {
@@ -36,12 +48,18 @@ def _sdpa_child_blocks(attention: AttentionSpec, hidden_size: int) -> list[dict]
         {
             "id": "k_proj",
             "title": "Key projection",
-            "description": f"Linear; {hidden} -> {kv_out}  ({num_kv_heads} KV-heads x {d_k} dims)",
+            "description": (
+                f"Linear; {hidden} -> {kv_out}  ({num_kv_heads} KV-heads x {d_k} dims). "
+                "Cache ports show K/V write/read during generation: arrowhead for write, blunt tail for read."
+            ),
         },
         {
             "id": "v_proj",
             "title": "Value projection",
-            "description": f"Linear; {hidden} -> {kv_out}  ({num_kv_heads} KV-heads x {d_k} dims)",
+            "description": (
+                f"Linear; {hidden} -> {kv_out}  ({num_kv_heads} KV-heads x {d_k} dims). "
+                "Cache ports show K/V write/read during generation: arrowhead for write, blunt tail for read."
+            ),
         },
         {
             "id": "qkv_dot",
@@ -56,6 +74,83 @@ def _sdpa_child_blocks(attention: AttentionSpec, hidden_size: int) -> list[dict]
     ]
+def _sdpa_detailed_child_blocks(
+    kind: str,
+    hidden: str,
+    q_out: str,
+    kv_out: str,
+    num_heads: int,
+    num_kv_heads: int,
+    d_k: str,
+    q_per_group: int | None,
+) -> list[dict]:
+    kv_label = "1 shared K/V head" if kind == "mqa" else f"{num_kv_heads} KV-heads"
+    scaled_title = "Scaled attention scores"
+    scaled_desc = "Per head: QK^T / sqrt(dim); dot-product scores scaled for numerical stability"
+    if kind == "gqa":
+        scaled_title = "Grouped scaled dot-product attention"
+        group = f"; each KV head serves {q_per_group} query heads" if q_per_group else ""
+        scaled_desc = (
+            f"Grouped SDPA scores: {num_heads} query heads attend through "
+            f"{num_kv_heads} shared K/V heads{group}; scores use QK^T / sqrt(dim)"
+        )
+    elif kind == "mqa":
+        scaled_title = "Multi-query scaled dot-product attention"
+        scaled_desc = (
+            f"Multi-Query SDPA scores: {num_heads} query heads share one K/V stream; "
+            "scores use QK^T / sqrt(dim)"
+        )
+    return [
+        {
+            "id": "q_proj",
+            "title": "Query projection",
+            "description": f"Linear; {hidden} -> {q_out}  ({num_heads} heads x {d_k} dims)",
+        },
+        {
+            "id": "k_proj",
+            "title": "Key projection",
+            "description": (
+                f"Linear; {hidden} -> {kv_out}  ({kv_label} x {d_k} dims). "
+                "Cache ports show K/V write/read during generation: arrowhead for write, blunt tail for read."
+            ),
+        },
+        {
+            "id": "v_proj",
+            "title": "Value projection",
+            "description": (
+                f"Linear; {hidden} -> {kv_out}  ({kv_label} x {d_k} dims). "
+                "Cache ports show K/V write/read during generation: arrowhead for write, blunt tail for read."
+            ),
+        },
+        {
+            "id": "scaled_scores",
+            "title": scaled_title,
+            "description": scaled_desc,
+        },
+        {
+            "id": "attn_softmax",
+            "title": "Softmax weights",
+            "description": "Normalize each query row into attention weights over source tokens",
+        },
+        {
+            "id": "attn_apply_v",
+            "title": "Apply values",
+            "description": "Multiply attention weights by V to produce one context vector per head",
+        },
+        {
+            "id": "concat_heads",
+            "title": "Concatenate heads",
+            "description": f"Stack all {num_heads} per-head context vectors back into width {q_out}",
+        },
+        {
+            "id": "o_proj",
+            "title": "Output projection",
+            "description": f"Linear; {q_out} -> {hidden}  (mixes information across heads)",
+        },
+    ]
 def _sdpa_operation_meta(
     attention: AttentionSpec,
     num_heads: int,
@@ -102,40 +197,141 @@ def _mla_child_blocks(attention: AttentionSpec, hidden_size: int) -> list[dict]:
     num_heads = attention.num_heads or 0
     head_dim = attention.head_dim or 0
     q_out = _fmt(num_heads * head_dim) if (num_heads and head_dim) else hidden
-    return [
+    query_children = [
         {
             "id": "mla_q",
             "label": "Q projection",
             "title": "Query projection",
             "description": (
-                f"Projects hidden states into query heads through LoRA rank {q_rank}"
+                f"Projects hidden states into query latent space through LoRA rank {q_rank}"
                 if attention.q_lora_rank
-                else f"Q projection; {hidden} -> {q_out}"
+                else f"Projects hidden states directly into query heads; {hidden} -> {q_out}"
             ),
         },
+        {
+            "id": "mla_q_nope",
+            "label": "Q noPE",
+            "title": "Query content slice",
+            "description": "Query content component that does not receive rotary position encoding",
+        },
+        {
+            "id": "mla_q_rope",
+            "label": "Q RoPE",
+            "title": "Query positional slice",
+            "description": f"Query positional component prepared for rotary position encoding; dim {rope}",
+        },
+        {
+            "id": "mla_q_rope_apply",
+            "label": "Apply RoPE",
+            "title": "Apply RoPE to query",
+            "description": "Applies rotary position encoding to the query positional slice",
+        },
+        {
+            "id": "mla_q_concat",
+            "label": "Q concat",
+            "title": "Final MLA query",
+            "description": "Concatenates Q noPE with RoPE-encoded Q RoPE before score computation",
+        },
+    ]
+    kv_children = [
         {
             "id": "mla_kv_down",
             "label": "KV compress",
             "title": "K/V latent compression",
-            "description": f"Compresses the token state into a shared latent K/V vector; {hidden} -> rank {kv_rank}",
+            "description": f"Compresses the token state into the shared latent K/V cache; {hidden} -> rank {kv_rank}",
+        },
+        {
+            "id": "mla_cache",
+            "label": "latent cache c_t",
+            "title": "Stored latent cache",
+            "description": (
+                f"Compressed K/V latent stored in the cache instead of full K and V heads; rank {kv_rank}. "
+                "Cache ports show write from compression and read back into K/V expansion."
+            ),
         },
         {
             "id": "mla_kv_up",
             "label": "KV expand",
             "title": "K/V head expansion",
-            "description": f"Expands the latent K/V vector into per-head key/value content for {num_heads} query heads",
+            "description": f"Expands cached latent c_t into K noPE content and V values for {num_heads} query heads",
+        },
+        {
+            "id": "mla_k_nope",
+            "label": "K noPE",
+            "title": "Latent key content",
+            "description": "Key content expanded from the compressed K/V latent; concatenated with the RoPE key before scoring",
+        },
+        {
+            "id": "mla_k_rope",
+            "label": "K RoPE",
+            "title": "Key positional slice",
+            "description": f"Key positional component produced alongside the latent cache; dim {rope}",
+        },
+        {
+            "id": "mla_k_rope_apply",
+            "label": "Apply RoPE",
+            "title": "Apply RoPE to key",
+            "description": "Applies rotary position encoding to the key positional slice",
+        },
+        {
+            "id": "mla_k_merge",
+            "label": "K concat",
+            "title": "Composed MLA key",
+            "description": "Concatenates K noPE with the RoPE key side-channel before QK^T score computation",
+        },
+        {
+            "id": "mla_v",
+            "label": "V values",
+            "title": "Latent value heads",
+            "description": "Value heads expanded from the compressed K/V latent; consumed after softmax",
+        },
+    ]
+    return [
+        {
+            "id": "mla_query_path",
+            "label": "Query path",
+            "title": "MLA query path",
+            "description": (
+                "Builds Q by projecting the hidden state, splitting content and positional slices, "
+                "applying RoPE to the positional slice, then concatenating them"
+            ),
+            "detail_view": "mla_query_path",
+            "children": query_children,
+        },
+        {
+            "id": "mla_kv_path",
+            "label": "KV cache path",
+            "title": "MLA K/V cache path",
+            "description": (
+                f"Compresses hidden state into rank {kv_rank} latent cache, expands K/V content, "
+                "and combines K noPE with a RoPE key side-channel. Cache ports mark the latent write/read point."
+            ),
+            "detail_view": "mla_kv_cache_path",
+            "children": kv_children,
+        },
+        {
+            "id": "scaled_scores",
+            "label": "Latent scores",
+            "title": "Multi-Head Latent scores",
+            "description": "Q attends to expanded latent K plus the RoPE key side-channel; scores use QK^T / sqrt(dim)",
+        },
+        {
+            "id": "attn_softmax",
+            "label": "Softmax",
+            "title": "Softmax weights",
+            "description": "Normalize latent attention scores over source positions",
         },
         {
-            "id": "mla_rope",
-            "label": "RoPE key",
-            "title": "Rotary key side-channel",
-            "description": f"Separate positional key slice used with RoPE; dim {rope}",
+            "id": "attn_apply_v",
+            "label": "Apply V",
+            "title": "Apply latent values",
+            "description": "Multiply softmax weights by V expanded from the compressed K/V latent",
         },
         {
-            "id": "mla_attn",
-            "label": "Latent attention",
-            "title": "Multi-head latent attention",
-            "description": "Attention over decompressed latent K/V plus the RoPE side channel",
+            "id": "concat_heads",
+            "label": "Concat heads",
+            "title": "Concatenate latent heads",
+            "description": f"Stack all {num_heads} context heads back into width {q_out}",
         },
         {
             "id": "o_proj",

{model_unfolder-0.2.6 → model_unfolder-0.2.8}/model_unfolder/adapters/transformer/blocks/descriptions.py RENAMED Viewed

@@ -67,13 +67,14 @@ def describe_attention(attention: AttentionSpec) -> str:
         )
         if attention.q_lora_rank:
             text += f"; Q LoRA {_fmt(attention.q_lora_rank)}"
+        text += "; cache ports mark latent write/read state"
         return text
     if attention.kind == "mqa":
-        return _with_attention_window(attention, f"Multi-query; {attention.num_heads} Q / 1 KV head")
+        return _with_attention_window(attention, f"Multi-query; {attention.num_heads} Q / 1 KV head; cache ports mark K/V write/read state")
     if attention.kind == "gqa":
         return _with_attention_window(attention, (
             f"Grouped-query; {attention.num_heads} Q / {attention.num_kv_heads} KV heads; "
-            f"head dim {_fmt(attention.head_dim)}"
+            f"head dim {_fmt(attention.head_dim)}; cache ports mark K/V write/read state"
         ))
     if attention.kind == "ssm":
         shared = "; weight-shared across positions" if attention.shared else ""
@@ -94,11 +95,12 @@ def describe_attention(attention: AttentionSpec) -> str:
     if attention.no_rope:
         extras.append("NoPE")
     suffix = f"; {', '.join(extras)}" if extras else ""
-    return _with_attention_window(attention, f"Multi-head; {attention.num_heads} heads; head dim {_fmt(attention.head_dim)}{suffix}")
+    cache_note = "; cache ports mark K/V write/read state"
+    return _with_attention_window(attention, f"Multi-head; {attention.num_heads} heads; head dim {_fmt(attention.head_dim)}{suffix}{cache_note}")
 def _attention_mask_prefix(attention: AttentionSpec) -> str:
-    return "SWA" if attention.mask == "sliding" else ""
+    return "SW" if attention.mask == "sliding" else ""
 def _attention_mask_title_prefix(attention: AttentionSpec) -> str:

model-unfolder 0.2.6__tar.gz → 0.2.8__tar.gz

model-unfolder 0.2.6__tar.gz → 0.2.8__tar.gz