PyPI - model-unfolder - Versions diffs - 0.2.8__tar.gz → 0.2.9__tar.gz - Mend

model-unfolder 0.2.8 (tar.gz) → 0.2.9 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: model-unfolder
-Version: 0.2.8
+Version: 0.2.9
 Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
 Author: model-unfolder contributors
 License: Apache-2.0

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/parser.py RENAMED Viewed

@@ -26,6 +26,7 @@ Detection is config-driven:
 * ``num_local_experts`` / ``n_routed_experts``   → MoE FFN
 * ``n_shared_experts`` / ``num_shared_experts``  → shared experts
 * ``clip_qkv``                                   → attention extras
+* ``cross_attention_layers``                     → vision side-attention layers
 Warnings policy: warn only for *specific* config problems (missing
 critical field, unrecognized layer_type value, …).  Never warn just
@@ -43,6 +44,7 @@ from .special_parts.per_layer_embedding import (
     per_layer_embedding_extras,
 )
 from .special_parts.modalities import multimodal_extras
+from .special_parts.modalities.detect import cross_attention_layers as _cross_attention_layers
 # ---------------------------------------------------------------------------
@@ -252,6 +254,13 @@ def parse(cfg: Any) -> ModelIR:
     ple_dim   = _g(text_cfg, "hidden_size_per_layer_input") or 0
     ple_vocab = _g(text_cfg, "vocab_size_per_layer_input") or get("vocab_size", 0)
+    # ---- Decoder layers that read external modality states through cross-attention ----
+    cross_attn_layer_set = set(_cross_attention_layers(cfg, text_cfg) or [])
+    has_cross_attention_side_state = bool(
+        cross_attn_layer_set
+        and (_g(cfg, "vision_config") is not None or _g(cfg, "vision_model_config") is not None)
+    )
     # ---- Walk the layer stack ----
     unknown_layer_types: set[str] = set()
     cross_layer_edges: list[CrossLayerEdge] = []
@@ -325,6 +334,8 @@ def parse(cfg: Any) -> ModelIR:
             )
         extra_blocks = list(per_layer_embedding_blocks(hidden_size, ple_dim, activation="gelu")) if ple_dim else []
+        if has_cross_attention_side_state and i in cross_attn_layer_set:
+            extra_blocks.append(_cross_attention_adapter_block(cross_attn_layer_set))
         if use_parallel_residual:
             layers.append(parallel_decoder_layer(i, attn, ffn, hidden_size, norm_kind=norm_kind))
@@ -551,3 +562,34 @@ def _last_matching_layer(layer_types, i: int, first_shared: int) -> int | None:
         if layer_types[j] == target_type:
             return j
     return None
+def _cross_attention_adapter_block(layer_indices: set[int]) -> dict:
+    """Layer-local side block for decoder layers that read vision states."""
+    layers = sorted(layer_indices)
+    if layers:
+        layer_label = ", ".join(f"L{i}" for i in layers[:6])
+        if len(layers) > 6:
+            layer_label += ", ..."
+        layer_desc = f"active on {layer_label}"
+    else:
+        layer_desc = "active on selected decoder layers"
+    return {
+        "id": "cross_attention_adapter",
+        "role": "fusion",
+        "kind": "fusion",
+        "lane": "external_left",
+        "feeds": "attn",
+        "source_id": "vision_path",
+        "source_label": "Vision context",
+        "label": "Cross-attention adapter",
+        "title": "Cross-attention adapter",
+        "description": (
+            "Vision states stay separate; this decoder layer reads them with "
+            f"cross-attention; {layer_desc}."
+        ),
+        "detail_view": "multimodal_fusion",
+        "w": 270,
+        "h": 54,
+        "font": 16,
+    }

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/builder.py RENAMED Viewed

@@ -5,9 +5,10 @@ from typing import Any
 from .accessors import nested
 from .audio import audio_path
+from .detect import has_video_input, is_unified_grid_stream
 from .fusion import fusion_path
 from .schema import multimodal_payload
-from .vision import has_video_input, video_path, vision_path
+from .vision import video_path, vision_path
 def multimodal_extras(cfg: Any, text_cfg: Any, text_hidden_size: int) -> dict | None:
@@ -18,7 +19,7 @@ def multimodal_extras(cfg: Any, text_cfg: Any, text_hidden_size: int) -> dict |
     modalities: dict[str, Any] = {}
     if vision_cfg is not None:
         modalities["vision"] = vision_path(cfg, text_cfg, vision_cfg, text_hidden_size)
-        if has_video_input(cfg):
+        if has_video_input(cfg) and is_unified_grid_stream(cfg, vision_cfg):
             modalities["video"] = video_path(cfg, vision_cfg, text_hidden_size)
     if audio_cfg is not None:
         modalities["audio"] = audio_path(cfg, audio_cfg, text_hidden_size)
@@ -30,4 +31,3 @@ def multimodal_extras(cfg: Any, text_cfg: Any, text_hidden_size: int) -> dict |
 __all__ = ["multimodal_extras"]

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/detect.py RENAMED Viewed

@@ -34,8 +34,6 @@ def is_unified_grid_stream(cfg: Any, vision_cfg: Any | None = None) -> bool:
         "multimodal_rope",
     }:
         return True
-    if first(cfg, "vision_start_token_id", "vision_end_token_id") is not None and has_video_input(cfg):
-        return True
     if vision_cfg is not None and first(vision_cfg, "spatial_merge_size", "temporal_patch_size") is not None:
         return True
     return model_family_hint(cfg) == "qwen_vl"
@@ -179,4 +177,3 @@ def model_family_hint(cfg: Any) -> str | None:
     if mt == "pixtral" or "pixtral" in arch:
         return "pixtral"
     return None

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/ir.py RENAMED Viewed

@@ -61,6 +61,7 @@ class LayerSpec:
             a.qk_norm, a.shared, a.no_rope,
             f.kind, f.gated, f.num_experts,
             self.norm_kind, self.norm_placement,
+            any(block.get("id") == "cross_attention_adapter" for block in self.blocks),
         )

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/fusion_cross_attention.py RENAMED Viewed

@@ -53,10 +53,4 @@ def build_cross_attention_fusion_view(ir: dict, info: dict, mount_id: str, fusio
         "marker-end": f"url(#{arrow_id})", "fill": "none",
     }))
-    parts.append(_svg_text(
-        cx, 482,
-        "vision states stay separate; decoder layers read them with cross-attention",
-        {"text-anchor": "middle", "fill": C["muted"], "font-family": FONT_MONO, "font-size": 12},
-    ))
     return _svg(w, h, f"{ir.get('name', 'model')} cross-attention adapter", parts)

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/video.py RENAMED Viewed

@@ -1,15 +1,12 @@
 """Video pathway detail SVG."""
 from __future__ import annotations
-from ...svg import _defs, _ids, _rect_block, _region_rect, _svg, _svg_tag, _svg_text, _v_line
-from ...theme import C, FONT_MONO
-from .common import video_input
+from ...svg import _defs, _ids, _rect_block, _region_rect, _svg, _svg_tag, _v_line
+from ...theme import C
 def build_video_path_view(ir: dict, info: dict, mount_id: str, _block: dict) -> str:
     """Video frames -> visual encoder -> grid-aware video token stream."""
-    video = video_input(ir)
-    grid = ((video.get("tokens") or {}).get("grid") or {})
     w, h = 720, 560
     arrow_id, shadow_id = _ids(mount_id, "video-path")
     parts = [_defs(arrow_id, shadow_id)]
@@ -30,12 +27,4 @@ def build_video_path_view(ir: dict, info: dict, mount_id: str, _block: dict) ->
         "stroke": C["arrow"], "stroke-width": 1.6, "stroke-linecap": "round",
         "marker-end": f"url(#{arrow_id})", "fill": "none",
     }))
-    if grid.get("runtime_input"):
-        parts.append(_svg_text(
-            cx, 490,
-            f"runtime grid: {grid['runtime_input']}",
-            {"text-anchor": "middle", "fill": C["muted"], "font-family": FONT_MONO, "font-size": 11},
-        ))
     return _svg(w, h, f"{ir.get('name', 'model')} video pathway", parts)

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/metadata.py RENAMED Viewed

@@ -186,6 +186,8 @@ def _group_label(group: dict, info: dict | None = None) -> str:
         bits.append(mask_short(attn))
     bits.append(kind_short(attn))
     bits.append("MoE" if ffn.get("kind") == "moe" else "Dense")
+    if _has_cross_attention_adapter(group["spec"]):
+        bits.append("Vision XAttn")
     return f"{' · '.join(bits)}  ({_indices_summary(group, info)})"
@@ -233,10 +235,18 @@ def _signature(layer: dict) -> str:
             ffn.get("num_experts"),
             layer.get("norm_kind"),
             layer.get("norm_placement"),
+            _has_cross_attention_adapter(layer),
         )
     )
+def _has_cross_attention_adapter(layer: dict) -> bool:
+    return any(
+        block.get("id") == "cross_attention_adapter"
+        for block in layer.get("blocks", []) or []
+    )
 def _arch_badges(ir: dict, info: dict) -> list[dict[str, str]]:
     badges: list[dict[str, str]] = []
     attention = info["dominant"]["spec"]["attention"]

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/metadata_modalities.py RENAMED Viewed

@@ -351,6 +351,7 @@ def _fusion_description(fusion: dict) -> str:
         bits = ["cross attention", "vision states condition selected decoder layers"]
         if n_layers:
             bits.append(f"{_fmt_int(n_layers)} cross-attention layers")
+        bits.append("vision states stay separate")
         if width:
             bits.append(f"decoder width {_fmt_int(width)}")
         return "; ".join(bits)
@@ -390,7 +391,10 @@ def _fusion_children(fusion: dict, inputs: dict) -> list[dict]:
             {
                 "id": "cross_attention_adapter",
                 "title": "Cross-attention adapter layers",
-                "description": f"Vision context is read by {layers_desc}; it is not inserted as replacement text embeddings.",
+                "description": (
+                    f"Vision states stay separate; {layers_desc} read them with "
+                    "cross-attention instead of inserting them as replacement text embeddings."
+                ),
             },
             {
                 "id": "stack_input",

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/views.py RENAMED Viewed

@@ -71,14 +71,18 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
     fusion_spec = modalities.get("fusion") or {}
     has_modality_fusion = bool(modality_inputs) and bool(fusion_spec)
     has_cross_attention_fusion = has_modality_fusion and fusion_spec.get("kind") == "cross_attention"
+    has_external_side_stream = any(block.get("source_id") for block in side_blocks)
-    inner_x, inner_w = (230, 500) if has_cross_attention_fusion else (110, 500)
+    modality_count = len(modality_inputs)
+    has_wide_modality_scaffold = has_modality_fusion and modality_count >= 3
+    needs_wide_arch = has_external_side_stream or has_wide_modality_scaffold
+    inner_x, inner_w = (230, 500) if needs_wide_arch else (110, 500)
     # Default chain center.  Auto-shift right when a side_align="tap" block on
     # the left lane would overlap the widest chain block at the default cx.
     # This handles parallel-residual architectures (e.g. GPT-NeoX / GPT-J) where
     # FFN and Attention share the same y-row without any renderer special-casing.
-    cx = 480 if has_cross_attention_fusion else 360
+    cx = 480 if needs_wide_arch else 360
     _tap_left = [
         b for b in side_blocks
         if b.get("side_align") == "tap" and b.get("lane") == "left"
@@ -100,10 +104,12 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
     inner_h = max(490, stack_h + 2 * inner_padding)
     inner_y = 200
-    modality_count = len(modality_inputs)
     has_audio_fusion = has_modality_fusion and "audio" in modality_inputs
-    h = inner_y + inner_h + (360 if has_audio_fusion else 292 if has_modality_fusion else 232)
-    w = 960 if has_cross_attention_fusion or modality_count >= 3 else 720
+    if has_modality_fusion and not has_cross_attention_fusion:
+        h = inner_y + inner_h + (360 if has_audio_fusion else 292)
+    else:
+        h = inner_y + inner_h + 232
+    w = 960 if needs_wide_arch else 720
     arrow_id, shadow_id = _ids(mount_id, "arch")
     parts = [_defs(arrow_id, shadow_id)]
@@ -136,14 +142,16 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
     y_cursor = inner_y + inner_h - free / 2
     for block in chain_blocks:
         layout = _KIND_LAYOUT.get(block["kind"]) or _KIND_LAYOUT["norm"]
-        block_h = layout["h"]
+        block_w = block.get("w") or layout["w"]
+        block_h = block.get("h") or layout["h"]
+        font_size = block.get("font") or layout.get("font", 16)
         top = y_cursor - block_h
         if layout["shape"] == "rect":
             geom = _rect_block(
                 parts, info, shadow_id, block["id"],
-                cx - layout["w"] / 2, top, layout["w"], block_h,
+                cx - block_w / 2, top, block_w, block_h,
                 _block_label(info, block["id"], block.get("label")),
-                font_size=layout["font"],
+                font_size=font_size,
             )
         else:
             geom = _plus_block(
@@ -211,7 +219,10 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
 def _layer_stack_height(layer_blocks: list[dict]) -> int:
     if not layer_blocks:
         return 0
-    total = sum(_KIND_LAYOUT.get(b["kind"], _KIND_LAYOUT["norm"])["h"] for b in layer_blocks)
+    total = sum(
+        b.get("h") or _KIND_LAYOUT.get(b["kind"], _KIND_LAYOUT["norm"])["h"]
+        for b in layer_blocks
+    )
     total += _BLOCK_GAP * (len(layer_blocks) - 1)
     return total
@@ -235,14 +246,22 @@ def _draw_side_block(
     horizontal arrow into the ``feeds`` target.
     """
     layout = _KIND_LAYOUT.get(block["kind"]) or _KIND_LAYOUT["norm"]
-    block_w = layout["w"]
-    block_h = layout["h"]
+    block_w = block.get("w") or layout["w"]
+    block_h = block.get("h") or layout["h"]
+    font_size = block.get("font") or layout.get("font", 16)
     lane = block.get("lane", "left")
     feeds_id = block.get("feeds")
     tap_id = block.get("tap_from")
     feeds_geom = block_pos.get(feeds_id) if feeds_id else None
     tap_geom = block_pos.get(tap_id) if tap_id else None
+    if feeds_geom and block.get("source_id"):
+        _draw_external_side_block(
+            parts, info, shadow_id, block, feeds_geom,
+            inner_x, inner_w, arrow_id, block_pos,
+            block_w, block_h, font_size,
+        )
+        return
     if not feeds_geom or not tap_geom:
         return  # mis-declared; nothing to anchor to
@@ -263,7 +282,7 @@ def _draw_side_block(
         parts, info, shadow_id, block["id"],
         block_x, top, block_w, block_h,
         _block_label(info, block["id"], block.get("label")),
-        font_size=layout["font"],
+        font_size=font_size,
     )
     block_pos[block["id"]] = geom
@@ -308,6 +327,77 @@ def _draw_side_block(
         }))
+def _draw_external_side_block(
+    parts: list[str],
+    info: dict,
+    shadow_id: str,
+    block: dict,
+    feeds_geom: dict,
+    inner_x: float,
+    _inner_w: float,
+    arrow_id: str,
+    block_pos: dict,
+    block_w: float,
+    block_h: float,
+    font_size: int,
+) -> None:
+    """Draw a layer-local side stream, e.g. vision states into cross-attention."""
+    lane = block.get("lane", "external_left")
+    if lane.endswith("left"):
+        block_x = max(56, inner_x - block_w - 34)
+        target_x = feeds_geom["left"] - GAP
+    else:
+        block_x = inner_x + _inner_w + 34
+        target_x = feeds_geom["right"] + GAP
+    cy = feeds_geom["cy"] + float(block.get("offset_y", 28))
+    top = cy - block_h / 2
+    geom = _rect_block(
+        parts, info, shadow_id, block["id"],
+        block_x, top, block_w, block_h,
+        _block_label(info, block["id"], block.get("label")),
+        font_size=font_size,
+    )
+    block_pos[block["id"]] = geom
+    if lane.endswith("left"):
+        route_x = (geom["right"] + target_x) / 2 if geom["right"] < target_x else target_x - 44
+    else:
+        route_x = (geom["left"] + target_x) / 2 if geom["left"] > target_x else target_x + 44
+    source_id = block.get("source_id")
+    source_w = block.get("source_w") or 230
+    source_h = block.get("source_h") or 46
+    source_gap = block.get("source_gap") or 56
+    source_x = geom["cx"] - source_w / 2
+    source_top = geom["bottom"] + source_gap
+    source = _rect_block(
+        parts, info, shadow_id, source_id,
+        source_x, source_top, source_w, source_h,
+        _block_label(info, source_id, block.get("source_label", source_id)),
+        font_size=font_size,
+    )
+    block_pos[source_id] = source
+    parts.append(_v_line(source, geom, arrow_id))
+    # Route out with a visible 90-degree turn so the adapter reads as an
+    # external conditioning path, not another central-chain block.
+    x_start = geom["right"] if lane.endswith("left") else geom["left"]
+    parts.append(_svg_tag("path", {
+        "d": (
+            f"M {x_start} {geom['cy']} "
+            f"L {route_x} {geom['cy']} "
+            f"L {route_x} {feeds_geom['cy']} "
+            f"L {target_x} {feeds_geom['cy']}"
+        ),
+        "stroke": C["arrow"],
+        "stroke-width": 1.6,
+        "stroke-linecap": "round",
+        "stroke-linejoin": "round",
+        "marker-end": f"url(#{arrow_id})",
+        "fill": "none",
+    }))
 def _mark_branch_tap(
     parts: list[str],
     branch_taps: set[tuple[float, float]],

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/views_modalities.py RENAMED Viewed

@@ -2,8 +2,8 @@
 from __future__ import annotations
 from .metadata import _block_label
-from .svg import _block_top_to_block_bottom, _rect_block, _svg_tag, _v_line
-from .theme import C, GAP
+from .svg import _block_top_to_block_bottom, _rect_block, _v_line
+from .theme import GAP
 def draw_multimodal_input_scaffold(
@@ -40,14 +40,26 @@ def draw_multimodal_input_scaffold(
         route_specs.append(("audio_path", _block_label(info, "audio_path", "Audio -> tokens")))
     multi_route = len(route_specs) > 1
     if multi_route:
-        text_x = cx if len(route_specs) == 2 else cx - 300
         text_w = 230
-        route_w = 170
         modality_y = embed_y
+        route_ys: list[float]
         if len(route_specs) == 2:
+            text_x = cx
+            route_w = 170
             route_centers = [cx - 215, cx + 215]
+            route_ys = [modality_y, modality_y]
         else:
-            route_centers = [cx - 70, cx + 150, cx + 370]
+            text_x = cx
+            route_w = 170
+            if len(route_specs) == 3:
+                route_centers = [cx - 300, cx + 210, cx + 350]
+                route_ys = [embed_y, embed_y, tok_y]
+            else:
+                span_left = cx - 345
+                span_right = cx + 350
+                step = (span_right - span_left) / max(1, len(route_specs) - 1)
+                route_centers = [span_left + i * step for i in range(len(route_specs))]
+                route_ys = [modality_y for _ in route_specs]
     else:
         text_x = cx - 155
         modality_x = cx + 155
@@ -55,6 +67,7 @@ def draw_multimodal_input_scaffold(
         route_w = 210
         modality_y = embed_y
         route_centers = [modality_x]
+        route_ys = [modality_y]
     tok_text = _rect_block(
         parts, info, shadow_id, "tok_text",
@@ -79,15 +92,22 @@ def draw_multimodal_input_scaffold(
         fusion["bottom"] + GAP,
         arrow_id,
     ))
-    route_targets = (
-        [fusion["cx"] - 96, fusion["cx"] + 96] if len(route_specs) == 2
-        else [fusion["cx"] - 112, fusion["cx"], fusion["cx"] + 112] if len(route_specs) >= 3
-        else [fusion["cx"] + 56]
-    )
-    for (node_id, label), x, target_x in zip(route_specs, route_centers, route_targets):
+    if len(route_specs) == 2:
+        route_targets = [fusion["cx"] - 96, fusion["cx"] + 96]
+    elif len(route_specs) == 3:
+        route_targets = [fusion["cx"] - 112, fusion["cx"], fusion["cx"] + 112]
+    elif len(route_specs) > 3:
+        span_left = fusion["cx"] - 132
+        span_right = fusion["cx"] + 132
+        step = (span_right - span_left) / max(1, len(route_specs) - 1)
+        route_targets = [span_left + i * step for i in range(len(route_specs))]
+    else:
+        route_targets = [fusion["cx"] + 56]
+    for (node_id, label), x, y, target_x in zip(route_specs, route_centers, route_ys, route_targets):
         route = _rect_block(
             parts, info, shadow_id, node_id,
-            x - route_w / 2, modality_y, route_w, 44,
+            x - route_w / 2, y, route_w, 44,
             label, font_size=16,
         )
         parts.append(_block_top_to_block_bottom(
@@ -108,15 +128,9 @@ def draw_cross_attention_input_scaffold(
     inner_y: float,
     inner_h: float,
 ) -> tuple[dict, dict, dict]:
-    """Draw visual context as a side stream into decoder cross-attention."""
-    embed_y = inner_y + inner_h + 132
+    """Draw the text stream; vision side states appear only on cross-attn variants."""
+    embed_y = inner_y + inner_h + 64
     tok_y = embed_y + 66
-    stack_side_y = inner_y + inner_h - 110
-    adapter_y = embed_y - 78
-    vision_y = adapter_y + 96
-    side_cx = 220
-    adapter_w = 270
-    vision_w = 230
     tok_text = _rect_block(
         parts, info, shadow_id, "tok_text",
@@ -128,34 +142,6 @@ def draw_cross_attention_input_scaffold(
         cx - 125, embed_y, 250, 44,
         _block_label(info, "embed", "Token Embedding"), font_size=16,
     )
-    vision = _rect_block(
-        parts, info, shadow_id, "vision_path",
-        side_cx - vision_w / 2, vision_y, vision_w, 46,
-        _block_label(info, "vision_path", "Vision context"), font_size=17,
-    )
-    adapter = _rect_block(
-        parts, info, shadow_id, "fusion",
-        side_cx - adapter_w / 2, adapter_y, adapter_w, 54,
-        _block_label(info, "fusion", "Cross-attention adapter"), font_size=17,
-    )
     parts.append(_v_line(tok_text, embed, arrow_id))
-    parts.append(_v_line(vision, adapter, arrow_id))
-    target_x = cx - 122
-    target_y = stack_side_y - 48
-    parts.append(_svg_tag("path", {
-        "d": (
-            f"M {adapter['cx']} {adapter['top']} "
-            f"L {adapter['cx']} {target_y} "
-            f"L {target_x} {target_y}"
-        ),
-        "stroke": C["arrow"],
-        "stroke-width": 1.6,
-        "stroke-linecap": "round",
-        "stroke-linejoin": "round",
-        "marker-end": f"url(#{arrow_id})",
-        "fill": "none",
-    }))
     return tok_text, embed, embed

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: model-unfolder
-Version: 0.2.8
+Version: 0.2.9
 Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
 Author: model-unfolder contributors
 License: Apache-2.0

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "model-unfolder"
-version = "0.2.8"
+version = "0.2.9"
 description = "Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter."
 readme = "README.md"
 requires-python = ">=3.9"

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/tests/test_expanded_json.py RENAMED Viewed

@@ -425,6 +425,29 @@ def test_expanded_json_supports_mllama_cross_attention_vision():
     }
+def test_mllama_cross_attention_adapter_is_layer_variant_only():
+    diagram = unfold(MLLAMA_VISION_TINY_CONFIG)
+    ir = diagram.to_ir()
+    assert not any(
+        block.get("id") == "cross_attention_adapter"
+        for block in ir["layers"][0]["blocks"]
+    )
+    assert any(
+        block.get("id") == "cross_attention_adapter"
+        for block in ir["layers"][3]["blocks"]
+    )
+    assert [
+        i for i, layer in enumerate(ir["layers"])
+        if any(block.get("id") == "cross_attention_adapter" for block in layer["blocks"])
+    ] == [3, 8, 13, 18, 23, 28, 33, 38]
+    html = diagram.to_html(standalone=False)
+    assert "Vision XAttn" in html
+    assert "Cross-attention adapter" in html
+    assert "Vision states stay separate" in html
 def test_expanded_json_supports_qwen_style_unified_grid_stream():
     data = unfold(QWEN2_VL_TINY_CONFIG, return_json=True)
     encoded = json.dumps(data["modalities"])

{model_unfolder-0.2.8 → model_unfolder-0.2.9}/tests/test_smoke.py RENAMED Viewed

@@ -544,6 +544,21 @@ def test_gemma4_multimodal_fusion_render():
     assert 'data-card-id="fusion_mixed_stream"' in html
+def test_gemma4_video_token_does_not_create_grid_video_path():
+    cfg = _gemma4_e2b_vision_config()
+    cfg.update({"video_token_id": 258884, "video_seq_length": 64})
+    d = unfold(cfg)
+    modalities = d.to_ir()["extras"]["modalities"]["inputs"]
+    assert "vision" in modalities
+    assert "audio" in modalities
+    assert "video" not in modalities
+    html = d.to_html(standalone=True)
+    assert "Video -&gt; grid" not in html
+    assert 'data-card-id="video_path"' not in html
 def test_qwen2_audio_sparse_text_config_is_completed():
     d = unfold(QWEN2_AUDIO_SPARSE_CONFIG)
     ir = d.to_ir()