model-unfolder 0.2.8__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/PKG-INFO +1 -1
  2. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/parser.py +42 -0
  3. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/builder.py +3 -3
  4. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/detect.py +0 -3
  5. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/ir.py +1 -0
  6. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/fusion_cross_attention.py +0 -6
  7. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/video.py +2 -13
  8. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/metadata.py +10 -0
  9. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/metadata_modalities.py +5 -1
  10. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/views.py +102 -12
  11. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/views_modalities.py +34 -48
  12. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder.egg-info/PKG-INFO +1 -1
  13. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/pyproject.toml +1 -1
  14. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/tests/test_expanded_json.py +23 -0
  15. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/tests/test_smoke.py +15 -0
  16. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/LICENSE +0 -0
  17. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/README.md +0 -0
  18. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/__init__.py +0 -0
  19. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/__init__.py +0 -0
  20. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/custom/__init__.py +0 -0
  21. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/diffusor/__init__.py +0 -0
  22. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/__init__.py +0 -0
  23. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/assembly.py +0 -0
  24. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/blocks/__init__.py +0 -0
  25. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/blocks/attention.py +0 -0
  26. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/blocks/descriptions.py +0 -0
  27. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/blocks/feed_forward.py +0 -0
  28. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/blocks/layers.py +0 -0
  29. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/blocks/model.py +0 -0
  30. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/common.py +0 -0
  31. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/__init__.py +0 -0
  32. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/__init__.py +0 -0
  33. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/accessors.py +0 -0
  34. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/audio.py +0 -0
  35. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/fusion.py +0 -0
  36. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/schema.py +0 -0
  37. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/modalities/vision.py +0 -0
  38. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py +0 -0
  39. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/diagram.py +0 -0
  40. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/evidence/__init__.py +0 -0
  41. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/evidence/ast_scanner.py +0 -0
  42. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/evidence/inspector.py +0 -0
  43. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/evidence/models.py +0 -0
  44. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/evidence/patterns.py +0 -0
  45. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/evidence/sources.py +0 -0
  46. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/evidence/validate.py +0 -0
  47. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/__init__.py +0 -0
  48. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/attention.py +0 -0
  49. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/block_graph.py +0 -0
  50. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/code_evidence.py +0 -0
  51. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/ffn.py +0 -0
  52. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/grouping.py +0 -0
  53. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/layer_group.py +0 -0
  54. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/modalities.py +0 -0
  55. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/norms.py +0 -0
  56. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/ops.py +0 -0
  57. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/pathways.py +0 -0
  58. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/residual.py +0 -0
  59. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/sections.py +0 -0
  60. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/stack.py +0 -0
  61. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/expanded/utils.py +0 -0
  62. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/html_renderer.py +0 -0
  63. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/labels.py +0 -0
  64. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/params.py +0 -0
  65. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/parser.py +0 -0
  66. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/__init__.py +0 -0
  67. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/__init__.py +0 -0
  68. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/__init__.py +0 -0
  69. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention.py +0 -0
  70. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/__init__.py +0 -0
  71. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/common.py +0 -0
  72. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/grouped_query.py +0 -0
  73. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/latent.py +0 -0
  74. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/linear.py +0 -0
  75. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/multi_head.py +0 -0
  76. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/multi_query.py +0 -0
  77. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/rwkv.py +0 -0
  78. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/sliding_window.py +0 -0
  79. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/attention_types/state_space.py +0 -0
  80. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/feed_forward.py +0 -0
  81. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/mixture_of_experts.py +0 -0
  82. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modalities.py +0 -0
  83. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/__init__.py +0 -0
  84. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/audio.py +0 -0
  85. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/common.py +0 -0
  86. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/fusion_grid.py +0 -0
  87. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/fusion_placeholder.py +0 -0
  88. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/modality_views/vision.py +0 -0
  89. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/per_layer_embedding.py +0 -0
  90. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/block_views/registry.py +0 -0
  91. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/cards.py +0 -0
  92. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/document.py +0 -0
  93. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/evidence.py +0 -0
  94. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/interactions.py +0 -0
  95. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/sections.py +0 -0
  96. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/styles.py +0 -0
  97. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/svg.py +0 -0
  98. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/theme.py +0 -0
  99. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder/renderers/html/utils.py +0 -0
  100. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder.egg-info/SOURCES.txt +0 -0
  101. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder.egg-info/dependency_links.txt +0 -0
  102. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder.egg-info/requires.txt +0 -0
  103. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/model_unfolder.egg-info/top_level.txt +0 -0
  104. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/setup.cfg +0 -0
  105. {model_unfolder-0.2.8 → model_unfolder-0.2.9}/tests/test_code_evidence.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: model-unfolder
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
5
5
  Author: model-unfolder contributors
6
6
  License: Apache-2.0
@@ -26,6 +26,7 @@ Detection is config-driven:
26
26
  * ``num_local_experts`` / ``n_routed_experts`` → MoE FFN
27
27
  * ``n_shared_experts`` / ``num_shared_experts`` → shared experts
28
28
  * ``clip_qkv`` → attention extras
29
+ * ``cross_attention_layers`` → vision side-attention layers
29
30
 
30
31
  Warnings policy: warn only for *specific* config problems (missing
31
32
  critical field, unrecognized layer_type value, …). Never warn just
@@ -43,6 +44,7 @@ from .special_parts.per_layer_embedding import (
43
44
  per_layer_embedding_extras,
44
45
  )
45
46
  from .special_parts.modalities import multimodal_extras
47
+ from .special_parts.modalities.detect import cross_attention_layers as _cross_attention_layers
46
48
 
47
49
 
48
50
  # ---------------------------------------------------------------------------
@@ -252,6 +254,13 @@ def parse(cfg: Any) -> ModelIR:
252
254
  ple_dim = _g(text_cfg, "hidden_size_per_layer_input") or 0
253
255
  ple_vocab = _g(text_cfg, "vocab_size_per_layer_input") or get("vocab_size", 0)
254
256
 
257
+ # ---- Decoder layers that read external modality states through cross-attention ----
258
+ cross_attn_layer_set = set(_cross_attention_layers(cfg, text_cfg) or [])
259
+ has_cross_attention_side_state = bool(
260
+ cross_attn_layer_set
261
+ and (_g(cfg, "vision_config") is not None or _g(cfg, "vision_model_config") is not None)
262
+ )
263
+
255
264
  # ---- Walk the layer stack ----
256
265
  unknown_layer_types: set[str] = set()
257
266
  cross_layer_edges: list[CrossLayerEdge] = []
@@ -325,6 +334,8 @@ def parse(cfg: Any) -> ModelIR:
325
334
  )
326
335
 
327
336
  extra_blocks = list(per_layer_embedding_blocks(hidden_size, ple_dim, activation="gelu")) if ple_dim else []
337
+ if has_cross_attention_side_state and i in cross_attn_layer_set:
338
+ extra_blocks.append(_cross_attention_adapter_block(cross_attn_layer_set))
328
339
 
329
340
  if use_parallel_residual:
330
341
  layers.append(parallel_decoder_layer(i, attn, ffn, hidden_size, norm_kind=norm_kind))
@@ -551,3 +562,34 @@ def _last_matching_layer(layer_types, i: int, first_shared: int) -> int | None:
551
562
  if layer_types[j] == target_type:
552
563
  return j
553
564
  return None
565
+
566
+
567
+ def _cross_attention_adapter_block(layer_indices: set[int]) -> dict:
568
+ """Layer-local side block for decoder layers that read vision states."""
569
+ layers = sorted(layer_indices)
570
+ if layers:
571
+ layer_label = ", ".join(f"L{i}" for i in layers[:6])
572
+ if len(layers) > 6:
573
+ layer_label += ", ..."
574
+ layer_desc = f"active on {layer_label}"
575
+ else:
576
+ layer_desc = "active on selected decoder layers"
577
+ return {
578
+ "id": "cross_attention_adapter",
579
+ "role": "fusion",
580
+ "kind": "fusion",
581
+ "lane": "external_left",
582
+ "feeds": "attn",
583
+ "source_id": "vision_path",
584
+ "source_label": "Vision context",
585
+ "label": "Cross-attention adapter",
586
+ "title": "Cross-attention adapter",
587
+ "description": (
588
+ "Vision states stay separate; this decoder layer reads them with "
589
+ f"cross-attention; {layer_desc}."
590
+ ),
591
+ "detail_view": "multimodal_fusion",
592
+ "w": 270,
593
+ "h": 54,
594
+ "font": 16,
595
+ }
@@ -5,9 +5,10 @@ from typing import Any
5
5
 
6
6
  from .accessors import nested
7
7
  from .audio import audio_path
8
+ from .detect import has_video_input, is_unified_grid_stream
8
9
  from .fusion import fusion_path
9
10
  from .schema import multimodal_payload
10
- from .vision import has_video_input, video_path, vision_path
11
+ from .vision import video_path, vision_path
11
12
 
12
13
 
13
14
  def multimodal_extras(cfg: Any, text_cfg: Any, text_hidden_size: int) -> dict | None:
@@ -18,7 +19,7 @@ def multimodal_extras(cfg: Any, text_cfg: Any, text_hidden_size: int) -> dict |
18
19
  modalities: dict[str, Any] = {}
19
20
  if vision_cfg is not None:
20
21
  modalities["vision"] = vision_path(cfg, text_cfg, vision_cfg, text_hidden_size)
21
- if has_video_input(cfg):
22
+ if has_video_input(cfg) and is_unified_grid_stream(cfg, vision_cfg):
22
23
  modalities["video"] = video_path(cfg, vision_cfg, text_hidden_size)
23
24
  if audio_cfg is not None:
24
25
  modalities["audio"] = audio_path(cfg, audio_cfg, text_hidden_size)
@@ -30,4 +31,3 @@ def multimodal_extras(cfg: Any, text_cfg: Any, text_hidden_size: int) -> dict |
30
31
 
31
32
 
32
33
  __all__ = ["multimodal_extras"]
33
-
@@ -34,8 +34,6 @@ def is_unified_grid_stream(cfg: Any, vision_cfg: Any | None = None) -> bool:
34
34
  "multimodal_rope",
35
35
  }:
36
36
  return True
37
- if first(cfg, "vision_start_token_id", "vision_end_token_id") is not None and has_video_input(cfg):
38
- return True
39
37
  if vision_cfg is not None and first(vision_cfg, "spatial_merge_size", "temporal_patch_size") is not None:
40
38
  return True
41
39
  return model_family_hint(cfg) == "qwen_vl"
@@ -179,4 +177,3 @@ def model_family_hint(cfg: Any) -> str | None:
179
177
  if mt == "pixtral" or "pixtral" in arch:
180
178
  return "pixtral"
181
179
  return None
182
-
@@ -61,6 +61,7 @@ class LayerSpec:
61
61
  a.qk_norm, a.shared, a.no_rope,
62
62
  f.kind, f.gated, f.num_experts,
63
63
  self.norm_kind, self.norm_placement,
64
+ any(block.get("id") == "cross_attention_adapter" for block in self.blocks),
64
65
  )
65
66
 
66
67
 
@@ -53,10 +53,4 @@ def build_cross_attention_fusion_view(ir: dict, info: dict, mount_id: str, fusio
53
53
  "marker-end": f"url(#{arrow_id})", "fill": "none",
54
54
  }))
55
55
 
56
- parts.append(_svg_text(
57
- cx, 482,
58
- "vision states stay separate; decoder layers read them with cross-attention",
59
- {"text-anchor": "middle", "fill": C["muted"], "font-family": FONT_MONO, "font-size": 12},
60
- ))
61
56
  return _svg(w, h, f"{ir.get('name', 'model')} cross-attention adapter", parts)
62
-
@@ -1,15 +1,12 @@
1
1
  """Video pathway detail SVG."""
2
2
  from __future__ import annotations
3
3
 
4
- from ...svg import _defs, _ids, _rect_block, _region_rect, _svg, _svg_tag, _svg_text, _v_line
5
- from ...theme import C, FONT_MONO
6
- from .common import video_input
4
+ from ...svg import _defs, _ids, _rect_block, _region_rect, _svg, _svg_tag, _v_line
5
+ from ...theme import C
7
6
 
8
7
 
9
8
  def build_video_path_view(ir: dict, info: dict, mount_id: str, _block: dict) -> str:
10
9
  """Video frames -> visual encoder -> grid-aware video token stream."""
11
- video = video_input(ir)
12
- grid = ((video.get("tokens") or {}).get("grid") or {})
13
10
  w, h = 720, 560
14
11
  arrow_id, shadow_id = _ids(mount_id, "video-path")
15
12
  parts = [_defs(arrow_id, shadow_id)]
@@ -30,12 +27,4 @@ def build_video_path_view(ir: dict, info: dict, mount_id: str, _block: dict) ->
30
27
  "stroke": C["arrow"], "stroke-width": 1.6, "stroke-linecap": "round",
31
28
  "marker-end": f"url(#{arrow_id})", "fill": "none",
32
29
  }))
33
- if grid.get("runtime_input"):
34
- parts.append(_svg_text(
35
- cx, 490,
36
- f"runtime grid: {grid['runtime_input']}",
37
- {"text-anchor": "middle", "fill": C["muted"], "font-family": FONT_MONO, "font-size": 11},
38
- ))
39
-
40
30
  return _svg(w, h, f"{ir.get('name', 'model')} video pathway", parts)
41
-
@@ -186,6 +186,8 @@ def _group_label(group: dict, info: dict | None = None) -> str:
186
186
  bits.append(mask_short(attn))
187
187
  bits.append(kind_short(attn))
188
188
  bits.append("MoE" if ffn.get("kind") == "moe" else "Dense")
189
+ if _has_cross_attention_adapter(group["spec"]):
190
+ bits.append("Vision XAttn")
189
191
  return f"{' · '.join(bits)} ({_indices_summary(group, info)})"
190
192
 
191
193
 
@@ -233,10 +235,18 @@ def _signature(layer: dict) -> str:
233
235
  ffn.get("num_experts"),
234
236
  layer.get("norm_kind"),
235
237
  layer.get("norm_placement"),
238
+ _has_cross_attention_adapter(layer),
236
239
  )
237
240
  )
238
241
 
239
242
 
243
+ def _has_cross_attention_adapter(layer: dict) -> bool:
244
+ return any(
245
+ block.get("id") == "cross_attention_adapter"
246
+ for block in layer.get("blocks", []) or []
247
+ )
248
+
249
+
240
250
  def _arch_badges(ir: dict, info: dict) -> list[dict[str, str]]:
241
251
  badges: list[dict[str, str]] = []
242
252
  attention = info["dominant"]["spec"]["attention"]
@@ -351,6 +351,7 @@ def _fusion_description(fusion: dict) -> str:
351
351
  bits = ["cross attention", "vision states condition selected decoder layers"]
352
352
  if n_layers:
353
353
  bits.append(f"{_fmt_int(n_layers)} cross-attention layers")
354
+ bits.append("vision states stay separate")
354
355
  if width:
355
356
  bits.append(f"decoder width {_fmt_int(width)}")
356
357
  return "; ".join(bits)
@@ -390,7 +391,10 @@ def _fusion_children(fusion: dict, inputs: dict) -> list[dict]:
390
391
  {
391
392
  "id": "cross_attention_adapter",
392
393
  "title": "Cross-attention adapter layers",
393
- "description": f"Vision context is read by {layers_desc}; it is not inserted as replacement text embeddings.",
394
+ "description": (
395
+ f"Vision states stay separate; {layers_desc} read them with "
396
+ "cross-attention instead of inserting them as replacement text embeddings."
397
+ ),
394
398
  },
395
399
  {
396
400
  "id": "stack_input",
@@ -71,14 +71,18 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
71
71
  fusion_spec = modalities.get("fusion") or {}
72
72
  has_modality_fusion = bool(modality_inputs) and bool(fusion_spec)
73
73
  has_cross_attention_fusion = has_modality_fusion and fusion_spec.get("kind") == "cross_attention"
74
+ has_external_side_stream = any(block.get("source_id") for block in side_blocks)
74
75
 
75
- inner_x, inner_w = (230, 500) if has_cross_attention_fusion else (110, 500)
76
+ modality_count = len(modality_inputs)
77
+ has_wide_modality_scaffold = has_modality_fusion and modality_count >= 3
78
+ needs_wide_arch = has_external_side_stream or has_wide_modality_scaffold
79
+ inner_x, inner_w = (230, 500) if needs_wide_arch else (110, 500)
76
80
 
77
81
  # Default chain center. Auto-shift right when a side_align="tap" block on
78
82
  # the left lane would overlap the widest chain block at the default cx.
79
83
  # This handles parallel-residual architectures (e.g. GPT-NeoX / GPT-J) where
80
84
  # FFN and Attention share the same y-row without any renderer special-casing.
81
- cx = 480 if has_cross_attention_fusion else 360
85
+ cx = 480 if needs_wide_arch else 360
82
86
  _tap_left = [
83
87
  b for b in side_blocks
84
88
  if b.get("side_align") == "tap" and b.get("lane") == "left"
@@ -100,10 +104,12 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
100
104
  inner_h = max(490, stack_h + 2 * inner_padding)
101
105
 
102
106
  inner_y = 200
103
- modality_count = len(modality_inputs)
104
107
  has_audio_fusion = has_modality_fusion and "audio" in modality_inputs
105
- h = inner_y + inner_h + (360 if has_audio_fusion else 292 if has_modality_fusion else 232)
106
- w = 960 if has_cross_attention_fusion or modality_count >= 3 else 720
108
+ if has_modality_fusion and not has_cross_attention_fusion:
109
+ h = inner_y + inner_h + (360 if has_audio_fusion else 292)
110
+ else:
111
+ h = inner_y + inner_h + 232
112
+ w = 960 if needs_wide_arch else 720
107
113
 
108
114
  arrow_id, shadow_id = _ids(mount_id, "arch")
109
115
  parts = [_defs(arrow_id, shadow_id)]
@@ -136,14 +142,16 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
136
142
  y_cursor = inner_y + inner_h - free / 2
137
143
  for block in chain_blocks:
138
144
  layout = _KIND_LAYOUT.get(block["kind"]) or _KIND_LAYOUT["norm"]
139
- block_h = layout["h"]
145
+ block_w = block.get("w") or layout["w"]
146
+ block_h = block.get("h") or layout["h"]
147
+ font_size = block.get("font") or layout.get("font", 16)
140
148
  top = y_cursor - block_h
141
149
  if layout["shape"] == "rect":
142
150
  geom = _rect_block(
143
151
  parts, info, shadow_id, block["id"],
144
- cx - layout["w"] / 2, top, layout["w"], block_h,
152
+ cx - block_w / 2, top, block_w, block_h,
145
153
  _block_label(info, block["id"], block.get("label")),
146
- font_size=layout["font"],
154
+ font_size=font_size,
147
155
  )
148
156
  else:
149
157
  geom = _plus_block(
@@ -211,7 +219,10 @@ def _build_architecture_view(ir: dict, info: dict, mount_id: str) -> str:
211
219
  def _layer_stack_height(layer_blocks: list[dict]) -> int:
212
220
  if not layer_blocks:
213
221
  return 0
214
- total = sum(_KIND_LAYOUT.get(b["kind"], _KIND_LAYOUT["norm"])["h"] for b in layer_blocks)
222
+ total = sum(
223
+ b.get("h") or _KIND_LAYOUT.get(b["kind"], _KIND_LAYOUT["norm"])["h"]
224
+ for b in layer_blocks
225
+ )
215
226
  total += _BLOCK_GAP * (len(layer_blocks) - 1)
216
227
  return total
217
228
 
@@ -235,14 +246,22 @@ def _draw_side_block(
235
246
  horizontal arrow into the ``feeds`` target.
236
247
  """
237
248
  layout = _KIND_LAYOUT.get(block["kind"]) or _KIND_LAYOUT["norm"]
238
- block_w = layout["w"]
239
- block_h = layout["h"]
249
+ block_w = block.get("w") or layout["w"]
250
+ block_h = block.get("h") or layout["h"]
251
+ font_size = block.get("font") or layout.get("font", 16)
240
252
  lane = block.get("lane", "left")
241
253
  feeds_id = block.get("feeds")
242
254
  tap_id = block.get("tap_from")
243
255
 
244
256
  feeds_geom = block_pos.get(feeds_id) if feeds_id else None
245
257
  tap_geom = block_pos.get(tap_id) if tap_id else None
258
+ if feeds_geom and block.get("source_id"):
259
+ _draw_external_side_block(
260
+ parts, info, shadow_id, block, feeds_geom,
261
+ inner_x, inner_w, arrow_id, block_pos,
262
+ block_w, block_h, font_size,
263
+ )
264
+ return
246
265
  if not feeds_geom or not tap_geom:
247
266
  return # mis-declared; nothing to anchor to
248
267
 
@@ -263,7 +282,7 @@ def _draw_side_block(
263
282
  parts, info, shadow_id, block["id"],
264
283
  block_x, top, block_w, block_h,
265
284
  _block_label(info, block["id"], block.get("label")),
266
- font_size=layout["font"],
285
+ font_size=font_size,
267
286
  )
268
287
  block_pos[block["id"]] = geom
269
288
 
@@ -308,6 +327,77 @@ def _draw_side_block(
308
327
  }))
309
328
 
310
329
 
330
+ def _draw_external_side_block(
331
+ parts: list[str],
332
+ info: dict,
333
+ shadow_id: str,
334
+ block: dict,
335
+ feeds_geom: dict,
336
+ inner_x: float,
337
+ _inner_w: float,
338
+ arrow_id: str,
339
+ block_pos: dict,
340
+ block_w: float,
341
+ block_h: float,
342
+ font_size: int,
343
+ ) -> None:
344
+ """Draw a layer-local side stream, e.g. vision states into cross-attention."""
345
+ lane = block.get("lane", "external_left")
346
+ if lane.endswith("left"):
347
+ block_x = max(56, inner_x - block_w - 34)
348
+ target_x = feeds_geom["left"] - GAP
349
+ else:
350
+ block_x = inner_x + _inner_w + 34
351
+ target_x = feeds_geom["right"] + GAP
352
+
353
+ cy = feeds_geom["cy"] + float(block.get("offset_y", 28))
354
+ top = cy - block_h / 2
355
+ geom = _rect_block(
356
+ parts, info, shadow_id, block["id"],
357
+ block_x, top, block_w, block_h,
358
+ _block_label(info, block["id"], block.get("label")),
359
+ font_size=font_size,
360
+ )
361
+ block_pos[block["id"]] = geom
362
+ if lane.endswith("left"):
363
+ route_x = (geom["right"] + target_x) / 2 if geom["right"] < target_x else target_x - 44
364
+ else:
365
+ route_x = (geom["left"] + target_x) / 2 if geom["left"] > target_x else target_x + 44
366
+
367
+ source_id = block.get("source_id")
368
+ source_w = block.get("source_w") or 230
369
+ source_h = block.get("source_h") or 46
370
+ source_gap = block.get("source_gap") or 56
371
+ source_x = geom["cx"] - source_w / 2
372
+ source_top = geom["bottom"] + source_gap
373
+ source = _rect_block(
374
+ parts, info, shadow_id, source_id,
375
+ source_x, source_top, source_w, source_h,
376
+ _block_label(info, source_id, block.get("source_label", source_id)),
377
+ font_size=font_size,
378
+ )
379
+ block_pos[source_id] = source
380
+ parts.append(_v_line(source, geom, arrow_id))
381
+
382
+ # Route out with a visible 90-degree turn so the adapter reads as an
383
+ # external conditioning path, not another central-chain block.
384
+ x_start = geom["right"] if lane.endswith("left") else geom["left"]
385
+ parts.append(_svg_tag("path", {
386
+ "d": (
387
+ f"M {x_start} {geom['cy']} "
388
+ f"L {route_x} {geom['cy']} "
389
+ f"L {route_x} {feeds_geom['cy']} "
390
+ f"L {target_x} {feeds_geom['cy']}"
391
+ ),
392
+ "stroke": C["arrow"],
393
+ "stroke-width": 1.6,
394
+ "stroke-linecap": "round",
395
+ "stroke-linejoin": "round",
396
+ "marker-end": f"url(#{arrow_id})",
397
+ "fill": "none",
398
+ }))
399
+
400
+
311
401
  def _mark_branch_tap(
312
402
  parts: list[str],
313
403
  branch_taps: set[tuple[float, float]],
@@ -2,8 +2,8 @@
2
2
  from __future__ import annotations
3
3
 
4
4
  from .metadata import _block_label
5
- from .svg import _block_top_to_block_bottom, _rect_block, _svg_tag, _v_line
6
- from .theme import C, GAP
5
+ from .svg import _block_top_to_block_bottom, _rect_block, _v_line
6
+ from .theme import GAP
7
7
 
8
8
 
9
9
  def draw_multimodal_input_scaffold(
@@ -40,14 +40,26 @@ def draw_multimodal_input_scaffold(
40
40
  route_specs.append(("audio_path", _block_label(info, "audio_path", "Audio -> tokens")))
41
41
  multi_route = len(route_specs) > 1
42
42
  if multi_route:
43
- text_x = cx if len(route_specs) == 2 else cx - 300
44
43
  text_w = 230
45
- route_w = 170
46
44
  modality_y = embed_y
45
+ route_ys: list[float]
47
46
  if len(route_specs) == 2:
47
+ text_x = cx
48
+ route_w = 170
48
49
  route_centers = [cx - 215, cx + 215]
50
+ route_ys = [modality_y, modality_y]
49
51
  else:
50
- route_centers = [cx - 70, cx + 150, cx + 370]
52
+ text_x = cx
53
+ route_w = 170
54
+ if len(route_specs) == 3:
55
+ route_centers = [cx - 300, cx + 210, cx + 350]
56
+ route_ys = [embed_y, embed_y, tok_y]
57
+ else:
58
+ span_left = cx - 345
59
+ span_right = cx + 350
60
+ step = (span_right - span_left) / max(1, len(route_specs) - 1)
61
+ route_centers = [span_left + i * step for i in range(len(route_specs))]
62
+ route_ys = [modality_y for _ in route_specs]
51
63
  else:
52
64
  text_x = cx - 155
53
65
  modality_x = cx + 155
@@ -55,6 +67,7 @@ def draw_multimodal_input_scaffold(
55
67
  route_w = 210
56
68
  modality_y = embed_y
57
69
  route_centers = [modality_x]
70
+ route_ys = [modality_y]
58
71
 
59
72
  tok_text = _rect_block(
60
73
  parts, info, shadow_id, "tok_text",
@@ -79,15 +92,22 @@ def draw_multimodal_input_scaffold(
79
92
  fusion["bottom"] + GAP,
80
93
  arrow_id,
81
94
  ))
82
- route_targets = (
83
- [fusion["cx"] - 96, fusion["cx"] + 96] if len(route_specs) == 2
84
- else [fusion["cx"] - 112, fusion["cx"], fusion["cx"] + 112] if len(route_specs) >= 3
85
- else [fusion["cx"] + 56]
86
- )
87
- for (node_id, label), x, target_x in zip(route_specs, route_centers, route_targets):
95
+ if len(route_specs) == 2:
96
+ route_targets = [fusion["cx"] - 96, fusion["cx"] + 96]
97
+ elif len(route_specs) == 3:
98
+ route_targets = [fusion["cx"] - 112, fusion["cx"], fusion["cx"] + 112]
99
+ elif len(route_specs) > 3:
100
+ span_left = fusion["cx"] - 132
101
+ span_right = fusion["cx"] + 132
102
+ step = (span_right - span_left) / max(1, len(route_specs) - 1)
103
+ route_targets = [span_left + i * step for i in range(len(route_specs))]
104
+ else:
105
+ route_targets = [fusion["cx"] + 56]
106
+
107
+ for (node_id, label), x, y, target_x in zip(route_specs, route_centers, route_ys, route_targets):
88
108
  route = _rect_block(
89
109
  parts, info, shadow_id, node_id,
90
- x - route_w / 2, modality_y, route_w, 44,
110
+ x - route_w / 2, y, route_w, 44,
91
111
  label, font_size=16,
92
112
  )
93
113
  parts.append(_block_top_to_block_bottom(
@@ -108,15 +128,9 @@ def draw_cross_attention_input_scaffold(
108
128
  inner_y: float,
109
129
  inner_h: float,
110
130
  ) -> tuple[dict, dict, dict]:
111
- """Draw visual context as a side stream into decoder cross-attention."""
112
- embed_y = inner_y + inner_h + 132
131
+ """Draw the text stream; vision side states appear only on cross-attn variants."""
132
+ embed_y = inner_y + inner_h + 64
113
133
  tok_y = embed_y + 66
114
- stack_side_y = inner_y + inner_h - 110
115
- adapter_y = embed_y - 78
116
- vision_y = adapter_y + 96
117
- side_cx = 220
118
- adapter_w = 270
119
- vision_w = 230
120
134
 
121
135
  tok_text = _rect_block(
122
136
  parts, info, shadow_id, "tok_text",
@@ -128,34 +142,6 @@ def draw_cross_attention_input_scaffold(
128
142
  cx - 125, embed_y, 250, 44,
129
143
  _block_label(info, "embed", "Token Embedding"), font_size=16,
130
144
  )
131
- vision = _rect_block(
132
- parts, info, shadow_id, "vision_path",
133
- side_cx - vision_w / 2, vision_y, vision_w, 46,
134
- _block_label(info, "vision_path", "Vision context"), font_size=17,
135
- )
136
- adapter = _rect_block(
137
- parts, info, shadow_id, "fusion",
138
- side_cx - adapter_w / 2, adapter_y, adapter_w, 54,
139
- _block_label(info, "fusion", "Cross-attention adapter"), font_size=17,
140
- )
141
145
 
142
146
  parts.append(_v_line(tok_text, embed, arrow_id))
143
- parts.append(_v_line(vision, adapter, arrow_id))
144
-
145
- target_x = cx - 122
146
- target_y = stack_side_y - 48
147
- parts.append(_svg_tag("path", {
148
- "d": (
149
- f"M {adapter['cx']} {adapter['top']} "
150
- f"L {adapter['cx']} {target_y} "
151
- f"L {target_x} {target_y}"
152
- ),
153
- "stroke": C["arrow"],
154
- "stroke-width": 1.6,
155
- "stroke-linecap": "round",
156
- "stroke-linejoin": "round",
157
- "marker-end": f"url(#{arrow_id})",
158
- "fill": "none",
159
- }))
160
147
  return tok_text, embed, embed
161
-
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: model-unfolder
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
5
5
  Author: model-unfolder contributors
6
6
  License: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "model-unfolder"
7
- version = "0.2.8"
7
+ version = "0.2.9"
8
8
  description = "Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -425,6 +425,29 @@ def test_expanded_json_supports_mllama_cross_attention_vision():
425
425
  }
426
426
 
427
427
 
428
+ def test_mllama_cross_attention_adapter_is_layer_variant_only():
429
+ diagram = unfold(MLLAMA_VISION_TINY_CONFIG)
430
+ ir = diagram.to_ir()
431
+
432
+ assert not any(
433
+ block.get("id") == "cross_attention_adapter"
434
+ for block in ir["layers"][0]["blocks"]
435
+ )
436
+ assert any(
437
+ block.get("id") == "cross_attention_adapter"
438
+ for block in ir["layers"][3]["blocks"]
439
+ )
440
+ assert [
441
+ i for i, layer in enumerate(ir["layers"])
442
+ if any(block.get("id") == "cross_attention_adapter" for block in layer["blocks"])
443
+ ] == [3, 8, 13, 18, 23, 28, 33, 38]
444
+
445
+ html = diagram.to_html(standalone=False)
446
+ assert "Vision XAttn" in html
447
+ assert "Cross-attention adapter" in html
448
+ assert "Vision states stay separate" in html
449
+
450
+
428
451
  def test_expanded_json_supports_qwen_style_unified_grid_stream():
429
452
  data = unfold(QWEN2_VL_TINY_CONFIG, return_json=True)
430
453
  encoded = json.dumps(data["modalities"])
@@ -544,6 +544,21 @@ def test_gemma4_multimodal_fusion_render():
544
544
  assert 'data-card-id="fusion_mixed_stream"' in html
545
545
 
546
546
 
547
+ def test_gemma4_video_token_does_not_create_grid_video_path():
548
+ cfg = _gemma4_e2b_vision_config()
549
+ cfg.update({"video_token_id": 258884, "video_seq_length": 64})
550
+
551
+ d = unfold(cfg)
552
+ modalities = d.to_ir()["extras"]["modalities"]["inputs"]
553
+ assert "vision" in modalities
554
+ assert "audio" in modalities
555
+ assert "video" not in modalities
556
+
557
+ html = d.to_html(standalone=True)
558
+ assert "Video -&gt; grid" not in html
559
+ assert 'data-card-id="video_path"' not in html
560
+
561
+
547
562
  def test_qwen2_audio_sparse_text_config_is_completed():
548
563
  d = unfold(QWEN2_AUDIO_SPARSE_CONFIG)
549
564
  ir = d.to_ir()
File without changes
File without changes
File without changes