model-unfolder 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/PKG-INFO +3 -2
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/README.md +2 -1
- model_unfolder-0.2.4/model_unfolder/adapters/transformer/families/__init__.py +13 -0
- model_unfolder-0.2.4/model_unfolder/adapters/transformer/families/fallback.py +192 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/families/llama.py +12 -13
- model_unfolder-0.2.4/model_unfolder/adapters/transformer/families/mistral.py +119 -0
- model_unfolder-0.2.4/model_unfolder/adapters/transformer/families/qwen.py +144 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/diagram.py +9 -1
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/ir.py +2 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder.egg-info/PKG-INFO +3 -2
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder.egg-info/SOURCES.txt +3 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/pyproject.toml +1 -1
- model_unfolder-0.2.3/model_unfolder/adapters/transformer/families/__init__.py +0 -12
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/LICENSE +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/custom/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/diffusor/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/assembly.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/blocks.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/common.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/families/deepseek.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/families/gemma4.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/special_parts/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/html_renderer.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/labels.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/params.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/parser.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/block_views/__init__.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/block_views/attention.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/block_views/feed_forward.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/block_views/per_layer_embedding.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/cards.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/document.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/interactions.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/metadata.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/sections.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/styles.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/svg.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/theme.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/utils.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/views.py +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder.egg-info/dependency_links.txt +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder.egg-info/requires.txt +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder.egg-info/top_level.txt +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/setup.cfg +0 -0
- {model_unfolder-0.2.3 → model_unfolder-0.2.4}/tests/test_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: model-unfolder
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
|
|
5
5
|
Author: model-unfolder contributors
|
|
6
6
|
License: Apache-2.0
|
|
@@ -110,7 +110,8 @@ Param estimates are close to published numbers — DeepSeek-V3 reports `~675B (~
|
|
|
110
110
|
|---|---|
|
|
111
111
|
| DeepSeek | DeepSeek-V2, DeepSeek-V3, Kimi K2 |
|
|
112
112
|
| Llama | Llama 3 / 3.1 / 3.2 / 3.3 |
|
|
113
|
-
| Mistral |
|
|
113
|
+
| Mistral | Mistral 7B, Mixtral 8x7B / 8x22B, Mistral Medium 3.5 |
|
|
114
|
+
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6 (hybrid linear+full attn) |
|
|
114
115
|
| Gemma | Gemma 3, Gemma 4 (31B, E2B, E4B) |
|
|
115
116
|
|
|
116
117
|
### Diffusors
|
|
@@ -86,7 +86,8 @@ Param estimates are close to published numbers — DeepSeek-V3 reports `~675B (~
|
|
|
86
86
|
|---|---|
|
|
87
87
|
| DeepSeek | DeepSeek-V2, DeepSeek-V3, Kimi K2 |
|
|
88
88
|
| Llama | Llama 3 / 3.1 / 3.2 / 3.3 |
|
|
89
|
-
| Mistral |
|
|
89
|
+
| Mistral | Mistral 7B, Mixtral 8x7B / 8x22B, Mistral Medium 3.5 |
|
|
90
|
+
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6 (hybrid linear+full attn) |
|
|
90
91
|
| Gemma | Gemma 3, Gemma 4 (31B, E2B, E4B) |
|
|
91
92
|
|
|
92
93
|
### Diffusors
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Transformer model-family adapters.
|
|
2
|
+
|
|
3
|
+
These modules translate family-specific HuggingFace config dialects into the
|
|
4
|
+
shared transformer IR pieces in ``model_unfolder.adapters.transformer``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from . import deepseek, fallback, gemma4, llama, mistral, qwen
|
|
8
|
+
|
|
9
|
+
# Adapter resolution order: more specific matchers come first.
#   * ``gemma4`` precedes ``llama`` because the llama matcher also accepts
#     the ``gemma`` model_type.
#   * ``mistral`` and ``qwen`` precede ``llama``.
#   * ``fallback`` matches every config, so it has to stay last.
ADAPTERS = [
    deepseek,
    gemma4,
    mistral,
    qwen,
    llama,
    fallback,
]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Generic fallback adapter for unknown transformer architectures.
|
|
2
|
+
|
|
3
|
+
Registered last in ADAPTERS so it only fires when no specific family matches.
|
|
4
|
+
It tries a broad set of field-name aliases used across different codebases, so
|
|
5
|
+
most decoder-style configs parse correctly even without a dedicated adapter.
|
|
6
|
+
|
|
7
|
+
Emits a warning in ModelIR.warnings for every gap it detects — unknown
|
|
8
|
+
model_type, unrecognised layer_type strings, missing critical fields, etc.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from ....ir import AttentionSpec, FFNSpec, ModelIR
|
|
15
|
+
from ..assembly import decoder_extras, decoder_layer
|
|
16
|
+
from ..common import architecture_name, get_config_value as _g, model_name
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# Multi-alias field resolver
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
_ALIASES: dict[str, list[str]] = {
|
|
24
|
+
"num_hidden_layers": ["num_hidden_layers", "n_layers", "num_layers", "n_layer",
|
|
25
|
+
"num_blocks", "n_blocks"],
|
|
26
|
+
"num_attention_heads": ["num_attention_heads", "n_heads", "num_heads", "n_head",
|
|
27
|
+
"num_q_heads"],
|
|
28
|
+
"num_key_value_heads": ["num_key_value_heads", "n_kv_heads", "num_kv_heads",
|
|
29
|
+
"num_key_heads"],
|
|
30
|
+
"hidden_size": ["hidden_size", "d_model", "n_embd", "model_dim",
|
|
31
|
+
"embed_dim", "dim"],
|
|
32
|
+
"intermediate_size": ["intermediate_size", "ffn_dim", "mlp_dim", "inner_dim",
|
|
33
|
+
"ffn_hidden_size", "feed_forward_proj_dim"],
|
|
34
|
+
"hidden_act": ["hidden_act", "activation_function", "hidden_activation",
|
|
35
|
+
"act_fn", "activation"],
|
|
36
|
+
"vocab_size": ["vocab_size", "n_vocab", "padded_vocab_size"],
|
|
37
|
+
"max_position_embeddings": ["max_position_embeddings", "max_seq_len", "n_positions",
|
|
38
|
+
"context_length", "max_seq_length", "seq_length"],
|
|
39
|
+
"sliding_window": ["sliding_window", "attention_window", "window_size"],
|
|
40
|
+
"num_experts": ["num_experts", "num_local_experts", "n_experts"],
|
|
41
|
+
"num_experts_per_tok": ["num_experts_per_tok", "top_k_experts", "top_k",
|
|
42
|
+
"num_selected_experts"],
|
|
43
|
+
"num_shared_experts": ["num_shared_experts", "n_shared_experts"],
|
|
44
|
+
"moe_intermediate_size": ["moe_intermediate_size", "expert_intermediate_size",
|
|
45
|
+
"expert_hidden_size", "ffn_dim_multiplier"],
|
|
46
|
+
"head_dim": ["head_dim", "d_head", "head_size", "kv_channels"],
|
|
47
|
+
"tie_word_embeddings": ["tie_word_embeddings", "tie_embeddings",
|
|
48
|
+
"tie_word_embedding_weights"],
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _resolve(cfg: Any, canonical: str, default=None):
    """Look up *canonical* on *cfg* under each of its known aliases.

    The alias list comes from ``_ALIASES``; a name with no entry there is
    looked up under itself only. The first alias whose value is not ``None``
    wins. When no alias yields a value, *default* is returned.
    """
    candidates = _ALIASES.get(canonical, [canonical])
    # Generator keeps the lookups lazy: later aliases are not read once an
    # earlier one has produced a value.
    found = (_g(cfg, name) for name in candidates)
    return next((value for value in found if value is not None), default)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _unwrap_text(cfg: Any) -> Any:
    """Return the nested language-model config when *cfg* wraps one, else *cfg*.

    Each candidate sub-key is inspected in turn. A sub-config qualifies when
    it is a dict holding a truthy ``num_hidden_layers`` entry, or when it is
    any object exposing a ``num_hidden_layers`` attribute.
    """
    nested_keys = ("text_config", "language_config", "llm_config", "text_model_config")
    for key in nested_keys:
        sub = _g(cfg, key)
        dict_hit = isinstance(sub, dict) and sub.get("num_hidden_layers")
        # ``hasattr`` is False for ``None``, so a missing sub-key never qualifies.
        if dict_hit or hasattr(sub, "num_hidden_layers"):
            return sub
    return cfg
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
# Adapter interface
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
def matches(_cfg: Any) -> bool:
    """Accept every config.

    This adapter is the catch-all, so it must be registered after every
    family-specific adapter.
    """
    return True
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def parse(cfg: Any) -> ModelIR:
    """Build a best-effort ``ModelIR`` for a config no dedicated adapter claimed.

    Fields are read through ``_resolve`` so that common alternative spellings
    are honoured, and a nested text config is used when ``_unwrap_text`` finds
    one. Every gap that is detected (no dedicated adapter, nested config,
    missing layer count / hidden size, unrecognised ``layer_types`` entries)
    is recorded as a message in the returned IR's ``warnings`` list.

    Args:
        cfg: The model config (dict or config object) as accepted by
            ``get_config_value``.

    Returns:
        A ``ModelIR`` with one decoder layer per resolved hidden layer.
    """
    warnings: list[str] = []

    model_type = (_g(cfg, "model_type") or "unknown").lower()
    arch_name = architecture_name(cfg, "unknown")
    warnings.append(
        f"No dedicated adapter for model_type={model_type!r} / arch={arch_name!r}. "
        "Parsed with generic fallback — some details may be approximate."
    )

    text_cfg = _unwrap_text(cfg)
    if text_cfg is not cfg:
        warnings.append(
            "Config fields read from nested text_config sub-key (multimodal wrapper detected)."
        )

    num_layers = _resolve(text_cfg, "num_hidden_layers", 0)
    num_heads = _resolve(text_cfg, "num_attention_heads", 0)
    num_kv_heads = _resolve(text_cfg, "num_key_value_heads") or num_heads
    hidden_size = _resolve(text_cfg, "hidden_size", 0)
    head_dim = _resolve(text_cfg, "head_dim") or (hidden_size // num_heads if num_heads else None)
    activation = (_resolve(text_cfg, "hidden_act") or "silu").lower()
    sliding_window = _resolve(text_cfg, "sliding_window")
    layer_types = _g(text_cfg, "layer_types") or []

    if not num_layers:
        warnings.append("Could not determine num_hidden_layers — layer list will be empty.")
    if not hidden_size:
        warnings.append("Could not determine hidden_size.")

    # Attention kind for ordinary (non-linear) layers. It is kept separate
    # from the per-layer kind below so that one ``linear_attention`` layer
    # does not relabel every layer that follows it.
    if num_kv_heads == num_heads:
        base_attn_kind = "mha"
    elif num_kv_heads == 1:
        base_attn_kind = "mqa"
    else:
        base_attn_kind = "gqa"

    num_experts = _resolve(text_cfg, "num_experts", 0)
    num_experts_per_tok = _resolve(text_cfg, "num_experts_per_tok", 0)
    num_shared_experts = _resolve(text_cfg, "num_shared_experts", 0)
    moe_intermediate_size = _resolve(text_cfg, "moe_intermediate_size", 0)
    # Configs that only declare an expert width still get a usable FFN size.
    intermediate_size = _resolve(text_cfg, "intermediate_size", 0) or moe_intermediate_size
    is_moe = bool(num_experts)

    unknown_layer_types: set[str] = set()

    layers = []
    for i in range(num_layers):
        # Layers beyond the end of ``layer_types`` default to full attention.
        layer_type = layer_types[i] if i < len(layer_types) else "full_attention"
        layer_attn_kind = base_attn_kind

        if layer_type in ("full_attention", "causal", ""):
            mask, win = "causal", None
        elif "sliding" in layer_type:
            mask, win = "sliding", sliding_window
        elif layer_type == "linear_attention":
            mask, win = "causal", None
            layer_attn_kind = "linear"
        else:
            # Remember the unknown label once; it is reported after the loop.
            unknown_layer_types.add(layer_type)
            mask, win = "causal", None

        attn = AttentionSpec(
            kind=layer_attn_kind,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            mask=mask,
            window_size=win,
        )

        if is_moe:
            ffn = FFNSpec(
                kind="moe",
                activation=activation,
                intermediate_size=intermediate_size,
                gated=True,
                num_experts=num_experts,
                num_experts_per_tok=num_experts_per_tok,
                num_shared_experts=num_shared_experts,
                expert_intermediate_size=moe_intermediate_size or intermediate_size,
            )
        else:
            ffn = FFNSpec(
                kind="dense",
                activation=activation,
                intermediate_size=intermediate_size,
                gated=True,
            )

        layers.append(decoder_layer(i, attn, ffn, hidden_size))

    # Sorted so the warning order is deterministic.
    for lt in sorted(unknown_layer_types):
        warnings.append(
            f"Unrecognised layer_type={lt!r} — treated as standard causal attention. "
            "Add a dedicated adapter to handle this correctly."
        )

    # Vocabulary / tying may live on the outer wrapper rather than the text config.
    vocab_size = _resolve(text_cfg, "vocab_size", 0) or _resolve(cfg, "vocab_size", 0)
    tie_word_embeddings = bool(
        _resolve(text_cfg, "tie_word_embeddings", _resolve(cfg, "tie_word_embeddings", False))
    )

    return ModelIR(
        name=model_name(cfg, arch_name),
        architecture=arch_name,
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_position_embeddings=_resolve(text_cfg, "max_position_embeddings"),
        tie_word_embeddings=tie_word_embeddings,
        layers=layers,
        extras=decoder_extras(vocab_size, hidden_size, tie_word_embeddings),
        warnings=warnings,
    )
|
{model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/families/llama.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Adapter for Llama,
|
|
1
|
+
"""Adapter for Llama, Phi, and similar GQA/MHA dense models."""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
4
|
from typing import Any
|
|
@@ -8,26 +8,30 @@ from ..assembly import decoder_extras, decoder_layer
|
|
|
8
8
|
from ..common import architecture_name, get_config_value as _g, model_name
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
_FAMILIES = {"llama", "
|
|
11
|
+
_FAMILIES = {"llama", "phi3", "gemma"}
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def matches(cfg: Any) -> bool:
|
|
15
|
-
|
|
16
|
-
model_type = _g(cfg, "model_type", "")
|
|
17
|
-
for arch in arches:
|
|
18
|
-
if any(fam in arch.lower() for fam in ("llama", "mistral", "qwen", "phi3")):
|
|
19
|
-
return True
|
|
15
|
+
model_type = (_g(cfg, "model_type") or "").lower()
|
|
20
16
|
if model_type in _FAMILIES:
|
|
21
17
|
return True
|
|
22
|
-
|
|
18
|
+
arches = _g(cfg, "architectures") or []
|
|
19
|
+
return any(
|
|
20
|
+
any(fam in a.lower() for fam in ("llama", "phi3"))
|
|
21
|
+
for a in arches
|
|
22
|
+
)
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
def parse(cfg: Any) -> ModelIR:
|
|
26
|
+
arch_name = architecture_name(cfg, "llama")
|
|
27
|
+
|
|
26
28
|
num_layers = _g(cfg, "num_hidden_layers", 0)
|
|
27
29
|
num_heads = _g(cfg, "num_attention_heads", 0)
|
|
28
30
|
num_kv_heads = _g(cfg, "num_key_value_heads", num_heads)
|
|
29
31
|
hidden_size = _g(cfg, "hidden_size", 0)
|
|
30
32
|
head_dim = _g(cfg, "head_dim") or (hidden_size // num_heads if num_heads else None)
|
|
33
|
+
intermediate_size = _g(cfg, "intermediate_size", 0)
|
|
34
|
+
activation = (_g(cfg, "hidden_act", "silu") or "silu").lower()
|
|
31
35
|
|
|
32
36
|
if num_kv_heads == num_heads:
|
|
33
37
|
attn_kind = "mha"
|
|
@@ -40,11 +44,6 @@ def parse(cfg: Any) -> ModelIR:
|
|
|
40
44
|
sliding_pattern = _g(cfg, "sliding_window_pattern")
|
|
41
45
|
layer_types = _g(cfg, "layer_types")
|
|
42
46
|
|
|
43
|
-
intermediate_size = _g(cfg, "intermediate_size", 0)
|
|
44
|
-
activation = (_g(cfg, "hidden_act", "silu") or "silu").lower()
|
|
45
|
-
|
|
46
|
-
arch_name = architecture_name(cfg, "llama")
|
|
47
|
-
|
|
48
47
|
layers = []
|
|
49
48
|
for i in range(num_layers):
|
|
50
49
|
if layer_types and i < len(layer_types):
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Adapter for the Mistral model family.
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
* Mistral 7B / Mistral Small / Ministral — flat GQA config (model_type: "mistral")
|
|
5
|
+
* Mixtral 8x7B / 8x22B — sparse MoE (model_type: "mixtral")
|
|
6
|
+
* Mistral Medium 3.5 / Pixtral-class — multimodal wrapper with text nested
|
|
7
|
+
under text_config (model_type: "mistral3")
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ....ir import AttentionSpec, FFNSpec, ModelIR
|
|
14
|
+
from ..assembly import decoder_extras, decoder_layer
|
|
15
|
+
from ..common import architecture_name, get_config_value as _g, model_name
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
_FLAT_TYPES = {"mistral", "mixtral"}
|
|
19
|
+
_WRAPPED_TYPES = {"mistral3", "ministral3"}
|
|
20
|
+
_ALL_TYPES = _FLAT_TYPES | _WRAPPED_TYPES
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def matches(cfg: Any) -> bool:
    """Tell whether *cfg* belongs to the Mistral family.

    A config matches when its lower-cased ``model_type`` is one of
    ``_ALL_TYPES``, or when any entry of ``architectures`` contains
    "mistral" or "mixtral" (case-insensitive).
    """
    if (_g(cfg, "model_type") or "").lower() in _ALL_TYPES:
        return True
    for arch in _g(cfg, "architectures") or []:
        lowered = arch.lower()
        if "mistral" in lowered or "mixtral" in lowered:
            return True
    return False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse(cfg: Any) -> ModelIR:
    """Translate a Mistral-family config into a ``ModelIR``.

    Text fields are read from the config returned by ``_text_config`` (the
    nested ``text_config`` for wrapped model types, otherwise *cfg* itself).
    A config that declares experts (``num_local_experts`` or ``num_experts``)
    gets MoE feed-forward blocks; otherwise dense ones.

    Numeric fields are read as ``value or fallback`` so that a key which is
    present but null behaves like a missing key, matching how the generic
    fallback adapter reads the same fields.

    Args:
        cfg: The model config (dict or config object) as accepted by
            ``get_config_value``.

    Returns:
        A ``ModelIR`` with one decoder layer per hidden layer.
    """
    text_cfg = _text_config(cfg)
    arch_name = architecture_name(cfg, "mistral")

    num_layers = _g(text_cfg, "num_hidden_layers") or 0
    num_heads = _g(text_cfg, "num_attention_heads") or 0
    # A null / absent KV-head count means one KV head per query head.
    num_kv_heads = _g(text_cfg, "num_key_value_heads") or num_heads
    hidden_size = _g(text_cfg, "hidden_size") or 0
    head_dim = _g(text_cfg, "head_dim") or (hidden_size // num_heads if num_heads else None)
    activation = (_g(text_cfg, "hidden_act", "silu") or "silu").lower()

    if num_kv_heads == num_heads:
        attn_kind = "mha"
    elif num_kv_heads == 1:
        attn_kind = "mqa"
    else:
        attn_kind = "gqa"

    sliding_window = _g(text_cfg, "sliding_window")
    sliding_pattern = _g(text_cfg, "sliding_window_pattern")

    # MoE fields (Mixtral uses num_local_experts; generic fallback to num_experts)
    num_experts = _g(text_cfg, "num_local_experts") or _g(text_cfg, "num_experts") or 0
    num_experts_per_tok = _g(text_cfg, "num_experts_per_tok") or 0
    intermediate_size = _g(text_cfg, "intermediate_size") or 0
    is_moe = bool(num_experts)

    layers = []
    for i in range(num_layers):
        if sliding_pattern and sliding_window:
            # Within each group of ``sliding_pattern`` layers, the last one is
            # full causal attention and the rest use the sliding window.
            mask = "sliding" if (i % sliding_pattern) != (sliding_pattern - 1) else "causal"
            win = sliding_window if mask == "sliding" else None
        elif sliding_window:
            mask, win = "sliding", sliding_window
        else:
            mask, win = "causal", None

        attn = AttentionSpec(
            kind=attn_kind,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            mask=mask,
            window_size=win,
        )

        if is_moe:
            ffn = FFNSpec(
                kind="moe",
                activation=activation,
                intermediate_size=intermediate_size,
                gated=True,
                num_experts=num_experts,
                num_experts_per_tok=num_experts_per_tok,
                expert_intermediate_size=intermediate_size,
            )
        else:
            ffn = FFNSpec(
                kind="dense",
                activation=activation,
                intermediate_size=intermediate_size,
                gated=True,
            )

        layers.append(decoder_layer(i, attn, ffn, hidden_size))

    # Vocabulary / tying may live on the outer wrapper rather than the text config.
    vocab_size = _g(text_cfg, "vocab_size", 0) or _g(cfg, "vocab_size", 0)
    tie_word_embeddings = bool(
        _g(text_cfg, "tie_word_embeddings", _g(cfg, "tie_word_embeddings", False))
    )
    return ModelIR(
        name=model_name(cfg, arch_name),
        architecture=arch_name,
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_position_embeddings=_g(text_cfg, "max_position_embeddings"),
        tie_word_embeddings=tie_word_embeddings,
        layers=layers,
        extras=decoder_extras(vocab_size, hidden_size, tie_word_embeddings),
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _text_config(cfg: Any) -> Any:
    """Pick the config that holds the text-model fields.

    For model types in ``_WRAPPED_TYPES`` this is the nested ``text_config``
    when one is present; in every other case it is *cfg* itself.
    """
    if (_g(cfg, "model_type") or "").lower() not in _WRAPPED_TYPES:
        return cfg
    nested = _g(cfg, "text_config")
    return cfg if nested is None else nested
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Adapter for the Qwen model family.
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
* Qwen2 / Qwen2.5 (dense) — model_type: "qwen2"
|
|
5
|
+
* Qwen2-MoE — model_type: "qwen2_moe"
|
|
6
|
+
* Qwen3 (dense) — model_type: "qwen3"
|
|
7
|
+
* Qwen3-MoE / Qwen2.5-Max — model_type: "qwen3_moe"
|
|
8
|
+
* Qwen3.5 / Qwen3.6 (hybrid) — model_type: "qwen3_5_moe", text nested under text_config.
|
|
9
|
+
Alternates linear (SSM-style) and full-context attention layers.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from ....ir import AttentionSpec, FFNSpec, ModelIR
|
|
16
|
+
from ..assembly import decoder_extras, decoder_layer
|
|
17
|
+
from ..common import architecture_name, get_config_value as _g, model_name
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
_FLAT_TYPES = {"qwen2", "qwen2_moe", "qwen3", "qwen3_moe"}
|
|
21
|
+
_WRAPPED_TYPES = {"qwen3_5_moe", "qwen3_5_moe_text"}
|
|
22
|
+
_MODEL_TYPES = _FLAT_TYPES | _WRAPPED_TYPES
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def matches(cfg: Any) -> bool:
    """Tell whether *cfg* belongs to the Qwen family.

    A config matches when its lower-cased ``model_type`` is one of
    ``_MODEL_TYPES``, or when any entry of ``architectures`` contains
    "qwen" (case-insensitive).
    """
    if (_g(cfg, "model_type") or "").lower() in _MODEL_TYPES:
        return True
    for arch in _g(cfg, "architectures") or []:
        if "qwen" in arch.lower():
            return True
    return False
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def parse(cfg: Any) -> ModelIR:
    """Translate a Qwen-family config into a ``ModelIR``.

    Text fields are read from the config returned by ``_text_config``.
    Layers whose ``layer_types`` entry is ``"linear_attention"`` are emitted
    as ``kind="linear"`` attention using the ``linear_*`` head fields; all
    other layers use standard attention, with a sliding-window mask when the
    config enables one. A config that declares ``num_experts`` gets MoE
    feed-forward blocks; otherwise dense ones.

    Numeric fields are read as ``value or fallback`` so that a key which is
    present but null behaves like a missing key, matching how the generic
    fallback adapter reads the same fields.

    Args:
        cfg: The model config (dict or config object) as accepted by
            ``get_config_value``.

    Returns:
        A ``ModelIR`` with one decoder layer per hidden layer.
    """
    text_cfg = _text_config(cfg)
    arch_name = architecture_name(cfg, "qwen")

    num_layers = _g(text_cfg, "num_hidden_layers") or 0
    num_heads = _g(text_cfg, "num_attention_heads") or 0
    # A null / absent KV-head count means one KV head per query head.
    num_kv_heads = _g(text_cfg, "num_key_value_heads") or num_heads
    hidden_size = _g(text_cfg, "hidden_size") or 0
    head_dim = _g(text_cfg, "head_dim") or (hidden_size // num_heads if num_heads else None)
    activation = (_g(text_cfg, "hidden_act") or "silu").lower()

    # Standard (non-hybrid) attention kind
    if num_kv_heads == num_heads:
        full_attn_kind = "mha"
    elif num_kv_heads == 1:
        full_attn_kind = "mqa"
    else:
        full_attn_kind = "gqa"

    sliding_window = _g(text_cfg, "sliding_window")
    # Qwen2-style configs ship a ``sliding_window`` size together with a
    # ``use_sliding_window`` switch. When the switch is explicitly off the
    # window is not in effect, so no layer should be drawn as sliding.
    # NOTE(review): ``max_window_layers`` (which layers use the window when
    # the switch is on) is not modelled here — confirm whether it should be.
    if _g(text_cfg, "use_sliding_window") is False:
        sliding_window = None
    sliding_pattern = _g(text_cfg, "sliding_window_pattern")
    layer_types = _g(text_cfg, "layer_types") or []

    # Hybrid linear-attention fields (Qwen3.5/3.6)
    linear_num_kv_heads = _g(text_cfg, "linear_num_key_heads") or 0
    linear_head_dim = _g(text_cfg, "linear_key_head_dim") or head_dim

    # MoE FFN fields
    num_experts = _g(text_cfg, "num_experts") or 0
    num_experts_per_tok = _g(text_cfg, "num_experts_per_tok") or _g(text_cfg, "top_k") or 0
    num_shared_experts = _g(text_cfg, "num_shared_experts") or 0
    moe_intermediate_size = _g(text_cfg, "moe_intermediate_size") or 0
    dense_intermediate_size = _g(text_cfg, "intermediate_size") or 0
    # An MoE block is only emitted when an expert count is actually declared;
    # a "moe" model_type without ``num_experts`` falls back to a dense block.
    use_moe_ffn = bool(num_experts)

    layers = []
    for i in range(num_layers):
        # Layers beyond the end of ``layer_types`` default to full attention.
        layer_type = layer_types[i] if i < len(layer_types) else "full_attention"

        if layer_type == "linear_attention":
            attn = AttentionSpec(
                kind="linear",
                num_heads=num_heads,
                num_kv_heads=linear_num_kv_heads or num_kv_heads,
                head_dim=linear_head_dim,
                mask="causal",
            )
        else:
            if sliding_pattern and sliding_window:
                # Within each group of ``sliding_pattern`` layers, the last one
                # is full causal attention and the rest use the sliding window.
                mask = "sliding" if (i % sliding_pattern) != (sliding_pattern - 1) else "causal"
                win = sliding_window if mask == "sliding" else None
            elif sliding_window:
                mask, win = "sliding", sliding_window
            else:
                mask, win = "causal", None

            attn = AttentionSpec(
                kind=full_attn_kind,
                num_heads=num_heads,
                num_kv_heads=num_kv_heads,
                head_dim=head_dim,
                mask=mask,
                window_size=win,
            )

        if use_moe_ffn:
            ffn = FFNSpec(
                kind="moe",
                activation=activation,
                intermediate_size=dense_intermediate_size or moe_intermediate_size,
                gated=True,
                num_experts=num_experts,
                num_experts_per_tok=num_experts_per_tok,
                num_shared_experts=num_shared_experts,
                expert_intermediate_size=moe_intermediate_size,
            )
        else:
            ffn = FFNSpec(
                kind="dense",
                activation=activation,
                intermediate_size=dense_intermediate_size,
                gated=True,
            )

        layers.append(decoder_layer(i, attn, ffn, hidden_size))

    # Vocabulary / tying may live on the outer wrapper rather than the text config.
    vocab_size = _g(text_cfg, "vocab_size", 0) or _g(cfg, "vocab_size", 0)
    tie_word_embeddings = bool(
        _g(text_cfg, "tie_word_embeddings", _g(cfg, "tie_word_embeddings", False))
    )
    return ModelIR(
        name=model_name(cfg, arch_name),
        architecture=arch_name,
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_position_embeddings=_g(text_cfg, "max_position_embeddings"),
        tie_word_embeddings=tie_word_embeddings,
        layers=layers,
        extras=decoder_extras(vocab_size, hidden_size, tie_word_embeddings),
    )
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _text_config(cfg: Any) -> Any:
    """Pick the config that holds the text-model fields.

    For model types in ``_WRAPPED_TYPES`` this is the nested ``text_config``
    when one is present; in every other case it is *cfg* itself.
    """
    if (_g(cfg, "model_type") or "").lower() not in _WRAPPED_TYPES:
        return cfg
    nested = _g(cfg, "text_config")
    return cfg if nested is None else nested
|
|
@@ -44,6 +44,11 @@ class Diagram:
|
|
|
44
44
|
"""Return parameter-count estimates: total / active / per-layer breakdown."""
|
|
45
45
|
return self._params
|
|
46
46
|
|
|
47
|
+
@property
|
|
48
|
+
def warnings(self) -> list[str]:
|
|
49
|
+
"""Adapter-emitted warnings — unknown model types, unrecognised layer types, etc."""
|
|
50
|
+
return list(self.ir.warnings)
|
|
51
|
+
|
|
47
52
|
def _repr_html_(self) -> str:
|
|
48
53
|
"""Jupyter calls this; returned HTML string is rendered inline."""
|
|
49
54
|
return self._html(standalone=False)
|
|
@@ -87,9 +92,12 @@ class Diagram:
|
|
|
87
92
|
return self._html_cache[standalone]
|
|
88
93
|
|
|
89
94
|
def __repr__(self) -> str:
|
|
90
|
-
|
|
95
|
+
s = (
|
|
91
96
|
f"<Diagram {self.ir.name!r} · {self.ir.num_layers} layers · "
|
|
92
97
|
f"~{humanize(self._params['total'])} params"
|
|
93
98
|
+ (f" ({humanize(self._params['active'])} active)" if self._params['is_sparse'] else "")
|
|
94
99
|
+ ">"
|
|
95
100
|
)
|
|
101
|
+
if self.ir.warnings:
|
|
102
|
+
s += "\n" + "\n".join(f" ⚠ {w}" for w in self.ir.warnings)
|
|
103
|
+
return s
|
|
@@ -81,6 +81,7 @@ class ModelIR:
|
|
|
81
81
|
layers: list # list[LayerSpec]
|
|
82
82
|
cross_layer_edges: list = field(default_factory=list)
|
|
83
83
|
extras: dict = field(default_factory=dict)
|
|
84
|
+
warnings: list = field(default_factory=list) # adapter-emitted gaps / unknowns
|
|
84
85
|
|
|
85
86
|
def to_dict(self) -> dict:
|
|
86
87
|
# Avoid dataclasses.asdict here: it recursively deepcopy()s every
|
|
@@ -97,6 +98,7 @@ class ModelIR:
|
|
|
97
98
|
"layers": [_layer_to_dict(layer) for layer in self.layers],
|
|
98
99
|
"cross_layer_edges": [_cross_edge_to_dict(edge) for edge in self.cross_layer_edges],
|
|
99
100
|
"extras": self.extras,
|
|
101
|
+
"warnings": self.warnings,
|
|
100
102
|
}
|
|
101
103
|
|
|
102
104
|
@property
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: model-unfolder
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
|
|
5
5
|
Author: model-unfolder contributors
|
|
6
6
|
License: Apache-2.0
|
|
@@ -110,7 +110,8 @@ Param estimates are close to published numbers — DeepSeek-V3 reports `~675B (~
|
|
|
110
110
|
|---|---|
|
|
111
111
|
| DeepSeek | DeepSeek-V2, DeepSeek-V3, Kimi K2 |
|
|
112
112
|
| Llama | Llama 3 / 3.1 / 3.2 / 3.3 |
|
|
113
|
-
| Mistral |
|
|
113
|
+
| Mistral | Mistral 7B, Mixtral 8x7B / 8x22B, Mistral Medium 3.5 |
|
|
114
|
+
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6 (hybrid linear+full attn) |
|
|
114
115
|
| Gemma | Gemma 3, Gemma 4 (31B, E2B, E4B) |
|
|
115
116
|
|
|
116
117
|
### Diffusors
|
|
@@ -22,8 +22,11 @@ model_unfolder/adapters/transformer/blocks.py
|
|
|
22
22
|
model_unfolder/adapters/transformer/common.py
|
|
23
23
|
model_unfolder/adapters/transformer/families/__init__.py
|
|
24
24
|
model_unfolder/adapters/transformer/families/deepseek.py
|
|
25
|
+
model_unfolder/adapters/transformer/families/fallback.py
|
|
25
26
|
model_unfolder/adapters/transformer/families/gemma4.py
|
|
26
27
|
model_unfolder/adapters/transformer/families/llama.py
|
|
28
|
+
model_unfolder/adapters/transformer/families/mistral.py
|
|
29
|
+
model_unfolder/adapters/transformer/families/qwen.py
|
|
27
30
|
model_unfolder/adapters/transformer/special_parts/__init__.py
|
|
28
31
|
model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py
|
|
29
32
|
model_unfolder/renderers/__init__.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "model-unfolder"
|
|
7
|
-
version = "0.2.3"
|
|
7
|
+
version = "0.2.4"
|
|
8
8
|
description = "Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
"""Transformer model-family adapters.
|
|
2
|
-
|
|
3
|
-
These modules translate family-specific HuggingFace config dialects into the
|
|
4
|
-
shared transformer IR pieces in ``model_unfolder.adapters.transformer``.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from . import deepseek, gemma4, llama
|
|
8
|
-
|
|
9
|
-
# Order matters: more specific adapters first. ``gemma4`` claims its own
|
|
10
|
-
# top-level ``model_type`` / architecture and must run before the generic
|
|
11
|
-
# llama-family matcher.
|
|
12
|
-
ADAPTERS = [deepseek, gemma4, llama]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/__init__.py
RENAMED
|
File without changes
|
{model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/assembly.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/adapters/transformer/families/gemma4.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/block_views/__init__.py
RENAMED
|
File without changes
|
{model_unfolder-0.2.3 → model_unfolder-0.2.4}/model_unfolder/renderers/html/block_views/attention.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|