model-unfolder 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_unfolder/__init__.py +58 -0
- model_unfolder/adapters/__init__.py +15 -0
- model_unfolder/adapters/custom/__init__.py +8 -0
- model_unfolder/adapters/diffusor/__init__.py +8 -0
- model_unfolder/adapters/transformer/__init__.py +5 -0
- model_unfolder/adapters/transformer/assembly.py +57 -0
- model_unfolder/adapters/transformer/blocks.py +238 -0
- model_unfolder/adapters/transformer/common.py +35 -0
- model_unfolder/adapters/transformer/families/__init__.py +12 -0
- model_unfolder/adapters/transformer/families/deepseek.py +107 -0
- model_unfolder/adapters/transformer/families/gemma4.py +202 -0
- model_unfolder/adapters/transformer/families/llama.py +91 -0
- model_unfolder/adapters/transformer/special_parts/__init__.py +2 -0
- model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py +220 -0
- model_unfolder/diagram.py +95 -0
- model_unfolder/html_renderer.py +5 -0
- model_unfolder/ir.py +163 -0
- model_unfolder/labels.py +166 -0
- model_unfolder/params.py +119 -0
- model_unfolder/parser.py +137 -0
- model_unfolder/renderers/__init__.py +1 -0
- model_unfolder/renderers/html/__init__.py +5 -0
- model_unfolder/renderers/html/block_views/__init__.py +20 -0
- model_unfolder/renderers/html/block_views/attention.py +91 -0
- model_unfolder/renderers/html/block_views/feed_forward.py +213 -0
- model_unfolder/renderers/html/block_views/per_layer_embedding.py +199 -0
- model_unfolder/renderers/html/cards.py +130 -0
- model_unfolder/renderers/html/document.py +157 -0
- model_unfolder/renderers/html/interactions.py +64 -0
- model_unfolder/renderers/html/metadata.py +265 -0
- model_unfolder/renderers/html/sections.py +60 -0
- model_unfolder/renderers/html/styles.py +283 -0
- model_unfolder/renderers/html/svg.py +349 -0
- model_unfolder/renderers/html/theme.py +24 -0
- model_unfolder/renderers/html/utils.py +28 -0
- model_unfolder/renderers/html/views.py +461 -0
- model_unfolder-0.2.0.dist-info/METADATA +122 -0
- model_unfolder-0.2.0.dist-info/RECORD +41 -0
- model_unfolder-0.2.0.dist-info/WHEEL +5 -0
- model_unfolder-0.2.0.dist-info/licenses/LICENSE +201 -0
- model_unfolder-0.2.0.dist-info/top_level.txt +1 -0
model_unfolder/ir.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Intermediate Representation (IR) for transformer architectures.
|
|
3
|
+
|
|
4
|
+
The IR is the contract between parsers (which read HuggingFace configs)
|
|
5
|
+
and the renderer (which produces SVG/HTML). It is layer-aware to support
|
|
6
|
+
heterogeneous architectures (Gemma sliding-window patterns, DeepSeek
|
|
7
|
+
dense+MoE phase changes, YOCO/CLA cross-layer KV sharing, etc.).
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class AttentionSpec:
    """Describe the attention block of a single layer.

    Only ``kind`` and ``num_heads`` are required; every other field carries a
    default so a parser can fill in just the values its config exposes.
    """

    # Attention variant: "mha" | "gqa" | "mqa" | "mla".
    kind: str
    num_heads: int
    num_kv_heads: Optional[int] = None
    head_dim: Optional[int] = None
    kv_lora_rank: Optional[int] = None
    q_lora_rank: Optional[int] = None
    rope_dim: Optional[int] = None
    # Masking scheme: "causal" | "sliding" | "chunked" | "global".
    mask: str = "causal"
    window_size: Optional[int] = None
    # Index of the layer supplying K/V when cross-layer KV sharing is used;
    # None when the layer does not borrow K/V.
    kv_source_layer: Optional[int] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class FFNSpec:
    """Describe the feed-forward block of a single layer.

    ``kind``, ``activation`` and ``intermediate_size`` are required. The
    expert-related fields default to "not set" values.
    """

    # Block variant: "dense" | "moe".
    kind: str
    # Activation name: "silu" | "gelu" | "relu" | "geglu" | "swiglu".
    activation: str
    intermediate_size: int
    # True for a gated MLP in the SwiGLU/GeGLU style.
    gated: bool = True
    num_experts: Optional[int] = None
    num_experts_per_tok: Optional[int] = None
    num_shared_experts: int = 0
    expert_intermediate_size: Optional[int] = None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class LayerSpec:
    """A single transformer layer; layers in one stack need not be alike."""

    index: int
    attention: AttentionSpec
    ffn: FFNSpec
    # Normalisation flavour: "rmsnorm" | "layernorm".
    norm_kind: str = "rmsnorm"
    # Where the norm sits: "pre" | "post" | "double".
    norm_placement: str = "pre"
    blocks: list = field(default_factory=list)

    def signature(self) -> tuple:
        """Return a hashable structural fingerprint for grouping layers.

        The fingerprint covers the attention kind, mask, window size and
        whether K/V are borrowed from another layer; the FFN kind, gating
        flag and expert count; and the norm kind and placement. Fields not
        listed here (for example ``index`` and ``blocks``) do not take part.
        """
        attn = self.attention
        ffn = self.ffn
        attention_part = (
            attn.kind,
            attn.mask,
            attn.window_size,
            # Only the presence of a K/V source matters, not which layer.
            attn.kv_source_layer is not None,
        )
        ffn_part = (ffn.kind, ffn.gated, ffn.num_experts)
        norm_part = (self.norm_kind, self.norm_placement)
        return attention_part + ffn_part + norm_part
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class CrossLayerEdge:
    """Record a dependency that links two layers, such as KV cache sharing."""

    # Edge type, e.g. "kv_share".
    kind: str
    from_layer: int
    to_layer: int
    # What is shared across the edge, e.g. ["K", "V"]; a fresh list per edge.
    shared: list = field(default_factory=list)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class ModelIR:
    """Root of the IR: model-wide settings plus the ordered layer stack."""

    name: str
    # Architecture identifier, e.g. "DeepseekV3ForCausalLM".
    architecture: str
    vocab_size: int
    hidden_size: int
    max_position_embeddings: Optional[int]
    tie_word_embeddings: bool
    layers: list  # list[LayerSpec]
    cross_layer_edges: list = field(default_factory=list)
    extras: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Project the IR onto plain dicts and lists.

        ``extras`` (and the nested ``blocks`` / ``shared`` lists) are passed
        through by reference rather than copied.
        """
        # dataclasses.asdict is deliberately not used: it deep-copies every
        # nested dict/list, including the render block metadata repeated on
        # every layer. The IR is treated as immutable after parsing, so this
        # shallow structural projection is cheaper and sufficient for
        # rendering.
        scalar_fields = (
            "name",
            "architecture",
            "vocab_size",
            "hidden_size",
            "max_position_embeddings",
            "tie_word_embeddings",
        )
        projected = {attr: getattr(self, attr) for attr in scalar_fields}
        projected["layers"] = [_layer_to_dict(spec) for spec in self.layers]
        projected["cross_layer_edges"] = [
            _cross_edge_to_dict(link) for link in self.cross_layer_edges
        ]
        projected["extras"] = self.extras
        return projected

    @property
    def num_layers(self) -> int:
        """Number of entries in ``layers``."""
        return len(self.layers)

    def layer_groups(self) -> list:
        """Run-length encode the layer stack by structural signature.

        Returns a list of ``(signature, [layer indices])`` tuples, one per
        run of consecutive layers sharing a signature. Equal signatures that
        are not adjacent produce separate runs.
        """
        runs = []
        for spec in self.layers:
            fingerprint = spec.signature()
            continues_run = runs and runs[-1][0] == fingerprint
            if not continues_run:
                runs.append((fingerprint, []))
            runs[-1][1].append(spec.index)
        return runs
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _attention_to_dict(a: AttentionSpec) -> dict:
|
|
119
|
+
return {
|
|
120
|
+
"kind": a.kind,
|
|
121
|
+
"num_heads": a.num_heads,
|
|
122
|
+
"num_kv_heads": a.num_kv_heads,
|
|
123
|
+
"head_dim": a.head_dim,
|
|
124
|
+
"kv_lora_rank": a.kv_lora_rank,
|
|
125
|
+
"q_lora_rank": a.q_lora_rank,
|
|
126
|
+
"rope_dim": a.rope_dim,
|
|
127
|
+
"mask": a.mask,
|
|
128
|
+
"window_size": a.window_size,
|
|
129
|
+
"kv_source_layer": a.kv_source_layer,
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _ffn_to_dict(f: FFNSpec) -> dict:
|
|
134
|
+
return {
|
|
135
|
+
"kind": f.kind,
|
|
136
|
+
"activation": f.activation,
|
|
137
|
+
"intermediate_size": f.intermediate_size,
|
|
138
|
+
"gated": f.gated,
|
|
139
|
+
"num_experts": f.num_experts,
|
|
140
|
+
"num_experts_per_tok": f.num_experts_per_tok,
|
|
141
|
+
"num_shared_experts": f.num_shared_experts,
|
|
142
|
+
"expert_intermediate_size": f.expert_intermediate_size,
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _layer_to_dict(layer: LayerSpec) -> dict:
    """Project one layer onto a plain dict, converting its nested specs.

    ``blocks`` is passed through by reference, not copied.
    """
    projected = {"index": layer.index}
    projected["attention"] = _attention_to_dict(layer.attention)
    projected["ffn"] = _ffn_to_dict(layer.ffn)
    projected.update(
        norm_kind=layer.norm_kind,
        norm_placement=layer.norm_placement,
        blocks=layer.blocks,
    )
    return projected
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _cross_edge_to_dict(edge: CrossLayerEdge) -> dict:
|
|
158
|
+
return {
|
|
159
|
+
"kind": edge.kind,
|
|
160
|
+
"from_layer": edge.from_layer,
|
|
161
|
+
"to_layer": edge.to_layer,
|
|
162
|
+
"shared": edge.shared,
|
|
163
|
+
}
|
model_unfolder/labels.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Renderer-agnostic vocabulary for talking about transformer specs.
|
|
2
|
+
|
|
3
|
+
Anything that needs to refer to attention masks ("SWA", "full"), attention
|
|
4
|
+
kinds ("GQA"), or build a human description of an attention / FFN block goes
|
|
5
|
+
through this module. Keeping it in one place means changing the wording
|
|
6
|
+
(swap "SWA" for "Sliding", say) only requires editing this file — no scavenger
|
|
7
|
+
hunt across the renderer.
|
|
8
|
+
|
|
9
|
+
The functions are pure and operate on plain ``dict``-shaped specs (i.e. what
|
|
10
|
+
``ModelIR.to_dict()`` produces), so they're equally useful inside the HTML
|
|
11
|
+
renderer, a future text/markdown renderer, or notebook utilities.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
_MASK_SHORT = {
|
|
16
|
+
"sliding": "SWA",
|
|
17
|
+
"global": "full",
|
|
18
|
+
"causal": "causal",
|
|
19
|
+
"chunked": "chunked",
|
|
20
|
+
}
|
|
21
|
+
_MASK_LONG = {
|
|
22
|
+
"sliding": "Sliding-window",
|
|
23
|
+
"global": "Full / global",
|
|
24
|
+
"causal": "Causal",
|
|
25
|
+
"chunked": "Chunked",
|
|
26
|
+
}
|
|
27
|
+
_MASK_TITLE = {
|
|
28
|
+
"sliding": "Sliding-window attention",
|
|
29
|
+
"global": "Full-context attention",
|
|
30
|
+
"causal": "Causal attention",
|
|
31
|
+
"chunked": "Chunked attention",
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
_KIND_SHORT = {"mla": "MLA", "gqa": "GQA", "mqa": "MQA", "mha": "MHA"}
|
|
35
|
+
_KIND_LONG = {
|
|
36
|
+
"mla": "Multi-head latent attention",
|
|
37
|
+
"gqa": "Grouped-query attention",
|
|
38
|
+
"mqa": "Multi-query attention",
|
|
39
|
+
"mha": "Multi-head attention",
|
|
40
|
+
}
|
|
41
|
+
_ACTIVATION_LABELS = {
|
|
42
|
+
"gelu": "GELU",
|
|
43
|
+
"gelu_new": "GELU",
|
|
44
|
+
"gelu_fast": "GELU",
|
|
45
|
+
"gelu_pytorch_tanh": "GELU",
|
|
46
|
+
"relu": "ReLU",
|
|
47
|
+
"silu": "SiLU",
|
|
48
|
+
"swish": "SiLU",
|
|
49
|
+
"geglu": "GEGLU",
|
|
50
|
+
"swiglu": "SwiGLU",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def mask_short(attention: dict) -> str:
    """Return the compact mask tag, e.g. ``"SWA"``, ``"full"`` or ``"causal"``.

    A missing or unrecognised mask is reported as ``"causal"``.
    """
    mask_name = attention.get("mask", "causal")
    return _MASK_SHORT.get(mask_name, "causal")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def mask_long(attention: dict) -> str:
    """Return the readable mask label, e.g. ``"Sliding-window"``.

    A missing or unrecognised mask is reported as ``"Causal"``.
    """
    mask_name = attention.get("mask", "causal")
    return _MASK_LONG.get(mask_name, "Causal")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def mask_title(attention: dict) -> str:
    """Return the tooltip-style mask description.

    A missing or unrecognised mask is reported as ``"Causal attention"``.
    """
    mask_name = attention.get("mask", "causal")
    return _MASK_TITLE.get(mask_name, "Causal attention")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def mask_chip(attention: dict) -> str:
    """Return a one-line chip label for the mask.

    Sliding-window layers with a non-zero window size get the size appended
    (``"SWA 1,024"``); everything else gets the bare short tag.
    """
    tag = mask_short(attention)
    size = attention.get("window_size")
    if not size or not is_sliding(attention):
        return tag
    return f"{tag} {_fmt_int(size)}"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def kind_short(attention: dict) -> str:
    """Return the attention-kind acronym, falling back to ``"MHA"``."""
    kind_name = attention.get("kind", "")
    return _KIND_SHORT.get(kind_name, "MHA")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def kind_long(attention: dict) -> str:
    """Return the spelled-out attention kind, falling back to multi-head."""
    kind_name = attention.get("kind", "")
    return _KIND_LONG.get(kind_name, "Multi-head attention")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def is_sliding(attention: dict) -> bool:
    """True when the spec's ``mask`` entry is ``"sliding"``."""
    mask_name = attention.get("mask")
    return mask_name == "sliding"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def is_global(attention: dict) -> bool:
    """True when the spec's ``mask`` entry is ``"global"``."""
    mask_name = attention.get("mask")
    return mask_name == "global"
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def kv_shared(attention: dict) -> bool:
    """True when the layer reuses K/V from an earlier layer (Gemma 4 small).

    Any non-``None`` ``kv_source_layer`` counts, including layer index 0.
    """
    source_layer = attention.get("kv_source_layer")
    return source_layer is not None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def activation_label(name: str | None) -> str:
    """Map a config activation name to the label shown in diagrams.

    Configs often carry backend-specific spellings such as
    ``gelu_pytorch_tanh``; the diagram should name the mathematical
    operation rather than the implementation detail.

    The name is stripped, lower-cased and has ``-`` replaced by ``_`` before
    lookup. Unknown names starting with ``gelu`` become ``"GELU"``, other
    unknown names are title-cased with underscores turned into spaces, and
    an empty or missing name yields ``"Activation"``.
    """
    normalized = (name or "").strip().lower().replace("-", "_")
    try:
        return _ACTIVATION_LABELS[normalized]
    except KeyError:
        pass
    if not normalized:
        return "Activation"
    if normalized.startswith("gelu"):
        return "GELU"
    return normalized.replace("_", " ").title()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def describe_attention(attention: dict) -> str:
    """Build a ``"; "``-separated description for tooltips and cards.

    The leading clauses depend on the attention kind (MLA, GQA, MQA, with
    anything else described as multi-head). A sliding-window clause is
    appended for any kind when the mask is sliding and a window size is set.
    """
    kind = attention.get("kind")
    heads = attention.get("num_heads")

    if kind == "mla":
        clauses = [
            "Multi-head latent attention",
            f"{heads} heads",
            f"KV LoRA {_fmt_int(attention.get('kv_lora_rank'))}",
        ]
        q_rank = attention.get("q_lora_rank")
        if q_rank:
            clauses.append(f"Q LoRA {_fmt_int(q_rank)}")
    elif kind == "gqa":
        clauses = [
            "Grouped-query",
            f"{heads} Q / {attention.get('num_kv_heads')} KV heads",
            f"head dim {_fmt_int(attention.get('head_dim'))}",
        ]
    elif kind == "mqa":
        clauses = ["Multi-query", f"{heads} Q / 1 KV head"]
    else:
        clauses = [
            "Multi-head",
            f"{heads} heads",
            f"head dim {_fmt_int(attention.get('head_dim'))}",
        ]

    window = attention.get("window_size")
    if is_sliding(attention) and window:
        clauses.append(f"sliding window {_fmt_int(window)}")
    return "; ".join(clauses)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def describe_ffn(ffn: dict) -> str:
    """Build a multi-clause description of an FFN or MoE block.

    Dense blocks report gating, activation and hidden width. MoE blocks
    report expert count, top-k routing, any shared experts, the share of
    routed experts active per token (when both counts are set), and the
    per-expert hidden width.
    """
    if ffn.get("kind") != "moe":
        prefix = "gated " if ffn.get("gated") else ""
        activation = activation_label(ffn.get("activation"))
        width = _fmt_int(ffn.get("intermediate_size"))
        return f"{prefix}FFN; {activation}; hidden {width}"

    experts = ffn.get("num_experts")
    top_k = ffn.get("num_experts_per_tok")
    shared = ffn.get("num_shared_experts")

    pieces = [f"MoE; {_fmt_int(experts)} experts; top-{top_k}"]
    if shared:
        pieces.append(f" + {shared} shared")
    if experts and top_k:
        pieces.append(f"; {100 * top_k / experts:.1f}% active")
    # Fall back to the block-level width when no expert-specific one is set.
    expert_width = ffn.get("expert_intermediate_size") or ffn.get("intermediate_size")
    pieces.append(f"; expert hidden {_fmt_int(expert_width)}")
    return "".join(pieces)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _fmt_int(value) -> str:
|
|
161
|
+
if value is None:
|
|
162
|
+
return "?"
|
|
163
|
+
try:
|
|
164
|
+
return f"{int(value):,}"
|
|
165
|
+
except (TypeError, ValueError):
|
|
166
|
+
return str(value)
|
model_unfolder/params.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Rough parameter-count estimation from an IR.
|
|
2
|
+
|
|
3
|
+
These are *estimates*. We don't try to model every implementation detail
|
|
4
|
+
(bias terms, MLA's exact projection layout, expert grouping, etc.) — just
|
|
5
|
+
enough to give the right order of magnitude and a useful active/total split
|
|
6
|
+
for MoE models.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
from .ir import ModelIR, AttentionSpec, FFNSpec
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _attn_params(a: AttentionSpec, hidden: int) -> int:
|
|
13
|
+
h = hidden
|
|
14
|
+
head_dim = a.head_dim or (h // max(a.num_heads, 1))
|
|
15
|
+
nq = a.num_heads
|
|
16
|
+
nkv = a.num_kv_heads or nq
|
|
17
|
+
|
|
18
|
+
if a.kind == "mla":
|
|
19
|
+
# Q path: optional LoRA down then up to (nope+rope)*nq
|
|
20
|
+
q_out = nq * head_dim
|
|
21
|
+
if a.q_lora_rank:
|
|
22
|
+
q = h * a.q_lora_rank + a.q_lora_rank * q_out
|
|
23
|
+
else:
|
|
24
|
+
q = h * q_out
|
|
25
|
+
# KV path: hidden -> (kv_lora_rank + rope_dim), then up to nq*(nope+v)
|
|
26
|
+
kv_lora = a.kv_lora_rank or 0
|
|
27
|
+
rope = a.rope_dim or 0
|
|
28
|
+
kv_down = h * (kv_lora + rope)
|
|
29
|
+
kv_up = kv_lora * (nq * head_dim * 2) # K nope + V — rough
|
|
30
|
+
kv = kv_down + kv_up
|
|
31
|
+
out = nq * head_dim * h
|
|
32
|
+
return q + kv + out
|
|
33
|
+
|
|
34
|
+
qkv = h * (nq + 2 * nkv) * head_dim
|
|
35
|
+
out = nq * head_dim * h
|
|
36
|
+
return qkv + out
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _ffn_params(f: FFNSpec, hidden: int) -> tuple:
|
|
40
|
+
"""Returns (total_params, active_params_per_token)."""
|
|
41
|
+
g = 3 if f.gated else 2
|
|
42
|
+
if f.kind == "moe":
|
|
43
|
+
per_expert = g * hidden * (f.expert_intermediate_size or f.intermediate_size)
|
|
44
|
+
n_routed = f.num_experts or 0
|
|
45
|
+
n_shared = f.num_shared_experts or 0
|
|
46
|
+
n_active = f.num_experts_per_tok or 0
|
|
47
|
+
router = hidden * n_routed
|
|
48
|
+
total = per_expert * (n_routed + n_shared) + router
|
|
49
|
+
active = per_expert * (n_active + n_shared) + router
|
|
50
|
+
return total, active
|
|
51
|
+
p = g * hidden * f.intermediate_size
|
|
52
|
+
return p, p
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def estimate_params(ir: ModelIR) -> dict:
    """Estimate parameter counts for a whole model.

    Returns a dict::

        {
            "total": int,       # embeddings + output head + final norm + layers
            "active": int,      # same, but with per-token active FFN counts
            "embed": int,       # vocab_size * hidden_size
            "output": int,      # 0 when word embeddings are tied
            "per_layer": [
                {"total": int, "active": int, "attn": int, "ffn": int},
                ...
            ],
            "is_sparse": bool,  # True if any layer has an MoE FFN
        }

    In each ``per_layer`` entry ``ffn`` is the block's total (not active)
    count. Every layer is charged ``2 * hidden_size`` for norms and the
    model one further ``hidden_size`` for the final norm.
    """
    hidden = ir.hidden_size
    vocab = ir.vocab_size

    embed = vocab * hidden
    output = 0 if ir.tie_word_embeddings else vocab * hidden
    final_norm = hidden

    per_layer = []
    is_sparse = False
    for spec in ir.layers:
        attn = _attn_params(spec.attention, hidden)
        ffn_total, ffn_active = _ffn_params(spec.ffn, hidden)
        if spec.ffn.kind == "moe":
            is_sparse = True
        norms = 2 * hidden
        per_layer.append(
            {
                "total": attn + ffn_total + norms,
                "active": attn + ffn_active + norms,
                "attn": attn,
                "ffn": ffn_total,
            }
        )

    layers_total = sum(entry["total"] for entry in per_layer)
    layers_active = sum(entry["active"] for entry in per_layer)
    shared_cost = embed + output + final_norm

    return {
        "total": shared_cost + layers_total,
        "active": shared_cost + layers_active,
        "embed": embed,
        "output": output,
        "per_layer": per_layer,
        "is_sparse": is_sparse,
    }
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def humanize(n: int) -> str:
    """Format a parameter count as e.g. '671B', '37.4B', '8.20M'.

    The mantissa gets two decimals below 10, one below 100 and none from 100
    up, with the bracket chosen on the *rounded* value. A mantissa that would
    round up to 1000 moves to the next unit, so 999_999 is '1.00M' rather
    than '1000K'. Counts below 1000 are printed as a plain integer and
    ``None`` yields '?'.
    """
    if n is None:
        return "?"
    n = float(n)
    units = (("T", 1e12), ("B", 1e9), ("M", 1e6), ("K", 1e3))
    for pos, (unit, scale) in enumerate(units):
        if n >= scale:
            v = n / scale
            # Pick the first precision whose rounded mantissa stays inside
            # its bracket, e.g. 99.96 -> '100', not '100.0'.
            for digits, limit in ((2, 10), (1, 100), (0, 1000)):
                if round(v, digits) < limit:
                    return f"{v:.{digits}f}{unit}"
            if pos == 0:
                # Largest unit: nothing to promote to.
                return f"{v:.0f}{unit}"
            bigger_unit, bigger_scale = units[pos - 1]
            return f"{n / bigger_scale:.2f}{bigger_unit}"
    return f"{int(n)}"
|
model_unfolder/parser.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Parse a HuggingFace config (or model ID) into our IR."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any
|
|
5
|
+
from .ir import ModelIR
|
|
6
|
+
from .adapters import find_adapter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Environment variables consulted, in this order, for a Hugging Face token.
HF_TOKEN_ENV_VARS = (
    "HF_TOKEN",
    "HUGGING_FACE_HUB_TOKEN",
    "HUGGINGFACE_HUB_TOKEN",
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def config_to_ir(cfg_or_id: Any, token: Any = None) -> ModelIR:
    """Turn a HuggingFace-shaped config into a ``ModelIR``.

    Accepted inputs:
      - a HuggingFace ``PretrainedConfig`` instance
      - a model ID string (e.g. ``"moonshotai/Kimi-K2-Instruct"``) — requires ``transformers``
      - a plain ``dict`` (the contents of ``config.json``)

    Parameters
    ----------
    cfg_or_id
        The config object, config dict or model ID to parse.
    token
        Optional Hugging Face token, used only when loading a config by
        model ID. When omitted, ``HF_TOKEN`` and the legacy Hugging Face
        token environment variables are used if present.

    Raises
    ------
    ValueError
        If no adapter recognises the config.
    """
    cfg = _coerce(cfg_or_id, token=token)
    adapter = find_adapter(cfg)
    if adapter is not None:
        return adapter.parse(cfg)

    # Report the architectures the config declared, whichever shape it has.
    if isinstance(cfg, dict):
        arches = cfg.get("architectures")
    else:
        arches = getattr(cfg, "architectures", None)
    raise ValueError(
        f"No adapter found for architecture {arches}. "
        "Pass a dict-like config or contribute an adapter."
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _coerce(cfg_or_id, token: Any = None):
|
|
46
|
+
if isinstance(cfg_or_id, dict):
|
|
47
|
+
return cfg_or_id
|
|
48
|
+
if isinstance(cfg_or_id, str):
|
|
49
|
+
try:
|
|
50
|
+
from transformers import AutoConfig
|
|
51
|
+
except ImportError as e:
|
|
52
|
+
raise ImportError(
|
|
53
|
+
"Loading a model by ID requires `transformers`. "
|
|
54
|
+
"Install with `pip install transformers`, or pass a config dict."
|
|
55
|
+
) from e
|
|
56
|
+
return _load_config_from_hf(AutoConfig, cfg_or_id, token=token)
|
|
57
|
+
return cfg_or_id
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _load_config_from_hf(auto_config: Any, model_id: str, token: Any = None):
    """Load a config by model ID, retrying with remote code if required.

    The first attempt never enables ``trust_remote_code``. If it fails with
    an error that looks like a request for remote code, a ``RuntimeWarning``
    is emitted and the load is retried with ``trust_remote_code=True``. Any
    other error is re-raised unchanged.

    Parameters
    ----------
    auto_config
        Object exposing ``from_pretrained`` (``transformers.AutoConfig``).
    model_id
        Model ID to load.
    token
        Optional explicit token; otherwise resolved from the environment.
    """
    import warnings

    auth_token = _resolve_hf_token(token)
    try:
        return _from_pretrained(auto_config, model_id, auth_token, trust_remote_code=False)
    except Exception as e:
        if not _should_retry_with_remote_code(e):
            raise

    # SECURITY: trust_remote_code=True executes Python code shipped in the
    # model repository. The retry is kept for compatibility, but it must not
    # happen silently, so the caller is warned first.
    warnings.warn(
        f"Loading the config for {model_id!r} requires custom code; retrying "
        "with trust_remote_code=True, which executes Python code from the "
        "model repository. Only do this for repositories you trust.",
        RuntimeWarning,
        stacklevel=2,
    )
    return _from_pretrained(auto_config, model_id, auth_token, trust_remote_code=True)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _from_pretrained(auto_config: Any, model_id: str, auth_token: Any, *, trust_remote_code: bool):
|
|
71
|
+
kwargs = {}
|
|
72
|
+
if trust_remote_code:
|
|
73
|
+
kwargs["trust_remote_code"] = True
|
|
74
|
+
if auth_token is None:
|
|
75
|
+
return auto_config.from_pretrained(model_id, **kwargs)
|
|
76
|
+
|
|
77
|
+
try:
|
|
78
|
+
return auto_config.from_pretrained(model_id, token=auth_token, **kwargs)
|
|
79
|
+
except Exception as e:
|
|
80
|
+
if not _should_retry_with_legacy_auth(e):
|
|
81
|
+
raise
|
|
82
|
+
return auto_config.from_pretrained(
|
|
83
|
+
model_id,
|
|
84
|
+
use_auth_token=auth_token,
|
|
85
|
+
**kwargs,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _resolve_hf_token(token: Any = None):
    """Pick the token to authenticate with, or ``None`` if there is none.

    An explicit ``token`` (anything other than ``None``) wins and is
    returned after cleaning, even if cleaning turns it into ``None``.
    Otherwise the variables in ``HF_TOKEN_ENV_VARS`` are checked in order
    and the first non-blank value is returned.
    """
    if token is not None:
        return _clean_token(token)

    from_env = (_clean_token(os.environ.get(var)) for var in HF_TOKEN_ENV_VARS)
    return next((found for found in from_env if found is not None), None)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _clean_token(token: Any):
|
|
100
|
+
if isinstance(token, str):
|
|
101
|
+
token = token.strip()
|
|
102
|
+
return token or None
|
|
103
|
+
return token
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _should_retry_with_legacy_auth(error: Exception) -> bool:
|
|
107
|
+
msg = str(error).lower()
|
|
108
|
+
return any(
|
|
109
|
+
marker in msg
|
|
110
|
+
for marker in (
|
|
111
|
+
"token",
|
|
112
|
+
"use_auth_token",
|
|
113
|
+
"authentication",
|
|
114
|
+
"authorization",
|
|
115
|
+
"unauthorized",
|
|
116
|
+
"forbidden",
|
|
117
|
+
"401",
|
|
118
|
+
"403",
|
|
119
|
+
"gated",
|
|
120
|
+
"private",
|
|
121
|
+
)
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _should_retry_with_remote_code(error: Exception) -> bool:
|
|
126
|
+
msg = str(error).lower()
|
|
127
|
+
return any(
|
|
128
|
+
marker in msg
|
|
129
|
+
for marker in (
|
|
130
|
+
"trust_remote_code",
|
|
131
|
+
"remote code",
|
|
132
|
+
"custom code",
|
|
133
|
+
"custom configuration",
|
|
134
|
+
"execute the configuration file",
|
|
135
|
+
"execute the repository",
|
|
136
|
+
)
|
|
137
|
+
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Rendering backends for model-unfolder diagrams."""
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Reusable rich block detail views for the HTML renderer."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from .attention import attention_card, attention_card_css
|
|
5
|
+
from .feed_forward import build_ffn_view, build_moe_view
|
|
6
|
+
from .per_layer_embedding import build_per_layer_embedding_view
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def block_detail_svg(ir: dict, info: dict, mount_id: str, block: dict) -> str | None:
    """Return the rich SVG for a clicked architecture block, if it has one.

    FFN blocks get the MoE or dense view, depending on the kind of the
    dominant layer's FFN spec. Blocks whose ``detail_view`` is
    ``"per_layer_embedding"`` get that view. Every other block has no rich
    view and yields ``None``.
    """
    if block.get("kind") == "ffn":
        ffn_spec = info["dominant"]["spec"]["ffn"]
        if ffn_spec.get("kind") == "moe":
            builder = build_moe_view
        else:
            builder = build_ffn_view
        return builder(ir, info, mount_id)

    wants_ple_view = block.get("detail_view") == "per_layer_embedding"
    if not wants_ple_view:
        return None
    return build_per_layer_embedding_view(ir, info, mount_id, block)
|