model-unfolder 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_unfolder/__init__.py +58 -0
- model_unfolder/adapters/__init__.py +15 -0
- model_unfolder/adapters/custom/__init__.py +8 -0
- model_unfolder/adapters/diffusor/__init__.py +8 -0
- model_unfolder/adapters/transformer/__init__.py +5 -0
- model_unfolder/adapters/transformer/assembly.py +57 -0
- model_unfolder/adapters/transformer/blocks.py +238 -0
- model_unfolder/adapters/transformer/common.py +35 -0
- model_unfolder/adapters/transformer/families/__init__.py +12 -0
- model_unfolder/adapters/transformer/families/deepseek.py +107 -0
- model_unfolder/adapters/transformer/families/gemma4.py +202 -0
- model_unfolder/adapters/transformer/families/llama.py +91 -0
- model_unfolder/adapters/transformer/special_parts/__init__.py +2 -0
- model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py +220 -0
- model_unfolder/diagram.py +95 -0
- model_unfolder/html_renderer.py +5 -0
- model_unfolder/ir.py +163 -0
- model_unfolder/labels.py +166 -0
- model_unfolder/params.py +119 -0
- model_unfolder/parser.py +137 -0
- model_unfolder/renderers/__init__.py +1 -0
- model_unfolder/renderers/html/__init__.py +5 -0
- model_unfolder/renderers/html/block_views/__init__.py +20 -0
- model_unfolder/renderers/html/block_views/attention.py +91 -0
- model_unfolder/renderers/html/block_views/feed_forward.py +213 -0
- model_unfolder/renderers/html/block_views/per_layer_embedding.py +199 -0
- model_unfolder/renderers/html/cards.py +130 -0
- model_unfolder/renderers/html/document.py +157 -0
- model_unfolder/renderers/html/interactions.py +64 -0
- model_unfolder/renderers/html/metadata.py +265 -0
- model_unfolder/renderers/html/sections.py +60 -0
- model_unfolder/renderers/html/styles.py +283 -0
- model_unfolder/renderers/html/svg.py +349 -0
- model_unfolder/renderers/html/theme.py +24 -0
- model_unfolder/renderers/html/utils.py +28 -0
- model_unfolder/renderers/html/views.py +461 -0
- model_unfolder-0.2.0.dist-info/METADATA +122 -0
- model_unfolder-0.2.0.dist-info/RECORD +41 -0
- model_unfolder-0.2.0.dist-info/WHEEL +5 -0
- model_unfolder-0.2.0.dist-info/licenses/LICENSE +201 -0
- model_unfolder-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""model_unfolder — turn any HuggingFace transformer into a clear architecture diagram.
|
|
2
|
+
|
|
3
|
+
Quick start in a Jupyter notebook::
|
|
4
|
+
|
|
5
|
+
from model_unfolder import unfold
|
|
6
|
+
unfold("moonshotai/Kimi-K2-Instruct")
|
|
7
|
+
|
|
8
|
+
Outside Jupyter::
|
|
9
|
+
|
|
10
|
+
diagram = unfold(cfg)
|
|
11
|
+
diagram.save("kimi_k2.html")
|
|
12
|
+
"""
|
|
13
|
+
from .diagram import Diagram
|
|
14
|
+
from .parser import config_to_ir
|
|
15
|
+
from .ir import ModelIR, LayerSpec, AttentionSpec, FFNSpec, CrossLayerEdge
|
|
16
|
+
from .params import estimate_params
|
|
17
|
+
|
|
18
|
+
# Version string exposed as ``model_unfolder.__version__``.
__version__ = "0.2.0"

# Names exported by ``from model_unfolder import *``.
__all__ = [
    # entry points (``show`` is an alias of ``unfold``)
    "unfold",
    "show",
    "Diagram",
    # intermediate-representation types re-exported from ``.ir``
    "ModelIR",
    "LayerSpec",
    "AttentionSpec",
    "FFNSpec",
    "CrossLayerEdge",
    # helpers re-exported from ``.parser`` and ``.params``
    "config_to_ir",
    "estimate_params",
]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def unfold(cfg_or_id, token=None) -> Diagram:
    """Turn a transformer configuration into a renderable ``Diagram``.

    Parameters
    ----------
    cfg_or_id
        One of: a HuggingFace ``PretrainedConfig`` instance, a model ID
        string such as ``"moonshotai/Kimi-K2-Instruct"``, or a plain
        ``dict`` holding the contents of a ``config.json``.
    token
        Optional Hugging Face token, forwarded to ``config_to_ir``. It is
        only relevant when ``cfg_or_id`` is a model ID; when omitted,
        ``HF_TOKEN`` and the legacy Hugging Face token environment
        variables are used if present.

    Returns
    -------
    Diagram
        Renders inline in Jupyter; elsewhere call ``.save()`` or
        ``.to_html()``.
    """
    return Diagram(config_to_ir(cfg_or_id, token=token))


# ``show`` is a friendly alias for ``unfold``.
show = unfold
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Adapter registry. Order matters: more specific adapters come first."""
|
|
2
|
+
from . import custom, diffusor, transformer
|
|
3
|
+
|
|
4
|
+
# Flattened registry in priority order: custom, then diffusor, then
# transformer adapters.
ADAPTERS = [
    adapter
    for group in (custom.ADAPTERS, diffusor.ADAPTERS, transformer.ADAPTERS)
    for adapter in group
]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def find_adapter(cfg):
    """Return the first adapter in ``ADAPTERS`` whose ``matches(cfg)`` is truthy.

    Adapters are tried in registry order; ``None`` is returned when none of
    them claims the config.
    """
    return next((adapter for adapter in ADAPTERS if adapter.matches(cfg)), None)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Assembly helpers for transformer-family adapters."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from collections.abc import Iterable, Mapping
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ...ir import AttentionSpec, FFNSpec, LayerSpec
|
|
8
|
+
from .blocks import decoder_layer_blocks, decoder_only_render_spec
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def decoder_layer(
    index: int,
    attention: AttentionSpec,
    ffn: FFNSpec,
    hidden_size: int,
    *,
    extra_blocks: Iterable[dict] | None = None,
) -> LayerSpec:
    """Assemble one decoder ``LayerSpec`` from its attention and FFN specs.

    The renderer blocks come from ``decoder_layer_blocks``; any
    ``extra_blocks`` supplied by the caller are appended after them.
    """
    layer_blocks = decoder_layer_blocks(attention, ffn, hidden_size)
    if extra_blocks:
        layer_blocks.extend(extra_blocks)
    return LayerSpec(index=index, attention=attention, ffn=ffn, blocks=layer_blocks)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def decoder_extras(
    vocab_size: int,
    hidden_size: int,
    tie_word_embeddings: bool,
    *extra_maps: Mapping[str, Any] | None,
) -> dict:
    """Return the top-level ``extras`` dict for a decoder-only model.

    The result always carries a ``"render"`` entry built by
    ``decoder_only_render_spec``. Each non-empty mapping in ``extra_maps``
    is then merged in, in argument order; ``None`` and empty mappings are
    ignored.
    """
    render_spec = decoder_only_render_spec(vocab_size, hidden_size, tie_word_embeddings)
    merged = {"render": render_spec}
    for mapping in filter(None, extra_maps):
        _merge_extras(merged, mapping)
    return merged
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _merge_extras(target: dict, extra: Mapping[str, Any]) -> None:
|
|
53
|
+
for key, value in extra.items():
|
|
54
|
+
if key == "external_pathways" and key in target:
|
|
55
|
+
target[key].extend(value)
|
|
56
|
+
else:
|
|
57
|
+
target[key] = value
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""Reusable transformer block descriptions for renderers.
|
|
2
|
+
|
|
3
|
+
Adapters attach these block parts to the IR. Renderers can then draw generic
|
|
4
|
+
decoder-only transformer layouts without re-discovering model-specific names
|
|
5
|
+
or labels with another layer of ``if model_type`` logic.
|
|
6
|
+
|
|
7
|
+
Each block carries two orthogonal tags:
|
|
8
|
+
|
|
9
|
+
* ``role`` — semantic (e.g. "input", "embedding", "norm", "attention", "ffn", "residual", "gate", "output") used
|
|
10
|
+
for tooltips, click handlers, and the inspect cards.
|
|
11
|
+
* ``kind`` — rendering shape ("norm", "linear", "activation", "attention",
|
|
12
|
+
"ffn", "residual_add", "gate_mul", "embedding", "output", "source") used
|
|
13
|
+
by the architecture view to pick a glyph and lay out a slot.
|
|
14
|
+
|
|
15
|
+
Edges between blocks travel on the destination side as plain string fields:
|
|
16
|
+
|
|
17
|
+
* ``residual_from: "<other_block_id>"`` — the residual_add block consumes the
|
|
18
|
+
*input* of the named block (the standard pre-attention bypass pattern).
|
|
19
|
+
* ``lane: "left" | "right"`` — the block is rendered off the central chain
|
|
20
|
+
and connected via ``tap_from`` / ``feeds``. Reusable parts such as
|
|
21
|
+
per-layer embeddings use this instead of model-specific renderer logic.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from ...labels import activation_label
|
|
26
|
+
from ...ir import AttentionSpec, FFNSpec
|
|
27
|
+
from .common import format_dim as _fmt
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def decoder_only_render_spec(vocab_size: int, hidden_size: int, tie_word_embeddings: bool) -> dict:
    """Return the render spec for a decoder-only transformer.

    The spec names the family and layout and carries the model-level
    blocks produced by ``decoder_model_blocks``.
    """
    model_blocks = decoder_model_blocks(vocab_size, hidden_size, tie_word_embeddings)
    return dict(family="transformer", layout="decoder_only", model_blocks=model_blocks)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def decoder_model_blocks(vocab_size: int, hidden_size: int, tie_word_embeddings: bool) -> list[dict]:
    """Describe the model-level blocks that surround the decoder layers.

    Returns four block dicts in order: token source, token embedding, final
    norm, and LM head. When ``tie_word_embeddings`` is true, the embedding
    and LM-head descriptions are annotated as tied.
    """
    vocab_text = _fmt(vocab_size)
    hidden_text = _fmt(hidden_size)
    if tie_word_embeddings:
        embed_suffix, head_suffix = " (tied with output)", " (tied)"
    else:
        embed_suffix = head_suffix = ""

    def block(block_id, role, kind, label, title, description):
        # One helper so every block shares the same key order.
        return {
            "id": block_id,
            "role": role,
            "kind": kind,
            "label": label,
            "title": title,
            "description": description,
        }

    return [
        block(
            "tok_text",
            "input",
            "source",
            "Tokenized text",
            "Tokenized text",
            "Input token IDs; shape [batch, seq_len]",
        ),
        block(
            "embed",
            "embedding",
            "embedding",
            "Token Embedding layer",
            "Token embedding",
            f"{vocab_text} x {hidden_text}{embed_suffix}",
        ),
        block(
            "final_rms",
            "norm",
            "norm",
            "Final RMSNorm",
            "Final norm",
            f"RMSNorm; dim {hidden_text}",
        ),
        block(
            "lm_head",
            "output",
            "output",
            "Linear output layer",
            "LM head",
            f"{hidden_text} -> {vocab_text}{head_suffix}",
        ),
    ]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def decoder_layer_blocks(attention: AttentionSpec, ffn: FFNSpec, hidden_size: int) -> list[dict]:
    """Describe one decoder layer as an ordered list of block dicts.

    The order is: norm, attention, residual add, norm, FFN (or MoE),
    residual add. Each residual add names, via ``residual_from``, the norm
    block it is paired with.
    """
    norm_text = f"RMSNorm; dim {_fmt(hidden_size)}"

    def norm(block_id, title):
        return {
            "id": block_id,
            "role": "norm",
            "kind": "norm",
            "label": "RMSNorm",
            "title": title,
            "description": norm_text,
        }

    def residual(block_id, source_id, description):
        return {
            "id": block_id,
            "role": "residual",
            "kind": "residual_add",
            "residual_from": source_id,
            "label": "+",
            "title": "Residual add",
            "description": description,
        }

    attention_block = {
        "id": "attn",
        "role": "attention",
        "kind": "attention",
        "label": attention_label(attention),
        "title": attention_title(attention),
        "description": describe_attention(attention),
    }

    if ffn.kind == "moe":
        ffn_label, ffn_title, ffn_view = "MoE", "Mixture of experts", "moe"
    else:
        ffn_label, ffn_title, ffn_view = "Feed-Forward", "Feed-forward", "gated_ffn"
    ffn_block = {
        "id": "ffn",
        "role": "ffn",
        "kind": "ffn",
        "label": ffn_label,
        "title": ffn_title,
        "description": describe_ffn(ffn),
        "detail_view": ffn_view,
        "children": ffn_child_blocks(ffn, hidden_size),
    }

    return [
        norm("rms1", "Pre-attention norm"),
        attention_block,
        residual("add1", "rms1", "block input + attention output"),
        norm("rms2", "Pre-FFN norm"),
        ffn_block,
        residual("add2", "rms2", "post-attention + FFN output"),
    ]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def ffn_child_blocks(ffn: FFNSpec, hidden_size: int) -> list[dict]:
    """List the child parts shown in the FFN detail view.

    Every FFN gets the five gated-FFN parts (gate/up projections,
    activation, multiply, down projection). For ``kind == "moe"`` a router,
    four representative expert entries and the weighted-sum block are
    appended. Missing expert counts fall back to the placeholders ``"N"``
    and ``"k"``.
    """
    hidden_text = _fmt(hidden_size)
    inner_text = _fmt(ffn.expert_intermediate_size or ffn.intermediate_size)
    act = activation_label(ffn.activation)

    def child(child_id, title, description):
        return {"id": child_id, "title": title, "description": description}

    parts = [
        child(
            "gate_proj",
            "Gate projection",
            f"Linear; {hidden_text} -> {inner_text} (gated path through {act})",
        ),
        child("up_proj", "Up projection", f"Linear; {hidden_text} -> {inner_text}"),
        child(
            "silu",
            f"{act} activation",
            "Element-wise non-linearity applied to the gate path",
        ),
        child(
            "mul",
            "Element-wise multiply",
            f"{act}(gate) x up; combines the gated and ungated paths",
        ),
        child("down_proj", "Down projection", f"Linear; {inner_text} -> {hidden_text}"),
    ]
    if ffn.kind != "moe":
        return parts

    expert_count = _fmt(ffn.num_experts) if ffn.num_experts else "N"
    active = ffn.num_experts_per_tok or "k"
    shared = ffn.num_shared_experts or 0

    expert_text = (
        f"Dense FFN; {hidden_text} -> {inner_text} -> {hidden_text}; "
        f"only top-{active} of {expert_count} active per token"
    )
    if shared:
        expert_text += f"; plus {shared} shared expert(s) always active"

    parts.append(
        child(
            "router",
            "Router",
            f"Linear; {hidden_text} -> {expert_count} (selects top-{active} experts per token)",
        )
    )
    parts.extend(
        child(expert_id, "Expert FFN", expert_text)
        for expert_id in ("expert_1", "expert_k", "expert_kp1", "expert_n")
    )
    parts.append(
        child(
            "add_moe",
            "Weighted sum",
            f"Combines top-{active} expert outputs weighted by router probabilities",
        )
    )
    return parts
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def attention_label(attention: AttentionSpec) -> list[str]:
    """Return the two-line glyph label for an attention block.

    The first line depends on ``attention.kind`` ("mla", "gqa" or "mqa");
    any other kind is labelled as plain multi-head attention.
    """
    first_lines = (
        ("mla", "Multi-Head Latent"),
        ("gqa", "Grouped-Query"),
        ("mqa", "Multi-Query"),
    )
    kind = attention.kind
    for tag, first_line in first_lines:
        if kind == tag:
            return [first_line, "Attention"]
    return ["Multi-Head", "Attention"]
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def attention_title(attention: AttentionSpec) -> str:
    """Return the title for an attention block.

    Known kinds ("mla", "gqa", "mqa") get a specific title; anything else
    is titled just ``"Attention"``.
    """
    known_titles = dict(
        mla="Multi-head latent attention",
        gqa="Grouped-query attention",
        mqa="Multi-query attention",
    )
    kind = attention.kind
    return known_titles[kind] if kind in known_titles else "Attention"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def describe_attention(attention: AttentionSpec) -> str:
    """Return a one-line, ``"; "``-separated summary of an attention spec.

    The fields shown depend on ``attention.kind``: LoRA ranks for "mla",
    Q/KV head counts and head dim for "gqa", the Q head count for "mqa",
    and head count plus head dim for anything else.
    """
    kind = attention.kind
    heads = attention.num_heads

    if kind == "mla":
        parts = [
            "Multi-head latent attention",
            f"{heads} heads",
            f"KV LoRA {_fmt(attention.kv_lora_rank)}",
        ]
        # The Q LoRA rank is optional; mention it only when set.
        if attention.q_lora_rank:
            parts.append(f"Q LoRA {_fmt(attention.q_lora_rank)}")
    elif kind == "gqa":
        parts = [
            "Grouped-query",
            f"{heads} Q / {attention.num_kv_heads} KV heads",
            f"head dim {_fmt(attention.head_dim)}",
        ]
    elif kind == "mqa":
        parts = ["Multi-query", f"{heads} Q / 1 KV head"]
    else:
        parts = [
            "Multi-head",
            f"{heads} heads",
            f"head dim {_fmt(attention.head_dim)}",
        ]
    return "; ".join(parts)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def describe_ffn(ffn: FFNSpec) -> str:
    """Return a one-line summary of a feed-forward spec.

    Dense FFNs are summarised as ``"[gated ]FFN; <activation>; hidden <n>"``.
    MoE FFNs list the expert count, the top-k routing width, any shared
    experts, the active fraction (when both counts are known) and the
    per-expert hidden size.
    """
    if ffn.kind != "moe":
        gated = "gated " if ffn.gated else ""
        return f"{gated}FFN; {activation_label(ffn.activation)}; hidden {_fmt(ffn.intermediate_size)}"

    # Use the same "k" placeholder as ``ffn_child_blocks`` so an unset
    # routing width never renders as "top-None" / "top-0".
    top_k = ffn.num_experts_per_tok or "k"
    text = f"MoE; {_fmt(ffn.num_experts)} experts; top-{top_k}"
    if ffn.num_shared_experts:
        text += f" + {ffn.num_shared_experts} shared"
    if ffn.num_experts and ffn.num_experts_per_tok:
        text += f"; {100 * ffn.num_experts_per_tok / ffn.num_experts:.1f}% active"
    text += f"; expert hidden {_fmt(ffn.expert_intermediate_size or ffn.intermediate_size)}"
    return text
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Shared helpers for transformer-family config adapters."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_config_value(cfg: Any, name: str, default=None):
    """Read ``name`` from a config given as a mapping or as an object.

    Mappings (``dict`` and any other ``collections.abc.Mapping``) are read
    by key; everything else is read by attribute. ``default`` is returned
    when the key or attribute is missing.
    """
    # Local import keeps this helper self-contained within the module.
    from collections.abc import Mapping

    if isinstance(cfg, Mapping):
        return cfg.get(name, default)
    return getattr(cfg, name, default)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def architecture_name(cfg: Any, fallback: str) -> str:
    """Return a display name for the config's architecture.

    Preference order: the first entry of ``architectures``, then
    ``model_type``, then ``fallback``. A ``model_type`` that is present but
    empty or ``None`` also yields ``fallback``, so the result is never an
    empty string unless ``fallback`` is.
    """
    architectures = get_config_value(cfg, "architectures") or []
    if architectures:
        return architectures[0]
    return get_config_value(cfg, "model_type") or fallback
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def model_name(cfg: Any, fallback: str) -> str:
    """Return a short model name derived from the config's name or path.

    ``_name_or_path`` is tried first, then ``name_or_path``, then
    ``fallback``. Only the last ``/``-separated component is kept, so
    ``"org/model"`` becomes ``"model"``. Trailing slashes are ignored, and
    ``fallback`` is returned if no non-empty component remains.
    """
    name = (
        get_config_value(cfg, "_name_or_path")
        or get_config_value(cfg, "name_or_path")
        or fallback
    )
    if not name:
        return fallback
    # Strip trailing slashes first: "models/llama/" should give "llama",
    # not an empty string.
    leaf = str(name).rstrip("/").split("/")[-1]
    return leaf or fallback
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def format_dim(value: Any) -> str:
    """Format a dimension for adapter-authored metadata text.

    ``None`` becomes ``"?"``. Values convertible with ``int()`` are shown
    with thousands separators (``4096`` -> ``"4,096"``). Anything else is
    returned as ``str(value)``.
    """
    if value is None:
        return "?"
    try:
        return f"{int(value):,}"
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinite floats, which int() rejects with a
        # different exception than NaN or non-numeric strings.
        return str(value)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Transformer model-family adapters.
|
|
2
|
+
|
|
3
|
+
These modules translate family-specific HuggingFace config dialects into the
|
|
4
|
+
shared transformer IR pieces in ``model_unfolder.adapters.transformer``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from . import deepseek, gemma4, llama
|
|
8
|
+
|
|
9
|
+
# Order matters: more specific adapters first. ``gemma4`` claims its own
# top-level ``model_type`` / architecture and must run before the generic
# llama-family matcher.
# NOTE(review): presumably this list reaches ``find_adapter`` through the
# parent package's ``ADAPTERS``, where the first ``matches(cfg)`` hit wins —
# confirm against ``adapters/transformer/__init__.py``.
ADAPTERS = [deepseek, gemma4, llama]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Adapter for DeepSeek-V3 and Kimi K2 (which shares the architecture)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ....ir import AttentionSpec, FFNSpec, ModelIR
|
|
7
|
+
from ..assembly import decoder_extras, decoder_layer
|
|
8
|
+
from ..common import architecture_name, get_config_value as _g, model_name
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ``model_type`` values claimed by this adapter.
_FAMILIES = {"deepseek_v3", "deepseek_v2", "kimi"}


def matches(cfg: Any) -> bool:
    """Return whether ``cfg`` belongs to the DeepSeek / Kimi family.

    A config matches when any entry of ``architectures`` contains
    "DeepseekV3", "DeepseekV2" or "Kimi", or when ``model_type`` is one of
    ``_FAMILIES``.
    """
    architectures = _g(cfg, "architectures") or []
    model_type = _g(cfg, "model_type", "")
    for arch in architectures:
        for marker in ("DeepseekV3", "DeepseekV2", "Kimi"):
            if marker in arch:
                return True
    return model_type in _FAMILIES
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def parse(cfg: Any) -> ModelIR:
    """Build a ``ModelIR`` from a DeepSeek-V2/V3 or Kimi K2 style config.

    Every layer gets the same attention spec: kind ``"mla"`` when
    ``kv_lora_rank`` is set, otherwise ``"gqa"``. The head dim is the sum
    of the no-RoPE and RoPE query/key dims when either is set, else the
    config's ``head_dim``.

    A layer's FFN is MoE when the config declares routed experts, the layer
    index is at or past ``first_k_dense_replace``, and the index is a
    multiple of ``moe_layer_freq``. This mirrors the layer-selection rule
    in the reference DeepSeek modeling code. All other layers get a dense
    gated FFN.

    ``v_head_dim``, ``first_k_dense_replace`` and ``moe_layer_freq`` are
    passed through unchanged in ``ModelIR.extras``.
    """
    # ``or 0`` guards against keys that are present but ``None``.
    num_layers = _g(cfg, "num_hidden_layers", 0) or 0
    hidden_size = _g(cfg, "hidden_size", 0)
    num_heads = _g(cfg, "num_attention_heads", 0)
    num_kv_heads = _g(cfg, "num_key_value_heads", num_heads)

    kv_lora_rank = _g(cfg, "kv_lora_rank")
    q_lora_rank = _g(cfg, "q_lora_rank")
    qk_rope_head_dim = _g(cfg, "qk_rope_head_dim")
    qk_nope_head_dim = _g(cfg, "qk_nope_head_dim")
    v_head_dim = _g(cfg, "v_head_dim")

    attn_kind = "mla" if kv_lora_rank is not None else "gqa"
    head_dim = (qk_nope_head_dim or 0) + (qk_rope_head_dim or 0) or _g(cfg, "head_dim")

    intermediate_size = _g(cfg, "intermediate_size", 0)
    moe_intermediate_size = _g(cfg, "moe_intermediate_size", intermediate_size)
    n_routed_experts = _g(cfg, "n_routed_experts") or _g(cfg, "num_experts", 0) or 0
    n_shared_experts = _g(cfg, "n_shared_experts") or _g(cfg, "num_shared_experts", 0) or 0
    num_experts_per_tok = _g(cfg, "num_experts_per_tok", 0)
    first_k_dense_replace = _g(cfg, "first_k_dense_replace", 0)
    moe_layer_freq = _g(cfg, "moe_layer_freq", 1)
    activation = (_g(cfg, "hidden_act", "silu") or "silu").lower()

    # Normalised copies for layer selection; the raw config values still go
    # into ``extras`` below.
    dense_prefix = first_k_dense_replace or 0
    moe_stride = max(moe_layer_freq or 1, 1)

    arch_name = architecture_name(cfg, "deepseek_v3")

    layers = []
    for i in range(num_layers):
        attn = AttentionSpec(
            kind=attn_kind,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            kv_lora_rank=kv_lora_rank,
            q_lora_rank=q_lora_rank,
            rope_dim=qk_rope_head_dim,
            mask="causal",
        )
        # The reference rule tests the absolute layer index against the
        # stride, not the offset from the dense prefix.
        is_moe_layer = (
            n_routed_experts > 0
            and i >= dense_prefix
            and i % moe_stride == 0
        )
        if is_moe_layer:
            ffn = FFNSpec(
                kind="moe",
                activation=activation,
                intermediate_size=moe_intermediate_size,
                gated=True,
                num_experts=n_routed_experts,
                num_experts_per_tok=num_experts_per_tok,
                num_shared_experts=n_shared_experts,
                expert_intermediate_size=moe_intermediate_size,
            )
        else:
            ffn = FFNSpec(
                kind="dense",
                activation=activation,
                intermediate_size=intermediate_size,
                gated=True,
            )
        layers.append(decoder_layer(i, attn, ffn, hidden_size))

    vocab_size = _g(cfg, "vocab_size", 0)
    tie_word_embeddings = bool(_g(cfg, "tie_word_embeddings", False))
    return ModelIR(
        name=model_name(cfg, arch_name),
        architecture=arch_name,
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_position_embeddings=_g(cfg, "max_position_embeddings"),
        tie_word_embeddings=tie_word_embeddings,
        layers=layers,
        extras=decoder_extras(
            vocab_size,
            hidden_size,
            tie_word_embeddings,
            {
                "v_head_dim": v_head_dim,
                "first_k_dense_replace": first_k_dense_replace,
                "moe_layer_freq": moe_layer_freq,
            },
        ),
    )
|