model-unfolder 0.2.4__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/PKG-INFO +4 -3
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/README.md +3 -2
- model_unfolder-0.2.5/model_unfolder/adapters/transformer/families/__init__.py +14 -0
- model_unfolder-0.2.5/model_unfolder/adapters/transformer/families/gemma/__init__.py +25 -0
- model_unfolder-0.2.5/model_unfolder/adapters/transformer/families/gemma/gemma3.py +137 -0
- {model_unfolder-0.2.4/model_unfolder/adapters/transformer/families → model_unfolder-0.2.5/model_unfolder/adapters/transformer/families/gemma}/gemma4.py +4 -4
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/families/llama.py +1 -1
- model_unfolder-0.2.5/model_unfolder/adapters/transformer/families/minimax.py +99 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/sections.py +10 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/styles.py +6 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder.egg-info/PKG-INFO +4 -3
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder.egg-info/SOURCES.txt +4 -1
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/pyproject.toml +1 -1
- model_unfolder-0.2.4/model_unfolder/adapters/transformer/families/__init__.py +0 -13
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/LICENSE +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/custom/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/diffusor/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/assembly.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/blocks.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/common.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/families/deepseek.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/families/fallback.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/families/mistral.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/families/qwen.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/special_parts/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/diagram.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/html_renderer.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/ir.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/labels.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/params.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/parser.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/block_views/__init__.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/block_views/attention.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/block_views/feed_forward.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/block_views/per_layer_embedding.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/cards.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/document.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/interactions.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/metadata.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/svg.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/theme.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/utils.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/views.py +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder.egg-info/dependency_links.txt +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder.egg-info/requires.txt +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder.egg-info/top_level.txt +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/setup.cfg +0 -0
- {model_unfolder-0.2.4 → model_unfolder-0.2.5}/tests/test_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: model-unfolder
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
|
|
5
5
|
Author: model-unfolder contributors
|
|
6
6
|
License: Apache-2.0
|
|
@@ -111,8 +111,9 @@ Param estimates are close to published numbers — DeepSeek-V3 reports `~675B (~
|
|
|
111
111
|
| DeepSeek | DeepSeek-V2, DeepSeek-V3, Kimi K2 |
|
|
112
112
|
| Llama | Llama 3 / 3.1 / 3.2 / 3.3 |
|
|
113
113
|
| Mistral | Mistral 7B, Mixtral 8x7B / 8x22B, Mistral Medium 3.5 |
|
|
114
|
-
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6
|
|
115
|
-
| Gemma | Gemma 3, Gemma 4 (31B, E2B, E4B) |
|
|
114
|
+
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6 |
|
|
115
|
+
| Gemma | Gemma 3 / 3n, Gemma 4 (31B, E2B, E4B) |
|
|
116
|
+
| MiniMax | MiniMax-Text-01 |
|
|
116
117
|
|
|
117
118
|
### Diffusors
|
|
118
119
|
|
|
@@ -87,8 +87,9 @@ Param estimates are close to published numbers — DeepSeek-V3 reports `~675B (~
|
|
|
87
87
|
| DeepSeek | DeepSeek-V2, DeepSeek-V3, Kimi K2 |
|
|
88
88
|
| Llama | Llama 3 / 3.1 / 3.2 / 3.3 |
|
|
89
89
|
| Mistral | Mistral 7B, Mixtral 8x7B / 8x22B, Mistral Medium 3.5 |
|
|
90
|
-
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6
|
|
91
|
-
| Gemma | Gemma 3, Gemma 4 (31B, E2B, E4B) |
|
|
90
|
+
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6 |
|
|
91
|
+
| Gemma | Gemma 3 / 3n, Gemma 4 (31B, E2B, E4B) |
|
|
92
|
+
| MiniMax | MiniMax-Text-01 |
|
|
92
93
|
|
|
93
94
|
### Diffusors
|
|
94
95
|
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Transformer model-family adapters.
|
|
2
|
+
|
|
3
|
+
These modules translate family-specific HuggingFace config dialects into the
|
|
4
|
+
shared transformer IR pieces in ``model_unfolder.adapters.transformer``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from . import deepseek, fallback, llama, minimax, mistral, qwen
|
|
8
|
+
from . import gemma # gemma/ package — dispatches to gemma3/gemma4 internally
|
|
9
|
+
|
|
10
|
+
# Order matters: more specific adapters first.
|
|
11
|
+
# ``gemma`` must run before ``llama`` (llama previously caught "gemma" model_type).
|
|
12
|
+
# ``mistral``, ``qwen``, ``minimax`` before ``llama``.
|
|
13
|
+
# ``fallback`` always matches — must be last.
|
|
14
|
+
ADAPTERS = [deepseek, gemma, minimax, mistral, qwen, llama, fallback]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Gemma family adapter — dispatches to version-specific sub-adapters.
|
|
2
|
+
|
|
3
|
+
Sub-adapters (in priority order):
|
|
4
|
+
gemma4 — Gemma 4 (31B, E2B, E4B, 26B-A4B) model_type: gemma4 / gemma4_text
|
|
5
|
+
gemma3 — Gemma 3 / 3n (1B–27B, E2B, E4B) model_type: gemma3 / gemma3n
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from . import gemma3, gemma4
|
|
12
|
+
|
|
13
|
+
_SUB_ADAPTERS = [gemma4, gemma3]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def matches(cfg: Any) -> bool:
|
|
17
|
+
return any(a.matches(cfg) for a in _SUB_ADAPTERS)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def parse(cfg: Any) -> Any:
|
|
21
|
+
for a in _SUB_ADAPTERS:
|
|
22
|
+
if a.matches(cfg):
|
|
23
|
+
return a.parse(cfg)
|
|
24
|
+
# Should not be reached since matches() already confirmed one hit
|
|
25
|
+
return gemma3.parse(cfg)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Adapter for Google's Gemma 3 and Gemma 3n families.
|
|
2
|
+
|
|
3
|
+
Gemma 3 (1B / 4B / 12B / 27B) is a multimodal wrapper (model_type: "gemma3")
|
|
4
|
+
nesting the language model under text_config, with alternating
|
|
5
|
+
sliding-window and full-context attention layers controlled by
|
|
6
|
+
``sliding_window_pattern`` (every Nth layer is full).
|
|
7
|
+
|
|
8
|
+
Gemma 3n (E2B / E4B) is the nano variant (model_type: "gemma3n") with
|
|
9
|
+
Per-Layer Embeddings — structurally similar but with PLE conditioning.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from .....ir import AttentionSpec, FFNSpec, ModelIR
|
|
16
|
+
from ...assembly import decoder_extras, decoder_layer
|
|
17
|
+
from ...common import architecture_name, get_config_value as _g, model_name
|
|
18
|
+
from ...special_parts.per_layer_embedding import (
|
|
19
|
+
per_layer_embedding_blocks,
|
|
20
|
+
per_layer_embedding_extras,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_TOP_TYPES = {"gemma3", "gemma3n"}
|
|
25
|
+
_TEXT_TYPES = {"gemma3_text", "gemma3n_text"}
|
|
26
|
+
_ALL_TYPES = _TOP_TYPES | _TEXT_TYPES
|
|
27
|
+
_ARCH_HINTS = ("gemma3",)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def matches(cfg: Any) -> bool:
|
|
31
|
+
model_type = (_g(cfg, "model_type") or "").lower()
|
|
32
|
+
if model_type in _ALL_TYPES:
|
|
33
|
+
return True
|
|
34
|
+
arches = _g(cfg, "architectures") or []
|
|
35
|
+
return any(any(h in a.lower() for h in _ARCH_HINTS) for a in arches)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def parse(cfg: Any) -> ModelIR:
|
|
39
|
+
arch_name = architecture_name(cfg, "gemma3")
|
|
40
|
+
text_cfg = _text_config(cfg)
|
|
41
|
+
model_type = (_g(text_cfg, "model_type") or _g(cfg, "model_type") or "").lower()
|
|
42
|
+
|
|
43
|
+
num_layers = _g(text_cfg, "num_hidden_layers", 0)
|
|
44
|
+
hidden_size = _g(text_cfg, "hidden_size", 0)
|
|
45
|
+
num_heads = _g(text_cfg, "num_attention_heads", 0)
|
|
46
|
+
num_kv_heads = _g(text_cfg, "num_key_value_heads", num_heads)
|
|
47
|
+
head_dim = _g(text_cfg, "head_dim") or (hidden_size // num_heads if num_heads else None)
|
|
48
|
+
intermediate_size = _g(text_cfg, "intermediate_size", 0)
|
|
49
|
+
activation = (_g(text_cfg, "hidden_activation") or _g(text_cfg, "hidden_act") or "gelu").lower()
|
|
50
|
+
|
|
51
|
+
if num_kv_heads == num_heads:
|
|
52
|
+
attn_kind = "mha"
|
|
53
|
+
elif num_kv_heads == 1:
|
|
54
|
+
attn_kind = "mqa"
|
|
55
|
+
else:
|
|
56
|
+
attn_kind = "gqa"
|
|
57
|
+
|
|
58
|
+
sliding_window = _g(text_cfg, "sliding_window")
|
|
59
|
+
# Gemma 3: every sliding_window_pattern-th layer is full, rest are sliding
|
|
60
|
+
sliding_pattern = _g(text_cfg, "sliding_window_pattern")
|
|
61
|
+
layer_types = _g(text_cfg, "layer_types") or []
|
|
62
|
+
|
|
63
|
+
# PLE (Gemma 3n)
|
|
64
|
+
ple_dim = _g(text_cfg, "hidden_size_per_layer_input") or 0
|
|
65
|
+
ple_vocab = _g(text_cfg, "vocab_size_per_layer_input") or _g(text_cfg, "vocab_size", 0)
|
|
66
|
+
|
|
67
|
+
layers = []
|
|
68
|
+
for i in range(num_layers):
|
|
69
|
+
if layer_types and i < len(layer_types):
|
|
70
|
+
lt = layer_types[i]
|
|
71
|
+
if "sliding" in lt:
|
|
72
|
+
mask, win = "sliding", sliding_window
|
|
73
|
+
else:
|
|
74
|
+
mask, win = "global", None
|
|
75
|
+
elif sliding_pattern and sliding_window:
|
|
76
|
+
is_full = (i % sliding_pattern) == (sliding_pattern - 1)
|
|
77
|
+
mask = "global" if is_full else "sliding"
|
|
78
|
+
win = None if is_full else sliding_window
|
|
79
|
+
elif sliding_window:
|
|
80
|
+
mask, win = "sliding", sliding_window
|
|
81
|
+
else:
|
|
82
|
+
mask, win = "causal", None
|
|
83
|
+
|
|
84
|
+
attn = AttentionSpec(
|
|
85
|
+
kind=attn_kind,
|
|
86
|
+
num_heads=num_heads,
|
|
87
|
+
num_kv_heads=num_kv_heads,
|
|
88
|
+
head_dim=head_dim,
|
|
89
|
+
mask=mask,
|
|
90
|
+
window_size=win,
|
|
91
|
+
)
|
|
92
|
+
ffn = FFNSpec(
|
|
93
|
+
kind="dense",
|
|
94
|
+
activation=activation,
|
|
95
|
+
intermediate_size=intermediate_size,
|
|
96
|
+
gated=True,
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
extra_blocks = []
|
|
100
|
+
if ple_dim:
|
|
101
|
+
extra_blocks.extend(
|
|
102
|
+
per_layer_embedding_blocks(hidden_size, ple_dim, activation="gelu")
|
|
103
|
+
)
|
|
104
|
+
layers.append(decoder_layer(i, attn, ffn, hidden_size, extra_blocks=extra_blocks))
|
|
105
|
+
|
|
106
|
+
vocab_size = _g(text_cfg, "vocab_size", 0) or _g(cfg, "vocab_size", 0)
|
|
107
|
+
tie_word_embeddings = bool(
|
|
108
|
+
_g(text_cfg, "tie_word_embeddings", _g(cfg, "tie_word_embeddings", False))
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
extras = decoder_extras(
|
|
112
|
+
vocab_size,
|
|
113
|
+
hidden_size,
|
|
114
|
+
tie_word_embeddings,
|
|
115
|
+
per_layer_embedding_extras(hidden_size, ple_dim, ple_vocab, num_layers)
|
|
116
|
+
if ple_dim else None,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
return ModelIR(
|
|
120
|
+
name=model_name(cfg, arch_name),
|
|
121
|
+
architecture=arch_name,
|
|
122
|
+
vocab_size=vocab_size,
|
|
123
|
+
hidden_size=hidden_size,
|
|
124
|
+
max_position_embeddings=_g(text_cfg, "max_position_embeddings"),
|
|
125
|
+
tie_word_embeddings=tie_word_embeddings,
|
|
126
|
+
layers=layers,
|
|
127
|
+
extras=extras,
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _text_config(cfg: Any) -> Any:
|
|
132
|
+
model_type = (_g(cfg, "model_type") or "").lower()
|
|
133
|
+
if model_type in _TOP_TYPES:
|
|
134
|
+
sub = _g(cfg, "text_config")
|
|
135
|
+
if sub is not None:
|
|
136
|
+
return sub
|
|
137
|
+
return cfg
|
|
@@ -18,10 +18,10 @@ from __future__ import annotations
|
|
|
18
18
|
|
|
19
19
|
from typing import Any
|
|
20
20
|
|
|
21
|
-
from ....ir import AttentionSpec, CrossLayerEdge, FFNSpec, ModelIR
|
|
22
|
-
from ..assembly import decoder_extras, decoder_layer
|
|
23
|
-
from ..common import architecture_name, get_config_value as _g, model_name
|
|
24
|
-
from ..special_parts.per_layer_embedding import (
|
|
21
|
+
from .....ir import AttentionSpec, CrossLayerEdge, FFNSpec, ModelIR
|
|
22
|
+
from ...assembly import decoder_extras, decoder_layer
|
|
23
|
+
from ...common import architecture_name, get_config_value as _g, model_name
|
|
24
|
+
from ...special_parts.per_layer_embedding import (
|
|
25
25
|
per_layer_embedding_blocks,
|
|
26
26
|
per_layer_embedding_extras,
|
|
27
27
|
)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Adapter for MiniMax models (MiniMax-Text-01 and successors).
|
|
2
|
+
|
|
3
|
+
MiniMax-Text-01 is a 456B sparse-MoE hybrid:
|
|
4
|
+
* ``attn_type_list`` — per-layer int array: 0 = lightning (linear) attention,
|
|
5
|
+
1 = full softmax attention. Every 8th layer is full (7 linear + 1 full).
|
|
6
|
+
* MoE FFN: ``num_local_experts`` routed experts, ``num_experts_per_tok`` active.
|
|
7
|
+
* Flat config (no text_config wrapper).
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ....ir import AttentionSpec, FFNSpec, ModelIR
|
|
14
|
+
from ..assembly import decoder_extras, decoder_layer
|
|
15
|
+
from ..common import architecture_name, get_config_value as _g, model_name
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
_MODEL_TYPES = {"minimax_text_01", "minimax"}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def matches(cfg: Any) -> bool:
|
|
22
|
+
model_type = (_g(cfg, "model_type") or "").lower()
|
|
23
|
+
if model_type in _MODEL_TYPES:
|
|
24
|
+
return True
|
|
25
|
+
arches = _g(cfg, "architectures") or []
|
|
26
|
+
return any("minimax" in a.lower() for a in arches)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse(cfg: Any) -> ModelIR:
|
|
30
|
+
arch_name = architecture_name(cfg, "minimax")
|
|
31
|
+
|
|
32
|
+
num_layers = _g(cfg, "num_hidden_layers", 0)
|
|
33
|
+
num_heads = _g(cfg, "num_attention_heads", 0)
|
|
34
|
+
num_kv_heads = _g(cfg, "num_key_value_heads", num_heads)
|
|
35
|
+
hidden_size = _g(cfg, "hidden_size", 0)
|
|
36
|
+
head_dim = _g(cfg, "head_dim") or (hidden_size // num_heads if num_heads else None)
|
|
37
|
+
activation = (_g(cfg, "hidden_act") or "silu").lower()
|
|
38
|
+
|
|
39
|
+
if num_kv_heads == num_heads:
|
|
40
|
+
attn_kind = "mha"
|
|
41
|
+
elif num_kv_heads == 1:
|
|
42
|
+
attn_kind = "mqa"
|
|
43
|
+
else:
|
|
44
|
+
attn_kind = "gqa"
|
|
45
|
+
|
|
46
|
+
# 0 = lightning/linear attention, 1 = full softmax attention
|
|
47
|
+
attn_type_list = _g(cfg, "attn_type_list") or []
|
|
48
|
+
|
|
49
|
+
# MoE
|
|
50
|
+
num_experts = _g(cfg, "num_local_experts") or _g(cfg, "num_experts") or 0
|
|
51
|
+
num_experts_per_tok = _g(cfg, "num_experts_per_tok") or 0
|
|
52
|
+
intermediate_size = _g(cfg, "intermediate_size", 0)
|
|
53
|
+
is_moe = bool(num_experts)
|
|
54
|
+
|
|
55
|
+
layers = []
|
|
56
|
+
for i in range(num_layers):
|
|
57
|
+
attn_flag = attn_type_list[i] if i < len(attn_type_list) else 1
|
|
58
|
+
is_linear = attn_flag == 0
|
|
59
|
+
|
|
60
|
+
attn = AttentionSpec(
|
|
61
|
+
kind="linear" if is_linear else attn_kind,
|
|
62
|
+
num_heads=num_heads,
|
|
63
|
+
num_kv_heads=num_kv_heads,
|
|
64
|
+
head_dim=head_dim,
|
|
65
|
+
mask="causal",
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
if is_moe:
|
|
69
|
+
ffn = FFNSpec(
|
|
70
|
+
kind="moe",
|
|
71
|
+
activation=activation,
|
|
72
|
+
intermediate_size=intermediate_size,
|
|
73
|
+
gated=True,
|
|
74
|
+
num_experts=num_experts,
|
|
75
|
+
num_experts_per_tok=num_experts_per_tok,
|
|
76
|
+
expert_intermediate_size=intermediate_size,
|
|
77
|
+
)
|
|
78
|
+
else:
|
|
79
|
+
ffn = FFNSpec(
|
|
80
|
+
kind="dense",
|
|
81
|
+
activation=activation,
|
|
82
|
+
intermediate_size=intermediate_size,
|
|
83
|
+
gated=True,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
layers.append(decoder_layer(i, attn, ffn, hidden_size))
|
|
87
|
+
|
|
88
|
+
vocab_size = _g(cfg, "vocab_size", 0)
|
|
89
|
+
tie_word_embeddings = bool(_g(cfg, "tie_word_embeddings", False))
|
|
90
|
+
return ModelIR(
|
|
91
|
+
name=model_name(cfg, arch_name),
|
|
92
|
+
architecture=arch_name,
|
|
93
|
+
vocab_size=vocab_size,
|
|
94
|
+
hidden_size=hidden_size,
|
|
95
|
+
max_position_embeddings=_g(cfg, "max_position_embeddings"),
|
|
96
|
+
tie_word_embeddings=tie_word_embeddings,
|
|
97
|
+
layers=layers,
|
|
98
|
+
extras=decoder_extras(vocab_size, hidden_size, tie_word_embeddings),
|
|
99
|
+
)
|
|
@@ -26,6 +26,16 @@ def _header(ir: dict, info: dict) -> str:
|
|
|
26
26
|
badges.append(
|
|
27
27
|
f'<span class="uf-badge" title="{_attr(title)}">{_html(badge["text"])}</span>'
|
|
28
28
|
)
|
|
29
|
+
|
|
30
|
+
warnings = ir.get("warnings") or []
|
|
31
|
+
if warnings:
|
|
32
|
+
tooltip = " · ".join(warnings)
|
|
33
|
+
badges.append(
|
|
34
|
+
f'<span class="uf-badge uf-badge-warn" title="{_attr(tooltip)}">'
|
|
35
|
+
"⚠ partial config"
|
|
36
|
+
"</span>"
|
|
37
|
+
)
|
|
38
|
+
|
|
29
39
|
return f"""
|
|
30
40
|
<div class="uf-header">
|
|
31
41
|
<div class="uf-name">{_html(ir.get("name", "model"))}</div>
|
|
@@ -52,6 +52,12 @@ def _style(mount_id: str) -> str:
|
|
|
52
52
|
font-weight:600;
|
|
53
53
|
letter-spacing:0.02em;
|
|
54
54
|
}}
|
|
55
|
+
#{mount_id} .uf-badge-warn {{
|
|
56
|
+
background:#FEF3C7;
|
|
57
|
+
color:#92400E;
|
|
58
|
+
border:1px solid #FCD34D;
|
|
59
|
+
cursor:default;
|
|
60
|
+
}}
|
|
55
61
|
#{mount_id} .uf-stats {{
|
|
56
62
|
display:grid;
|
|
57
63
|
grid-template-columns:repeat(5,minmax(0,1fr));
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: model-unfolder
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter.
|
|
5
5
|
Author: model-unfolder contributors
|
|
6
6
|
License: Apache-2.0
|
|
@@ -111,8 +111,9 @@ Param estimates are close to published numbers — DeepSeek-V3 reports `~675B (~
|
|
|
111
111
|
| DeepSeek | DeepSeek-V2, DeepSeek-V3, Kimi K2 |
|
|
112
112
|
| Llama | Llama 3 / 3.1 / 3.2 / 3.3 |
|
|
113
113
|
| Mistral | Mistral 7B, Mixtral 8x7B / 8x22B, Mistral Medium 3.5 |
|
|
114
|
-
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6
|
|
115
|
-
| Gemma | Gemma 3, Gemma 4 (31B, E2B, E4B) |
|
|
114
|
+
| Qwen | Qwen2 / 2.5, Qwen2-MoE, Qwen3, Qwen3-MoE, Qwen3.5 / 3.6 |
|
|
115
|
+
| Gemma | Gemma 3 / 3n, Gemma 4 (31B, E2B, E4B) |
|
|
116
|
+
| MiniMax | MiniMax-Text-01 |
|
|
116
117
|
|
|
117
118
|
### Diffusors
|
|
118
119
|
|
|
@@ -23,10 +23,13 @@ model_unfolder/adapters/transformer/common.py
|
|
|
23
23
|
model_unfolder/adapters/transformer/families/__init__.py
|
|
24
24
|
model_unfolder/adapters/transformer/families/deepseek.py
|
|
25
25
|
model_unfolder/adapters/transformer/families/fallback.py
|
|
26
|
-
model_unfolder/adapters/transformer/families/gemma4.py
|
|
27
26
|
model_unfolder/adapters/transformer/families/llama.py
|
|
27
|
+
model_unfolder/adapters/transformer/families/minimax.py
|
|
28
28
|
model_unfolder/adapters/transformer/families/mistral.py
|
|
29
29
|
model_unfolder/adapters/transformer/families/qwen.py
|
|
30
|
+
model_unfolder/adapters/transformer/families/gemma/__init__.py
|
|
31
|
+
model_unfolder/adapters/transformer/families/gemma/gemma3.py
|
|
32
|
+
model_unfolder/adapters/transformer/families/gemma/gemma4.py
|
|
30
33
|
model_unfolder/adapters/transformer/special_parts/__init__.py
|
|
31
34
|
model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py
|
|
32
35
|
model_unfolder/renderers/__init__.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "model-unfolder"
|
|
7
|
-
version = "0.2.4"
|
|
7
|
+
version = "0.2.5"
|
|
8
8
|
description = "Unfold any HuggingFace transformer into an interactive architecture diagram, inline in Jupyter."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
"""Transformer model-family adapters.
|
|
2
|
-
|
|
3
|
-
These modules translate family-specific HuggingFace config dialects into the
|
|
4
|
-
shared transformer IR pieces in ``model_unfolder.adapters.transformer``.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from . import deepseek, fallback, gemma4, llama, mistral, qwen
|
|
8
|
-
|
|
9
|
-
# Order matters: more specific adapters first.
|
|
10
|
-
# ``gemma4`` must run before ``llama`` (which also matches ``gemma`` model_type).
|
|
11
|
-
# ``mistral`` and ``qwen`` must run before ``llama``.
|
|
12
|
-
# ``fallback`` always matches — must be last.
|
|
13
|
-
ADAPTERS = [deepseek, gemma4, mistral, qwen, llama, fallback]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/__init__.py
RENAMED
|
File without changes
|
{model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/assembly.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/adapters/transformer/families/qwen.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/block_views/__init__.py
RENAMED
|
File without changes
|
{model_unfolder-0.2.4 → model_unfolder-0.2.5}/model_unfolder/renderers/html/block_views/attention.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|