model-unfolder 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. model_unfolder/__init__.py +58 -0
  2. model_unfolder/adapters/__init__.py +15 -0
  3. model_unfolder/adapters/custom/__init__.py +8 -0
  4. model_unfolder/adapters/diffusor/__init__.py +8 -0
  5. model_unfolder/adapters/transformer/__init__.py +5 -0
  6. model_unfolder/adapters/transformer/assembly.py +57 -0
  7. model_unfolder/adapters/transformer/blocks.py +238 -0
  8. model_unfolder/adapters/transformer/common.py +35 -0
  9. model_unfolder/adapters/transformer/families/__init__.py +12 -0
  10. model_unfolder/adapters/transformer/families/deepseek.py +107 -0
  11. model_unfolder/adapters/transformer/families/gemma4.py +202 -0
  12. model_unfolder/adapters/transformer/families/llama.py +91 -0
  13. model_unfolder/adapters/transformer/special_parts/__init__.py +2 -0
  14. model_unfolder/adapters/transformer/special_parts/per_layer_embedding.py +220 -0
  15. model_unfolder/diagram.py +95 -0
  16. model_unfolder/html_renderer.py +5 -0
  17. model_unfolder/ir.py +163 -0
  18. model_unfolder/labels.py +166 -0
  19. model_unfolder/params.py +119 -0
  20. model_unfolder/parser.py +137 -0
  21. model_unfolder/renderers/__init__.py +1 -0
  22. model_unfolder/renderers/html/__init__.py +5 -0
  23. model_unfolder/renderers/html/block_views/__init__.py +20 -0
  24. model_unfolder/renderers/html/block_views/attention.py +91 -0
  25. model_unfolder/renderers/html/block_views/feed_forward.py +213 -0
  26. model_unfolder/renderers/html/block_views/per_layer_embedding.py +199 -0
  27. model_unfolder/renderers/html/cards.py +130 -0
  28. model_unfolder/renderers/html/document.py +157 -0
  29. model_unfolder/renderers/html/interactions.py +64 -0
  30. model_unfolder/renderers/html/metadata.py +265 -0
  31. model_unfolder/renderers/html/sections.py +60 -0
  32. model_unfolder/renderers/html/styles.py +283 -0
  33. model_unfolder/renderers/html/svg.py +349 -0
  34. model_unfolder/renderers/html/theme.py +24 -0
  35. model_unfolder/renderers/html/utils.py +28 -0
  36. model_unfolder/renderers/html/views.py +461 -0
  37. model_unfolder-0.2.0.dist-info/METADATA +122 -0
  38. model_unfolder-0.2.0.dist-info/RECORD +41 -0
  39. model_unfolder-0.2.0.dist-info/WHEEL +5 -0
  40. model_unfolder-0.2.0.dist-info/licenses/LICENSE +201 -0
  41. model_unfolder-0.2.0.dist-info/top_level.txt +1 -0
model_unfolder/ir.py ADDED
@@ -0,0 +1,163 @@
1
+ """
2
+ Intermediate Representation (IR) for transformer architectures.
3
+
4
+ The IR is the contract between parsers (which read HuggingFace configs)
5
+ and the renderer (which produces SVG/HTML). It is layer-aware to support
6
+ heterogeneous architectures (Gemma sliding-window patterns, DeepSeek
7
+ dense+MoE phase changes, YOCO/CLA cross-layer KV sharing, etc.).
8
+ """
9
+ from __future__ import annotations
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Optional
12
+
13
+
14
@dataclass
class AttentionSpec:
    """Describe the attention block of one layer.

    ``kind`` and ``num_heads`` are required. Every other field is optional:
    ``mask`` defaults to ``"causal"`` and the remaining fields to ``None``.
    """

    # Attention flavour: "mha" | "gqa" | "mqa" | "mla".
    kind: str
    num_heads: int
    num_kv_heads: Optional[int] = None
    head_dim: Optional[int] = None
    kv_lora_rank: Optional[int] = None
    q_lora_rank: Optional[int] = None
    rope_dim: Optional[int] = None
    # Masking pattern: "causal" | "sliding" | "chunked" | "global".
    mask: str = "causal"
    window_size: Optional[int] = None
    # Used for cross-layer KV sharing; None when the layer shares nothing.
    kv_source_layer: Optional[int] = None
27
+
28
+
29
@dataclass
class FFNSpec:
    """Describe the feed-forward block of one layer.

    The first three fields are required. The expert-related fields are
    optional and default to ``None`` (``num_shared_experts`` to ``0``).
    """

    # Block flavour: "dense" | "moe".
    kind: str
    # Activation name: "silu" | "gelu" | "relu" | "geglu" | "swiglu".
    activation: str
    intermediate_size: int
    # True for a SwiGLU/GeGLU style gated MLP.
    gated: bool = True
    num_experts: Optional[int] = None
    num_experts_per_tok: Optional[int] = None
    num_shared_experts: int = 0
    expert_intermediate_size: Optional[int] = None
40
+
41
+
42
@dataclass
class LayerSpec:
    """A single transformer layer; layers in one stack need not be alike."""

    index: int
    attention: AttentionSpec
    ffn: FFNSpec
    # Normalisation type: "rmsnorm" | "layernorm".
    norm_kind: str = "rmsnorm"
    # Where the norm sits: "pre" | "post" | "double".
    norm_placement: str = "pre"
    blocks: list = field(default_factory=list)

    def signature(self) -> tuple:
        """Return a hashable structural fingerprint for grouping layers.

        The tuple combines the attention kind/mask/window, whether K/V is
        taken from another layer, the FFN kind/gating/expert count and the
        normalisation settings. ``index`` and ``blocks`` are not part of it.
        """
        attn, ffn = self.attention, self.ffn
        shares_kv = attn.kv_source_layer is not None
        attention_part = (attn.kind, attn.mask, attn.window_size, shares_kv)
        ffn_part = (ffn.kind, ffn.gated, ffn.num_experts)
        norm_part = (self.norm_kind, self.norm_placement)
        return attention_part + ffn_part + norm_part
61
+
62
+
63
@dataclass
class CrossLayerEdge:
    """Record a dependency between two layers, such as KV cache sharing."""

    # Edge type, e.g. "kv_share".
    kind: str
    from_layer: int
    to_layer: int
    # What is shared across the edge, e.g. ["K", "V"].
    shared: list = field(default_factory=list)
70
+
71
+
72
@dataclass
class ModelIR:
    """Top-level intermediate representation of a complete model."""

    name: str
    # Architecture name, e.g. "DeepseekV3ForCausalLM".
    architecture: str
    vocab_size: int
    hidden_size: int
    max_position_embeddings: Optional[int]
    tie_word_embeddings: bool
    # Expected to hold LayerSpec instances.
    layers: list
    cross_layer_edges: list = field(default_factory=list)
    extras: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Project the IR onto plain dicts and lists.

        ``dataclasses.asdict`` is deliberately not used: it recursively
        deep-copies every nested dict/list, including render block metadata
        repeated for every layer. The IR is treated as immutable after
        parsing, so a direct structural projection is much cheaper and is
        enough for rendering. As a consequence ``extras`` is returned by
        reference, not copied.
        """
        layer_dicts = [_layer_to_dict(spec) for spec in self.layers]
        edge_dicts = [_cross_edge_to_dict(link) for link in self.cross_layer_edges]
        return dict(
            name=self.name,
            architecture=self.architecture,
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            max_position_embeddings=self.max_position_embeddings,
            tie_word_embeddings=self.tie_word_embeddings,
            layers=layer_dicts,
            cross_layer_edges=edge_dicts,
            extras=self.extras,
        )

    @property
    def num_layers(self) -> int:
        """Number of entries in ``layers``."""
        return len(self.layers)

    def layer_groups(self) -> list:
        """Run-length encode the layers by their signature.

        Returns a list of ``(signature, [layer indices])`` tuples. Only
        consecutive layers with equal signatures are merged into one run.
        """
        runs = []
        for spec in self.layers:
            fingerprint = spec.signature()
            # Open a new run unless the latest run has the same signature.
            if not (runs and runs[-1][0] == fingerprint):
                runs.append((fingerprint, []))
            runs[-1][1].append(spec.index)
        return runs
116
+
117
+
118
+ def _attention_to_dict(a: AttentionSpec) -> dict:
119
+ return {
120
+ "kind": a.kind,
121
+ "num_heads": a.num_heads,
122
+ "num_kv_heads": a.num_kv_heads,
123
+ "head_dim": a.head_dim,
124
+ "kv_lora_rank": a.kv_lora_rank,
125
+ "q_lora_rank": a.q_lora_rank,
126
+ "rope_dim": a.rope_dim,
127
+ "mask": a.mask,
128
+ "window_size": a.window_size,
129
+ "kv_source_layer": a.kv_source_layer,
130
+ }
131
+
132
+
133
+ def _ffn_to_dict(f: FFNSpec) -> dict:
134
+ return {
135
+ "kind": f.kind,
136
+ "activation": f.activation,
137
+ "intermediate_size": f.intermediate_size,
138
+ "gated": f.gated,
139
+ "num_experts": f.num_experts,
140
+ "num_experts_per_tok": f.num_experts_per_tok,
141
+ "num_shared_experts": f.num_shared_experts,
142
+ "expert_intermediate_size": f.expert_intermediate_size,
143
+ }
144
+
145
+
146
def _layer_to_dict(layer: LayerSpec) -> dict:
    """Project one layer onto a plain dict.

    The attention and FFN specs are converted with their own helpers;
    ``blocks`` is passed through by reference rather than copied.
    """
    projected = {"index": layer.index}
    projected["attention"] = _attention_to_dict(layer.attention)
    projected["ffn"] = _ffn_to_dict(layer.ffn)
    projected["norm_kind"] = layer.norm_kind
    projected["norm_placement"] = layer.norm_placement
    projected["blocks"] = layer.blocks
    return projected
155
+
156
+
157
+ def _cross_edge_to_dict(edge: CrossLayerEdge) -> dict:
158
+ return {
159
+ "kind": edge.kind,
160
+ "from_layer": edge.from_layer,
161
+ "to_layer": edge.to_layer,
162
+ "shared": edge.shared,
163
+ }
@@ -0,0 +1,166 @@
1
+ """Renderer-agnostic vocabulary for talking about transformer specs.
2
+
3
+ Anything that needs to refer to attention masks ("SWA", "full"), attention
4
+ kinds ("GQA"), or build a human description of an attention / FFN block goes
5
+ through this module. Keeping it in one place means changing the wording
6
+ (swap "SWA" for "Sliding", say) only requires editing this file — no scavenger
7
+ hunt across the renderer.
8
+
9
+ The functions are pure and operate on plain ``dict``-shaped specs (i.e. what
10
+ ``ModelIR.to_dict()`` produces), so they're equally useful inside the HTML
11
+ renderer, a future text/markdown renderer, or notebook utilities.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ _MASK_SHORT = {
16
+ "sliding": "SWA",
17
+ "global": "full",
18
+ "causal": "causal",
19
+ "chunked": "chunked",
20
+ }
21
+ _MASK_LONG = {
22
+ "sliding": "Sliding-window",
23
+ "global": "Full / global",
24
+ "causal": "Causal",
25
+ "chunked": "Chunked",
26
+ }
27
+ _MASK_TITLE = {
28
+ "sliding": "Sliding-window attention",
29
+ "global": "Full-context attention",
30
+ "causal": "Causal attention",
31
+ "chunked": "Chunked attention",
32
+ }
33
+
34
+ _KIND_SHORT = {"mla": "MLA", "gqa": "GQA", "mqa": "MQA", "mha": "MHA"}
35
+ _KIND_LONG = {
36
+ "mla": "Multi-head latent attention",
37
+ "gqa": "Grouped-query attention",
38
+ "mqa": "Multi-query attention",
39
+ "mha": "Multi-head attention",
40
+ }
41
+ _ACTIVATION_LABELS = {
42
+ "gelu": "GELU",
43
+ "gelu_new": "GELU",
44
+ "gelu_fast": "GELU",
45
+ "gelu_pytorch_tanh": "GELU",
46
+ "relu": "ReLU",
47
+ "silu": "SiLU",
48
+ "swish": "SiLU",
49
+ "geglu": "GEGLU",
50
+ "swiglu": "SwiGLU",
51
+ }
52
+
53
+
54
def mask_short(attention: dict) -> str:
    """Return the compact mask tag, e.g. ``"SWA"``, ``"full"`` or ``"causal"``.

    A missing or unrecognised ``mask`` value yields ``"causal"``.
    """
    mask = attention.get("mask", "causal")
    return _MASK_SHORT.get(mask, "causal")
57
+
58
+
59
def mask_long(attention: dict) -> str:
    """Return the readable mask label, e.g. ``"Sliding-window"``.

    A missing or unrecognised ``mask`` value yields ``"Causal"``.
    """
    mask = attention.get("mask", "causal")
    return _MASK_LONG.get(mask, "Causal")
62
+
63
+
64
def mask_title(attention: dict) -> str:
    """Return the tooltip-style mask description.

    A missing or unrecognised ``mask`` value yields ``"Causal attention"``.
    """
    mask = attention.get("mask", "causal")
    return _MASK_TITLE.get(mask, "Causal attention")
67
+
68
+
69
def mask_chip(attention: dict) -> str:
    """Return a one-line chip label for the mask.

    For sliding-window attention with a truthy ``window_size`` the formatted
    window is appended, e.g. ``"SWA 1,024"``; otherwise only the short tag
    is returned.
    """
    tag = mask_short(attention)
    size = attention.get("window_size")
    if not size or not is_sliding(attention):
        return tag
    return f"{tag} {_fmt_int(size)}"
76
+
77
+
78
def kind_short(attention: dict) -> str:
    """Return the attention-kind acronym; unknown or missing kinds give ``"MHA"``."""
    kind = attention.get("kind", "")
    return _KIND_SHORT.get(kind, "MHA")
80
+
81
+
82
def kind_long(attention: dict) -> str:
    """Return the spelled-out attention kind.

    Unknown or missing kinds give ``"Multi-head attention"``.
    """
    kind = attention.get("kind", "")
    return _KIND_LONG.get(kind, "Multi-head attention")
84
+
85
+
86
def is_sliding(attention: dict) -> bool:
    """Return True when the spec's ``mask`` is ``"sliding"``."""
    mask = attention.get("mask")
    return mask == "sliding"
88
+
89
+
90
def is_global(attention: dict) -> bool:
    """Return True when the spec's ``mask`` is ``"global"``."""
    mask = attention.get("mask")
    return mask == "global"
92
+
93
+
94
def kv_shared(attention: dict) -> bool:
    """Return True when the layer reuses K/V from an earlier layer (Gemma 4 small).

    Any ``kv_source_layer`` other than ``None`` counts, including layer ``0``.
    """
    source = attention.get("kv_source_layer")
    return source is not None
97
+
98
+
99
def activation_label(name: str | None) -> str:
    """Return the display label for an activation name taken from a config.

    Configs often expose backend-specific names such as
    ``gelu_pytorch_tanh``; diagrams should show the mathematical operation
    rather than the implementation detail. The name is stripped,
    lower-cased and has ``-`` replaced by ``_`` before lookup. Names that
    are not in the table but start with ``gelu`` map to ``"GELU"``, other
    unknown names are title-cased, and an empty or missing name gives
    ``"Activation"``.
    """
    normalized = (name or "").strip().lower().replace("-", "_")
    known = _ACTIVATION_LABELS.get(normalized)
    if known is not None:
        return known
    if normalized.startswith("gelu"):
        return "GELU"
    if not normalized:
        return "Activation"
    return normalized.replace("_", " ").title()
112
+
113
+
114
def describe_attention(attention: dict) -> str:
    """Build a multi-clause description for tooltips and cards.

    Clauses are joined with ``"; "``. The leading clauses depend on the
    attention kind (MLA, GQA, MQA, otherwise multi-head). A sliding-window
    clause is appended whenever the mask is sliding and ``window_size`` is
    truthy, regardless of the kind.
    """
    heads = attention.get("num_heads")
    flavour = attention.get("kind")

    if flavour == "mla":
        clauses = [
            "Multi-head latent attention",
            f"{heads} heads",
            f"KV LoRA {_fmt_int(attention.get('kv_lora_rank'))}",
        ]
        q_rank = attention.get("q_lora_rank")
        if q_rank:
            clauses.append(f"Q LoRA {_fmt_int(q_rank)}")
    elif flavour == "gqa":
        clauses = [
            "Grouped-query",
            f"{heads} Q / {attention.get('num_kv_heads')} KV heads",
            f"head dim {_fmt_int(attention.get('head_dim'))}",
        ]
    elif flavour == "mqa":
        clauses = ["Multi-query", f"{heads} Q / 1 KV head"]
    else:
        clauses = [
            "Multi-head",
            f"{heads} heads",
            f"head dim {_fmt_int(attention.get('head_dim'))}",
        ]

    window = attention.get("window_size")
    if is_sliding(attention) and window:
        clauses.append(f"sliding window {_fmt_int(window)}")
    return "; ".join(clauses)
144
+
145
+
146
def describe_ffn(ffn: dict) -> str:
    """Build a multi-clause description of an FFN or MoE block.

    Dense blocks read ``"[gated ]FFN; <activation>; hidden <size>"``. MoE
    blocks list the expert count, the top-k routing (plus shared experts if
    any), the active share when both counts are truthy, and the expert
    hidden size (falling back to ``intermediate_size``).
    """
    if ffn.get("kind") != "moe":
        prefix = "gated " if ffn.get("gated") else ""
        activation = activation_label(ffn.get("activation"))
        width = _fmt_int(ffn.get("intermediate_size"))
        return f"{prefix}FFN; {activation}; hidden {width}"

    routed = ffn.get("num_experts")
    top_k = ffn.get("num_experts_per_tok")
    shared = ffn.get("num_shared_experts")

    routing = f"top-{top_k}"
    if shared:
        routing += f" + {shared} shared"

    clauses = ["MoE", f"{_fmt_int(routed)} experts", routing]
    if routed and top_k:
        clauses.append(f"{100 * top_k / routed:.1f}% active")
    expert_width = ffn.get("expert_intermediate_size") or ffn.get("intermediate_size")
    clauses.append(f"expert hidden {_fmt_int(expert_width)}")
    return "; ".join(clauses)
158
+
159
+
160
+ def _fmt_int(value) -> str:
161
+ if value is None:
162
+ return "?"
163
+ try:
164
+ return f"{int(value):,}"
165
+ except (TypeError, ValueError):
166
+ return str(value)
@@ -0,0 +1,119 @@
1
+ """Rough parameter-count estimation from an IR.
2
+
3
+ These are *estimates*. We don't try to model every implementation detail
4
+ (bias terms, MLA's exact projection layout, expert grouping, etc.) — just
5
+ enough to give the right order of magnitude and a useful active/total split
6
+ for MoE models.
7
+ """
8
+ from __future__ import annotations
9
+ from .ir import ModelIR, AttentionSpec, FFNSpec
10
+
11
+
12
+ def _attn_params(a: AttentionSpec, hidden: int) -> int:
13
+ h = hidden
14
+ head_dim = a.head_dim or (h // max(a.num_heads, 1))
15
+ nq = a.num_heads
16
+ nkv = a.num_kv_heads or nq
17
+
18
+ if a.kind == "mla":
19
+ # Q path: optional LoRA down then up to (nope+rope)*nq
20
+ q_out = nq * head_dim
21
+ if a.q_lora_rank:
22
+ q = h * a.q_lora_rank + a.q_lora_rank * q_out
23
+ else:
24
+ q = h * q_out
25
+ # KV path: hidden -> (kv_lora_rank + rope_dim), then up to nq*(nope+v)
26
+ kv_lora = a.kv_lora_rank or 0
27
+ rope = a.rope_dim or 0
28
+ kv_down = h * (kv_lora + rope)
29
+ kv_up = kv_lora * (nq * head_dim * 2) # K nope + V — rough
30
+ kv = kv_down + kv_up
31
+ out = nq * head_dim * h
32
+ return q + kv + out
33
+
34
+ qkv = h * (nq + 2 * nkv) * head_dim
35
+ out = nq * head_dim * h
36
+ return qkv + out
37
+
38
+
39
+ def _ffn_params(f: FFNSpec, hidden: int) -> tuple:
40
+ """Returns (total_params, active_params_per_token)."""
41
+ g = 3 if f.gated else 2
42
+ if f.kind == "moe":
43
+ per_expert = g * hidden * (f.expert_intermediate_size or f.intermediate_size)
44
+ n_routed = f.num_experts or 0
45
+ n_shared = f.num_shared_experts or 0
46
+ n_active = f.num_experts_per_tok or 0
47
+ router = hidden * n_routed
48
+ total = per_expert * (n_routed + n_shared) + router
49
+ active = per_expert * (n_active + n_shared) + router
50
+ return total, active
51
+ p = g * hidden * f.intermediate_size
52
+ return p, p
53
+
54
+
55
def estimate_params(ir: ModelIR) -> dict:
    """Estimate parameter counts for a model.

    Returns a dict::

        {
            "total": int,        # all parameters
            "active": int,       # active per token (== total for non-MoE)
            "embed": int,
            "output": int,       # 0 when word embeddings are tied
            "per_layer": [
                {"total": int, "active": int, "attn": int, "ffn": int},
                ...
            ],
            "is_sparse": bool,   # True if any layer has an MoE FFN
        }

    Each layer is charged ``2 * hidden_size`` for its norms, and one final
    norm of ``hidden_size`` is added to both totals.
    """
    hidden = ir.hidden_size
    vocab = ir.vocab_size

    embed = vocab * hidden
    output = 0 if ir.tie_word_embeddings else vocab * hidden
    final_norm = hidden

    per_layer = []
    is_sparse = False
    for spec in ir.layers:
        attn = _attn_params(spec.attention, hidden)
        ffn_total, ffn_active = _ffn_params(spec.ffn, hidden)
        is_sparse = is_sparse or spec.ffn.kind == "moe"
        norms = 2 * hidden
        per_layer.append({
            "total": attn + ffn_total + norms,
            "active": attn + ffn_active + norms,
            "attn": attn,
            "ffn": ffn_total,
        })

    layers_total = sum(entry["total"] for entry in per_layer)
    layers_active = sum(entry["active"] for entry in per_layer)

    return {
        "total": embed + output + final_norm + layers_total,
        "active": embed + output + final_norm + layers_active,
        "embed": embed,
        "output": output,
        "per_layer": per_layer,
        "is_sparse": is_sparse,
    }
104
+
105
+
106
def humanize(n: int) -> str:
    """Format a parameter count as e.g. '671B', '37.4B', '8.20M'.

    Three significant digits are shown for values of 1,000 and above, with
    a T/B/M/K suffix; smaller values are printed as a truncated integer.
    ``None`` yields ``"?"``.

    The precision is chosen from the *rounded* value, so a number that
    rounds up across a boundary is shown in the next form: 999_999 gives
    '1.00M' rather than '1000K', and 9_999_000 gives '10.0M' rather than
    '10.00M'.
    """
    if n is None:
        return "?"
    n = float(n)
    units = (("T", 1e12), ("B", 1e9), ("M", 1e6), ("K", 1e3))
    for position, (unit, scale) in enumerate(units):
        if n < scale:
            continue
        v = n / scale
        if round(v, 2) < 10:
            return f"{v:.2f}{unit}"
        if round(v, 1) < 100:
            return f"{v:.1f}{unit}"
        # position == 0 is the largest unit: nothing to carry into.
        if round(v, 0) < 1000 or position == 0:
            return f"{v:.0f}{unit}"
        # Rounded up to 1000 of this unit: show it as 1.00 of the next one.
        return f"1.00{units[position - 1][0]}"
    return f"{int(n)}"
@@ -0,0 +1,137 @@
1
+ """Parse a HuggingFace config (or model ID) into our IR."""
2
+ from __future__ import annotations
3
+ import os
4
+ from typing import Any
5
+ from .ir import ModelIR
6
+ from .adapters import find_adapter
7
+
8
+
9
# Environment variables checked for a Hugging Face token, in lookup order.
HF_TOKEN_ENV_VARS = ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "HUGGINGFACE_HUB_TOKEN")
14
+
15
+
16
def config_to_ir(cfg_or_id: Any, token: Any = None) -> ModelIR:
    """Parse anything HF-shaped into an IR.

    Accepts:
      - A HuggingFace ``PretrainedConfig`` instance
      - A model ID string (e.g. ``"moonshotai/Kimi-K2-Instruct"``) — requires ``transformers``
      - A plain ``dict`` (the contents of ``config.json``)

    Parameters
    ----------
    token
        Optional Hugging Face token used only when loading a config by model ID.
        If omitted, ``HF_TOKEN`` and legacy Hugging Face token env vars are used
        when present.

    Raises
    ------
    ValueError
        If no adapter accepts the config.
    """
    cfg = _coerce(cfg_or_id, token=token)
    adapter = find_adapter(cfg)
    if adapter is not None:
        return adapter.parse(cfg)

    # No adapter matched: report the declared architectures in the error.
    if isinstance(cfg, dict):
        arches = cfg.get("architectures")
    else:
        arches = getattr(cfg, "architectures", None)
    raise ValueError(
        f"No adapter found for architecture {arches}. "
        "Pass a dict-like config or contribute an adapter."
    )
43
+
44
+
45
+ def _coerce(cfg_or_id, token: Any = None):
46
+ if isinstance(cfg_or_id, dict):
47
+ return cfg_or_id
48
+ if isinstance(cfg_or_id, str):
49
+ try:
50
+ from transformers import AutoConfig
51
+ except ImportError as e:
52
+ raise ImportError(
53
+ "Loading a model by ID requires `transformers`. "
54
+ "Install with `pip install transformers`, or pass a config dict."
55
+ ) from e
56
+ return _load_config_from_hf(AutoConfig, cfg_or_id, token=token)
57
+ return cfg_or_id
58
+
59
+
60
def _load_config_from_hf(auto_config: Any, model_id: str, token: Any = None):
    """Load a config by model ID, falling back to remote code if required.

    The first attempt runs with ``trust_remote_code=False``. If it fails with
    an error whose message indicates that the repository needs custom code,
    the load is retried once with ``trust_remote_code=True``. Any other error
    is re-raised unchanged.

    Parameters
    ----------
    auto_config
        Object exposing ``from_pretrained`` (``transformers.AutoConfig``).
    model_id
        Model identifier to load.
    token
        Optional explicit token; resolved from the environment when ``None``.

    Warns
    -----
    RuntimeWarning
        Emitted before the retry, because it executes repository code.
    """
    import warnings  # local import: only needed on the fallback path

    auth_token = _resolve_hf_token(token)
    try:
        return _from_pretrained(auto_config, model_id, auth_token, trust_remote_code=False)
    except Exception as e:
        if not _should_retry_with_remote_code(e):
            raise
        # SECURITY: trust_remote_code=True runs Python code shipped in the
        # model repository. The fallback is kept for compatibility, but it
        # must not happen silently, so the caller is warned first.
        warnings.warn(
            f"Config for {model_id!r} requires custom code; retrying with "
            "trust_remote_code=True, which executes code from the model "
            "repository. Only load models you trust.",
            RuntimeWarning,
            stacklevel=2,
        )
        return _from_pretrained(auto_config, model_id, auth_token, trust_remote_code=True)
68
+
69
+
70
+ def _from_pretrained(auto_config: Any, model_id: str, auth_token: Any, *, trust_remote_code: bool):
71
+ kwargs = {}
72
+ if trust_remote_code:
73
+ kwargs["trust_remote_code"] = True
74
+ if auth_token is None:
75
+ return auto_config.from_pretrained(model_id, **kwargs)
76
+
77
+ try:
78
+ return auto_config.from_pretrained(model_id, token=auth_token, **kwargs)
79
+ except Exception as e:
80
+ if not _should_retry_with_legacy_auth(e):
81
+ raise
82
+ return auto_config.from_pretrained(
83
+ model_id,
84
+ use_auth_token=auth_token,
85
+ **kwargs,
86
+ )
87
+
88
+
89
def _resolve_hf_token(token: Any = None):
    """Pick the token to authenticate with, or ``None`` if there is none.

    An explicit ``token`` always wins, even when cleaning reduces it to
    ``None``; the environment is consulted only when ``token`` is ``None``.
    Otherwise the first env var in ``HF_TOKEN_ENV_VARS`` with a non-blank
    value is used.
    """
    if token is not None:
        return _clean_token(token)

    candidates = (_clean_token(os.environ.get(name)) for name in HF_TOKEN_ENV_VARS)
    return next((value for value in candidates if value is not None), None)
97
+
98
+
99
+ def _clean_token(token: Any):
100
+ if isinstance(token, str):
101
+ token = token.strip()
102
+ return token or None
103
+ return token
104
+
105
+
106
+ def _should_retry_with_legacy_auth(error: Exception) -> bool:
107
+ msg = str(error).lower()
108
+ return any(
109
+ marker in msg
110
+ for marker in (
111
+ "token",
112
+ "use_auth_token",
113
+ "authentication",
114
+ "authorization",
115
+ "unauthorized",
116
+ "forbidden",
117
+ "401",
118
+ "403",
119
+ "gated",
120
+ "private",
121
+ )
122
+ )
123
+
124
+
125
+ def _should_retry_with_remote_code(error: Exception) -> bool:
126
+ msg = str(error).lower()
127
+ return any(
128
+ marker in msg
129
+ for marker in (
130
+ "trust_remote_code",
131
+ "remote code",
132
+ "custom code",
133
+ "custom configuration",
134
+ "execute the configuration file",
135
+ "execute the repository",
136
+ )
137
+ )
@@ -0,0 +1 @@
1
+ """Rendering backends for model-unfolder diagrams."""
@@ -0,0 +1,5 @@
1
+ """HTML/SVG rendering backend."""
2
+
3
+ from .document import render_document, render_fragment
4
+
5
+ __all__ = ["render_document", "render_fragment"]
@@ -0,0 +1,20 @@
1
+ """Reusable rich block detail views for the HTML renderer."""
2
+ from __future__ import annotations
3
+
4
+ from .attention import attention_card, attention_card_css
5
+ from .feed_forward import build_ffn_view, build_moe_view
6
+ from .per_layer_embedding import build_per_layer_embedding_view
7
+
8
+
9
def block_detail_svg(ir: dict, info: dict, mount_id: str, block: dict) -> str | None:
    """Return a rich SVG for a clicked architecture block, when one exists.

    FFN blocks get the MoE or dense view depending on the kind of the
    dominant layer's FFN spec (``info["dominant"]["spec"]["ffn"]``). Blocks
    whose ``detail_view`` is ``"per_layer_embedding"`` get that view. Every
    other block has no detail view and yields ``None``.
    """
    if block.get("kind") == "ffn":
        ffn_spec = info["dominant"]["spec"]["ffn"]
        builder = build_moe_view if ffn_spec.get("kind") == "moe" else build_ffn_view
        return builder(ir, info, mount_id)

    if block.get("detail_view") != "per_layer_embedding":
        return None
    return build_per_layer_embedding_view(ir, info, mount_id, block)