htmltree-view 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
htmltree/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # File: htmltree/__init__.py
4
+ # Author: Hadi Cahyadi <cumulus13@gmail.com>
5
+ # Date: 2026-06-28
6
+ # Description: htmltree-view — Visualize HTML DOM structure as a depth-limited, colorized ASCII tree.
7
+ # License: MIT
8
+
9
+ """
10
+ htmltree-view — Visualize HTML DOM structure as a depth-limited, colorized ASCII tree.
11
+
12
+ Quick start
13
+ -----------
14
+ >>> from htmltree import HtmlTree
15
+ >>> tree = HtmlTree(open("index.html").read(), max_depth=3)
16
+ >>> tree.print()
17
+
18
+ CLI
19
+ ---
20
+ htmltree index.html -d 3
21
+ htmltree https://example.com --no-text
22
+ echo '<div><p>hi</p></div>' | htmltree -
23
+ """
24
+
25
+ from .core import HtmlTree, TreeStats
26
+ from .cli import main
27
+
28
# Must match the "Version" field of the published distribution metadata.
__version__ = "0.2.1"
__all__ = ["HtmlTree", "TreeStats", "main"]
htmltree/cli.py ADDED
@@ -0,0 +1,297 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # File: htmltree/cli.py
4
+ # Author: Hadi Cahyadi <cumulus13@gmail.com>
5
+ # Date: 2026-06-28
6
+ # Description: htmltree CLI — visualize HTML structure as a depth-limited tree.
7
+ # License: MIT
8
+
9
+ """htmltree CLI — visualize HTML structure as a depth-limited tree."""
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import sys
14
+ import urllib.error
15
+ import urllib.request
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from .core import HtmlTree, VALID_PARSERS
20
+
21
+ _VERSION = "0.2.0"
22
+
23
+ _EPILOG = """
24
+ Examples:
25
+ htmltree index.html
26
+ htmltree index.html -d 3
27
+ htmltree index.html -d 2 --no-text
28
+ htmltree index.html -s "body > main"
29
+ htmltree index.html -s "#app" --attrs id class href
30
+ htmltree https://example.com -d 4
31
+ echo '<div><p>hi</p></div>' | htmltree -
32
+ htmltree index.html --no-color | less
33
+ htmltree index.html --show-comments -d 5
34
+
35
+ Selector examples:
36
+ -s body top-level <body>
37
+ -s "main > article" direct article children of main
38
+ -s "#root" element with id="root"
39
+ -s ".container" elements with class="container"
40
+ -s "table:first-of-type"
41
+ """
42
+
43
+
44
+ def _read_html(source: Optional[str]) -> str:
45
+ """Load HTML from a file path, URL, or stdin. Returns raw HTML string."""
46
+ # stdin
47
+ if source is None or source == "-":
48
+ if sys.stdin.isatty() and source is None:
49
+ return "" # caller will print help
50
+ try:
51
+ return sys.stdin.buffer.read().decode("utf-8", errors="replace")
52
+ except KeyboardInterrupt:
53
+ sys.exit(130)
54
+
55
+ # URL
56
+ if source.startswith(("http://", "https://")):
57
+ print(f"Fetching {source} …", file=sys.stderr)
58
+ req = urllib.request.Request(
59
+ source,
60
+ headers={"User-Agent": f"htmltree/{_VERSION} (python)"},
61
+ )
62
+ try:
63
+ with urllib.request.urlopen(req, timeout=20) as resp:
64
+ charset = "utf-8"
65
+ ct = resp.headers.get_content_charset()
66
+ if ct:
67
+ charset = ct
68
+ return resp.read().decode(charset, errors="replace")
69
+ except urllib.error.HTTPError as exc:
70
+ _die(f"HTTP {exc.code} fetching {source}: {exc.reason}")
71
+ except urllib.error.URLError as exc:
72
+ _die(f"Network error fetching {source}: {exc.reason}")
73
+ except Exception as exc: # pragma: no cover
74
+ _die(f"Error fetching {source}: {exc}")
75
+
76
+ # File
77
+ path = Path(source)
78
+ if not path.exists():
79
+ _die(f"File not found: {source}")
80
+ if not path.is_file():
81
+ _die(f"Not a file: {source}")
82
+ try:
83
+ return path.read_bytes().decode("utf-8", errors="replace")
84
+ except OSError as exc:
85
+ _die(f"Cannot read {source}: {exc}")
86
+
87
+
88
+ def _die(msg: str, code: int = 1) -> None:
89
+ print(f"htmltree: error: {msg}", file=sys.stderr)
90
+ sys.exit(code)
91
+
92
+
93
+ def _positive_int(value: str) -> int:
94
+ try:
95
+ n = int(value)
96
+ except ValueError:
97
+ raise argparse.ArgumentTypeError(f"{value!r} is not an integer")
98
+ if n < 0:
99
+ raise argparse.ArgumentTypeError(f"depth must be >= 0, got {n}")
100
+ return n
101
+
102
+
103
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``htmltree`` command-line parser.

    The parser takes one optional positional ``SOURCE`` plus options sorted
    into three help groups: "display" (what is rendered), "output" (colours,
    summary, destination) and "misc" (parser backend, version).

    Returns
    -------
    argparse.ArgumentParser
        A fully configured parser; nothing is parsed here.
    """
    backends = sorted(VALID_PARSERS)

    cli = argparse.ArgumentParser(
        prog="htmltree",
        description=(
            "Visualize HTML DOM structure as a depth-limited, colorized ASCII tree.\n"
            "Supports files, URLs, and stdin."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EPILOG,
    )
    cli.add_argument(
        "source",
        nargs="?",
        metavar="SOURCE",
        help=(
            'HTML source: file path, http/https URL, or "-" for stdin. '
            "Omit to read stdin (when piped)."
        ),
    )

    # ── display ──────────────────────────────────────────────────────────────
    add_display = cli.add_argument_group("display").add_argument
    add_display(
        "-d",
        "--depth",
        type=_positive_int,
        default=None,
        metavar="N",
        help=(
            "Max nesting depth to show (default: unlimited). "
            "Truncated levels show a child-count hint."
        ),
    )
    add_display(
        "-s",
        "--selector",
        default=None,
        metavar="CSS",
        help='CSS selector for the sub-tree root (e.g. "body", "#app", ".container").',
    )
    # default=True is the "flag absent" marker; a present flag always yields
    # a (possibly empty) list because of nargs="*".
    add_display(
        "--attrs",
        nargs="*",
        default=True,
        metavar="NAME",
        help=(
            "Attributes to display. "
            "Omit flag = show all. "
            "Pass names = show only those (e.g. --attrs id class href). "
            "--attrs with no names = hide all."
        ),
    )
    add_display("--no-text", action="store_true", help="Hide text nodes.")
    add_display(
        "--show-comments",
        action="store_true",
        help="Show HTML comment nodes.",
    )
    add_display(
        "--text-limit",
        type=_positive_int,
        default=60,
        metavar="N",
        help="Max characters shown per text node before truncation (default: 60).",
    )
    add_display(
        "--attr-limit",
        type=_positive_int,
        default=80,
        metavar="N",
        help="Max characters shown per attribute value before truncation (default: 80).",
    )

    # ── output ───────────────────────────────────────────────────────────────
    add_output = cli.add_argument_group("output").add_argument
    add_output(
        "--no-color",
        action="store_true",
        help="Disable ANSI colors (auto-disabled when output is not a TTY).",
    )
    add_output(
        "--force-color",
        action="store_true",
        help="Force ANSI colors even when piped (e.g. for `less -R`).",
    )
    add_output(
        "--no-summary",
        action="store_true",
        help="Suppress the stats summary footer.",
    )
    add_output(
        "-o",
        "--output",
        metavar="FILE",
        default=None,
        help="Write output to FILE instead of stdout.",
    )

    # ── misc ─────────────────────────────────────────────────────────────────
    add_misc = cli.add_argument_group("misc").add_argument
    add_misc(
        "--parser",
        default="html.parser",
        choices=backends,
        metavar="BACKEND",
        help=(
            f"BeautifulSoup parser backend. Choices: {backends}. "
            "Default: html.parser (always available). "
            "lxml is faster; html5lib is most spec-accurate."
        ),
    )
    add_misc("--version", action="version", version=f"%(prog)s {_VERSION}")

    return cli
220
+
221
+
222
def main(argv=None) -> int:
    """
    CLI entry point. Returns an exit code (0 = success).

    Parameters
    ----------
    argv : list[str] | None
        Argument list; defaults to sys.argv[1:].

    Notes
    -----
    Fatal problems (unparsable input, bad selector, unwritable output file)
    are reported through ``_die`` and end the process with a non-zero status.
    Empty input and a reader that closes the pipe early are not treated as
    errors.
    """
    import os  # local import: only needed for the broken-pipe cleanup below

    parser = _build_parser()
    args = parser.parse_args(argv)

    # ── show_attrs resolution ────────────────────────────────────────────────
    if args.attrs is True:
        show_attrs = True  # flag not given → show all
    elif not args.attrs:
        show_attrs = False  # --attrs with no names → hide all
    else:
        show_attrs = list(args.attrs)  # --attrs id class → filter list

    # ── read HTML ────────────────────────────────────────────────────────────
    html = _read_html(args.source)
    if not html:
        if args.source is None:
            parser.print_help(sys.stderr)
        else:
            print("htmltree: warning: empty input", file=sys.stderr)
        return 0

    # HtmlTree decides about colours by looking at sys.stdout. When the tree
    # goes to a file instead, stdout may still be a terminal, which would put
    # escape codes into the file; suppress them unless explicitly forced.
    to_file = bool(args.output)
    no_color = args.no_color or (to_file and not args.force_color)

    # ── build tree ───────────────────────────────────────────────────────────
    try:
        tree = HtmlTree(
            html,
            max_depth=args.depth,
            show_text=not args.no_text,
            show_comments=args.show_comments,
            show_attrs=show_attrs,
            parser=args.parser,
            no_color=no_color,
            force_color=args.force_color,
            text_limit=args.text_limit,
            attr_limit=args.attr_limit,
        )
    except ValueError as exc:
        _die(str(exc))

    # ── output ───────────────────────────────────────────────────────────────
    out_file = None
    try:
        if to_file:
            try:
                out_file = Path(args.output).open("w", encoding="utf-8")
            except OSError as exc:
                # Missing directory, permission denied, path is a directory…
                _die(f"Cannot write {args.output}: {exc}")

        try:
            tree.print(
                root_selector=args.selector,
                show_summary=not args.no_summary,
                file=out_file,
            )
            if out_file is None:
                # Surface a closed pipe here rather than at interpreter exit.
                sys.stdout.flush()
        except ValueError as exc:
            # Bad CSS selector, invalid parser, etc.
            _die(str(exc))
        except BrokenPipeError:
            # User piped to head/less and closed early — not an error.
            # Point stdout at devnull so the exit-time flush cannot raise a
            # second BrokenPipeError.
            if out_file is None:
                try:
                    os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno())
                except (OSError, ValueError):
                    # stdout has no real descriptor; nothing left to protect.
                    pass
    finally:
        if out_file is not None:
            out_file.close()

    return 0


if __name__ == "__main__":
    sys.exit(main())
htmltree/core.py ADDED
@@ -0,0 +1,546 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # File: htmltree/core.py
4
+ # Author: Hadi Cahyadi <cumulus13@gmail.com>
5
+ # Date: 2026-06-28
6
+ # Description: Core HTML tree visualization engine.
7
+ # License: MIT
8
+
9
+ """Core HTML tree visualization engine."""
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import sys
14
+ from dataclasses import dataclass, field
15
+ from typing import Iterator, List, Optional, Sequence, Union
16
+
17
+ from bs4 import BeautifulSoup, Comment, NavigableString, Tag
18
+
19
+ # ─── ANSI palette ────────────────────────────────────────────────────────────
20
+
21
def _fg(index: int) -> str:
    """Return the escape sequence selecting 256-colour foreground *index*."""
    return f"\033[38;5;{index}m"


# Text attributes.
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"

# Foreground colour per (lower-case) tag name; tags that are not listed are
# looked up with DEFAULT_TAG_COLOR as the fallback.
TAG_COLORS: dict[str, str] = {
    # document root
    "html": _fg(75),
    # head-section
    "head": _fg(111), "title": _fg(250),
    "meta": _fg(240), "link": _fg(240),
    "script": _fg(240), "style": _fg(240),
    "noscript": _fg(240), "base": _fg(240),
    # layout / landmark
    "body": _fg(111), "header": _fg(117),
    "footer": _fg(117), "main": _fg(117),
    "nav": _fg(117), "section": _fg(81),
    "article": _fg(81), "aside": _fg(81),
    "address": _fg(81), "dialog": _fg(81),
    # headings
    "h1": _fg(214), "h2": _fg(220),
    "h3": _fg(226), "h4": _fg(228),
    "h5": _fg(229), "h6": _fg(230),
    # block text
    "p": _fg(156), "div": _fg(147),
    "blockquote": _fg(159), "pre": _fg(159),
    "figure": _fg(159), "figcaption": _fg(159),
    "details": _fg(159), "summary": _fg(159),
    # inline text
    "span": _fg(189), "code": _fg(121),
    "em": _fg(189), "strong": _fg(189),
    "small": _fg(189), "mark": _fg(189),
    "abbr": _fg(189), "cite": _fg(189),
    "time": _fg(189), "kbd": _fg(121),
    "samp": _fg(121), "var": _fg(121),
    "sub": _fg(189), "sup": _fg(189),
    "del": _fg(189), "ins": _fg(189),
    # links & media
    "a": _fg(51),
    "img": _fg(208), "video": _fg(208),
    "audio": _fg(208), "picture": _fg(208),
    "source": _fg(208), "track": _fg(208),
    "canvas": _fg(208), "svg": _fg(208),
    "iframe": _fg(208), "embed": _fg(208),
    "object": _fg(208),
    # forms
    "form": _fg(204), "input": _fg(210),
    "button": _fg(210), "select": _fg(210),
    "textarea": _fg(210), "label": _fg(216),
    "fieldset": _fg(216), "legend": _fg(216),
    "datalist": _fg(210), "output": _fg(210),
    "progress": _fg(210), "meter": _fg(210),
    "option": _fg(210), "optgroup": _fg(210),
    # lists
    "ul": _fg(183), "ol": _fg(183),
    "li": _fg(189), "dl": _fg(183),
    "dt": _fg(189), "dd": _fg(189),
    # tables
    "table": _fg(178), "caption": _fg(184),
    "colgroup": _fg(184), "col": _fg(184),
    "thead": _fg(184), "tbody": _fg(190),
    "tfoot": _fg(184), "tr": _fg(192),
    "th": _fg(196), "td": _fg(202),
    # semantic / misc
    "template": _fg(240),
    "slot": _fg(240),
}

# Colours for everything that is not a known tag name.
DEFAULT_TAG_COLOR = _fg(153)
COMMENT_COLOR = _fg(238)
TEXT_COLOR = _fg(242)
ATTR_KEY_COLOR = _fg(180)
ATTR_VAL_COLOR = _fg(222)
ERROR_COLOR = _fg(196)
WARN_COLOR = _fg(214)

# Greyscale ramp for the tree guide lines; indexed by depth modulo its length.
LEVEL_COLORS = [_fg(shade) for shade in (240, 244, 248, 252)]
102
+
103
# Tree-drawing segments. All four are exactly four columns wide: one segment
# is emitted per ancestor level, so a narrower pipe/space segment would shift
# every nested branch glyph out from under its parent's vertical line.
TREE_BRANCH = "├── "
TREE_LAST = "└── "
TREE_PIPE = "│   "
TREE_SPACE = "    "

# Parser backend names accepted by HtmlTree and the --parser CLI option.
VALID_PARSERS = frozenset({"html.parser", "lxml", "html5lib", "lxml-xml"})
109
+
110
+
111
+ # ─── Stats ───────────────────────────────────────────────────────────────────
112
+
113
@dataclass
class TreeStats:
    """Counters collected while a tree is being rendered.

    ``tag_counts`` maps a tag name to the number of times it was recorded;
    the remaining fields are plain running totals.
    """

    total_tags: int = 0
    total_text_nodes: int = 0
    total_comments: int = 0
    max_depth_seen: int = 0
    tag_counts: dict = field(default_factory=dict)

    def reset(self) -> None:
        """Zero every counter; ``tag_counts`` is emptied in place."""
        self.total_tags = self.total_text_nodes = 0
        self.total_comments = self.max_depth_seen = 0
        self.tag_counts.clear()

    def record_tag(self, name: str, depth: int) -> None:
        """Count one tag called *name* seen at nesting level *depth*."""
        self.total_tags += 1
        try:
            self.tag_counts[name] += 1
        except KeyError:
            self.tag_counts[name] = 1
        self.max_depth_seen = max(self.max_depth_seen, depth)

    def record_text(self) -> None:
        """Count one rendered text node."""
        self.total_text_nodes += 1

    def record_comment(self) -> None:
        """Count one rendered comment node."""
        self.total_comments += 1
140
+
141
+
142
+ # ─── Main class ──────────────────────────────────────────────────────────────
143
+
144
class HtmlTree:
    """
    Render an HTML document (or fragment) as a colorized ASCII tree.

    Parameters
    ----------
    html : str
        Raw HTML string to parse and display.
    max_depth : int | None
        Maximum nesting depth to display (0 = root only; None = unlimited).
        Negative values are treated as 0.
    show_text : bool
        Show text nodes in the tree (default True).
    show_comments : bool
        Show HTML comment nodes in the tree (default False).
    show_attrs : list[str] | bool
        * ``True``  – show all attributes (default)
        * ``False`` – hide all attributes
        * list/set/tuple of str – show only the named attributes
    parser : str
        BeautifulSoup parser backend; must be one of ``VALID_PARSERS``
        (``"html.parser"``, ``"lxml"``, ``"html5lib"``, ``"lxml-xml"``).
    no_color : bool
        Disable ANSI color codes. When neither ``no_color`` nor
        ``force_color`` is given, colors are also switched off if
        ``sys.stdout`` is not a TTY or the ``NO_COLOR`` environment variable
        is non-empty; a non-empty ``FORCE_COLOR`` switches them back on.
    force_color : bool
        Skip the TTY / environment checks and keep colors enabled
        (an explicit ``no_color=True`` still wins).
    text_limit : int
        Maximum characters shown per text node before truncation (min 1).
    attr_limit : int
        Maximum characters shown per attribute *value* before truncation (min 1).

    Raises
    ------
    TypeError
        If ``html`` is not a ``str`` or ``show_attrs`` has an unsupported type.
    ValueError
        If ``parser`` is unknown or the parser backend fails on the input.

    Examples
    --------
    >>> from htmltree import HtmlTree
    >>> tree = HtmlTree(open("index.html").read(), max_depth=3)
    >>> tree.print()

    >>> output = tree.render(root_selector="#main")

    >>> for line in tree.iter_lines():
    ...     print(line)
    """

    def __init__(
        self,
        html: str,
        *,
        max_depth: Optional[int] = None,
        show_text: bool = True,
        show_comments: bool = False,
        show_attrs: Union[bool, Sequence[str]] = True,
        parser: str = "html.parser",
        no_color: bool = False,
        force_color: bool = False,
        text_limit: int = 60,
        attr_limit: int = 80,
    ) -> None:
        # ── validate / normalize inputs ──────────────────────────────────────
        if not isinstance(html, str):
            raise TypeError(f"html must be str, got {type(html).__name__}")

        if max_depth is not None:
            max_depth = max(0, int(max_depth))  # clamp negatives to 0

        if parser not in VALID_PARSERS:
            raise ValueError(
                f"Unknown parser {parser!r}. Valid choices: {sorted(VALID_PARSERS)}"
            )

        text_limit = max(1, int(text_limit))
        attr_limit = max(1, int(attr_limit))

        # Normalize show_attrs to bool | frozenset[str]. A bare str is
        # rejected on purpose: it would otherwise be split into characters.
        if isinstance(show_attrs, bool):
            _show_attrs: Union[bool, frozenset] = show_attrs
        elif isinstance(show_attrs, (list, tuple, set, frozenset)):
            _show_attrs = frozenset(str(a) for a in show_attrs)
        else:
            raise TypeError(
                f"show_attrs must be bool or a sequence of str, "
                f"got {type(show_attrs).__name__}"
            )

        # Auto-detect color support only when the caller expressed no
        # preference either way.
        _no_color = no_color
        if not force_color and not no_color:
            if not sys.stdout.isatty():
                _no_color = True
            # Also respect NO_COLOR env-var (https://no-color.org/)
            if os.environ.get("NO_COLOR", ""):
                _no_color = True
            # FORCE_COLOR is checked last, so it overrides both tests above.
            if os.environ.get("FORCE_COLOR", ""):
                _no_color = False

        self.max_depth = max_depth
        self.show_text = bool(show_text)
        self.show_comments = bool(show_comments)
        self._show_attrs = _show_attrs
        self.no_color = _no_color
        self.text_limit = text_limit
        self.attr_limit = attr_limit
        self.stats = TreeStats()

        try:
            self.soup = BeautifulSoup(html, parser)
        except Exception as exc:
            raise ValueError(
                f"Failed to parse HTML with parser {parser!r}: {exc}"
            ) from exc

    # ── Internal color helpers ───────────────────────────────────────────────

    def _c(self, code: str, text: str) -> str:
        """Wrap *text* in the ANSI *code* and a reset, unless colors are off."""
        if self.no_color:
            return text
        return f"{code}{text}{RESET}"

    def _tag_color(self, name: str) -> str:
        """Return the color code for tag *name* (case-insensitive lookup)."""
        return TAG_COLORS.get(name.lower(), DEFAULT_TAG_COLOR)

    def _pipe_color(self, depth: int) -> str:
        """Return the guide-line color for *depth*, cycling through LEVEL_COLORS."""
        return LEVEL_COLORS[depth % len(LEVEL_COLORS)]

    # ── Attribute formatting ─────────────────────────────────────────────────

    def _fmt_attrs(self, tag: Tag) -> str:
        """
        Format the attributes of *tag* as `` key="value" …`` (leading space
        included), honoring the ``show_attrs`` filter. Returns ``""`` when
        nothing is to be shown.
        """
        if not tag.attrs or self._show_attrs is False:
            return ""
        items = list(tag.attrs.items())
        if isinstance(self._show_attrs, frozenset):
            items = [(k, v) for k, v in items if k in self._show_attrs]
        if not items:
            return ""
        parts: List[str] = []
        for k, v in items:
            # List/tuple attribute values are shown space-separated.
            if isinstance(v, (list, tuple)):
                v = " ".join(str(x) for x in v)
            else:
                v = str(v)
            # Truncate huge attribute values (e.g. inline base64 images)
            if len(v) > self.attr_limit:
                v = v[: self.attr_limit] + "…"
            key = self._c(ATTR_KEY_COLOR, str(k))
            val = self._c(ATTR_VAL_COLOR, f'"{v}"')
            parts.append(f"{key}={val}")
        return " " + " ".join(parts)

    # ── Prefix / guide-line building ─────────────────────────────────────────

    def _build_prefix(self, indent_guide: List[bool], is_last: bool) -> str:
        """
        Build the tree-art prefix string for a node.

        indent_guide : list[bool]
            One entry per ancestor level; True = draw a vertical pipe,
            False = draw whitespace.
        is_last : bool
            Whether this node is the last sibling at its level.
        """
        parts: List[str] = []
        for depth_idx, has_pipe in enumerate(indent_guide):
            ch = TREE_PIPE if has_pipe else TREE_SPACE
            parts.append(self._c(self._pipe_color(depth_idx), ch))
        branch = TREE_LAST if is_last else TREE_BRANCH
        # len(parts) == len(indent_guide): the branch takes the next level's color.
        parts.append(self._c(self._pipe_color(len(parts)), branch))
        return "".join(parts)

    # ── Node line renderers ──────────────────────────────────────────────────

    def _render_tag_line(
        self, tag: Tag, prefix: str, depth: int, child_count: int
    ) -> str:
        """
        Render one tag line: ``<name>``, its attributes, and a
        ``[ L<depth> <n>ch ]`` badge (``empty`` when *child_count* is 0).
        """
        name = self._c(BOLD + self._tag_color(tag.name), f"<{tag.name}>")
        attrs = self._fmt_attrs(tag)
        badge_parts = [self._c(DIM, "["), self._c("\033[38;5;245m", f"L{depth}")]
        if child_count > 0:
            badge_parts.append(self._c("\033[38;5;67m", f"{child_count}ch"))
        else:
            badge_parts.append(self._c("\033[38;5;238m", "empty"))
        badge_parts.append(self._c(DIM, "]"))
        badge = " ".join(badge_parts)
        return f"{prefix}{name}{attrs} {badge}"

    def _squeeze(self, raw: str) -> str:
        """Collapse whitespace runs in *raw* and truncate to ``text_limit``."""
        text = " ".join(raw.split())
        if not text:
            return ""
        if len(text) > self.text_limit:
            text = text[: self.text_limit] + "…"
        return text

    def _render_text_line(self, raw: str, prefix: str) -> str:
        """Render a text node as a quoted string; ``""`` for blank text."""
        text = self._squeeze(raw)
        if not text:
            return ""
        return prefix + self._c(TEXT_COLOR, f'"{text}"')

    def _render_comment_line(self, raw: str, prefix: str) -> str:
        """Render a comment node as ``<!-- … -->``; ``""`` for a blank comment."""
        text = self._squeeze(raw)
        if not text:
            return ""
        return prefix + self._c(COMMENT_COLOR, f"<!-- {text} -->")

    # ── Visible-child collection ─────────────────────────────────────────────

    def _visible_children(self, tag: Tag) -> List:
        """
        Return the list of child nodes that will actually be rendered,
        respecting show_text and show_comments settings.

        Blank text and blank comments are left out because their renderers
        emit nothing for them; counting them would produce wrong
        last-sibling glyphs and inflated "hidden" counts.
        """
        result = []
        for child in tag.children:
            # Comment is tested first because it is itself a NavigableString.
            if isinstance(child, Comment):
                if self.show_comments and child.strip():
                    result.append(child)
            elif isinstance(child, NavigableString):
                if self.show_text and child.strip():
                    result.append(child)
            elif isinstance(child, Tag):
                result.append(child)
        return result

    # ── Iterative tree walk (no recursion → no stack overflow) ───────────────

    def _iter_lines(self, roots: List[Tag]) -> Iterator[str]:
        """
        Iterative DFS that yields rendered lines one by one.

        Uses an explicit stack instead of recursion so that arbitrarily deep
        HTML (e.g. 10,000-level nesting from a malformed document) never causes
        a RecursionError. ``self.stats`` is updated as nodes are rendered.

        Stack items: (node, depth, indent_guide, is_last)
        """
        # Push roots in reverse order so the first root is processed first
        stack: List[tuple] = []
        for i, root in enumerate(reversed(roots)):
            is_last_root = (i == 0)  # reversed, so first-iterated = last root
            stack.append((root, 0, [], is_last_root))

        while stack:
            node, depth, indent_guide, is_last = stack.pop()

            # ── depth guard ──────────────────────────────────────────────────
            if self.max_depth is not None and depth > self.max_depth:
                continue

            # Depth-0 roots are printed flush left, without tree art.
            prefix = self._build_prefix(indent_guide, is_last) if depth > 0 else ""

            # ── Comment node ─────────────────────────────────────────────────
            if isinstance(node, Comment):
                if self.show_comments:
                    line = self._render_comment_line(str(node), prefix)
                    if line:
                        self.stats.record_comment()
                        yield line
                continue

            # ── Text node ────────────────────────────────────────────────────
            if isinstance(node, NavigableString):
                if self.show_text:
                    line = self._render_text_line(str(node), prefix)
                    if line:
                        self.stats.record_text()
                        yield line
                continue

            # ── Skip unknown node types ──────────────────────────────────────
            if not isinstance(node, Tag):
                continue

            # ── Tag node ─────────────────────────────────────────────────────
            children = self._visible_children(node)
            tag_children_count = sum(1 for c in children if isinstance(c, Tag))

            self.stats.record_tag(node.name, depth)
            yield self._render_tag_line(node, prefix, depth, tag_children_count)

            # Guide for everything drawn beneath this node: keep a pipe at
            # this level only while the node still has siblings below it.
            child_guide = indent_guide + ([not is_last] if depth > 0 else [])

            # ── At depth limit: show ellipsis for hidden children ────────────
            if self.max_depth is not None and depth == self.max_depth:
                if children:
                    ellipsis_prefix = self._build_prefix(child_guide, True)
                    n = len(children)
                    label = f"… ({n} {'child' if n == 1 else 'children'} hidden)"
                    yield ellipsis_prefix + self._c(COMMENT_COLOR, label)
                continue

            # ── Push children in reverse so first child is processed first ───
            for idx, child in enumerate(reversed(children)):
                child_is_last = (idx == 0)  # reversed
                stack.append((child, depth + 1, child_guide, child_is_last))

    # ── Public API ───────────────────────────────────────────────────────────

    def iter_lines(self, root_selector: Optional[str] = None) -> Iterator[str]:
        """
        Yield rendered tree lines one at a time (memory-efficient for large docs).

        ``self.stats`` is reset when iteration starts. Without a selector the
        ``<html>`` element is the root, or the whole parsed document when
        there is none.

        Parameters
        ----------
        root_selector : str | None
            CSS selector used to pick the sub-tree root(s). Supports any
            selector that BeautifulSoup's ``select()`` understands. If nothing
            matches, a single warning line is yielded instead of a tree.

        Yields
        ------
        str
            One rendered line per call (may include ANSI codes).

        Raises
        ------
        ValueError
            If ``select()`` rejects ``root_selector``.
        """
        self.stats.reset()

        if root_selector:
            try:
                roots = self.soup.select(root_selector)
            except Exception as exc:
                raise ValueError(
                    f"Invalid CSS selector {root_selector!r}: {exc}"
                ) from exc
            if not roots:
                yield self._c(
                    WARN_COLOR, f"⚠ No elements matched selector: {root_selector!r}"
                )
                return
            yield from self._iter_lines(roots)
        else:
            root = self.soup.find("html") or self.soup
            yield from self._iter_lines([root])

    def render(self, root_selector: Optional[str] = None) -> str:
        """
        Return the complete tree as a single string.

        Parameters
        ----------
        root_selector : str | None
            CSS selector for a sub-tree root.

        Returns
        -------
        str
            Rendered tree, lines joined with ``"\\n"`` (may contain ANSI codes
            unless colors are disabled).
        """
        return "\n".join(self.iter_lines(root_selector))

    def summary(self) -> str:
        """
        Return the stats summary block (to be called *after* render/iter_lines).

        Returns
        -------
        str
            A blank line, a rule, the totals line and the five most frequent
            tags, joined with newlines.
        """
        s = self.stats
        top5 = sorted(s.tag_counts.items(), key=lambda x: -x[1])[:5]
        top5_str = ", ".join(f"{t}×{c}" for t, c in top5) or "(none)"
        depth_info = (
            f" (capped at {self.max_depth})" if self.max_depth is not None else ""
        )
        comment_info = f" Comments: {s.total_comments}" if s.total_comments else ""
        lines = [
            "",
            self._c(DIM, "─" * 52),
            self._c(
                "\033[38;5;245m",
                f" Tags: {s.total_tags} "
                f"Text nodes: {s.total_text_nodes}{comment_info} "
                f"Max depth: {s.max_depth_seen}{depth_info}",
            ),
            self._c("\033[38;5;240m", f" Top tags: {top5_str}"),
        ]
        return "\n".join(lines)

    def print(
        self,
        root_selector: Optional[str] = None,
        *,
        show_summary: bool = True,
        file=None,
    ) -> None:
        """
        Print the tree to *file* (default: ``sys.stdout``).

        Parameters
        ----------
        root_selector : str | None
            CSS selector for sub-tree root.
        show_summary : bool
            Whether to print the stats summary after the tree.
        file : file-like | None
            Output destination; defaults to ``sys.stdout``.
        """
        # Test against None explicitly: a file-like object may be falsy and
        # must still be honored as the destination.
        out = sys.stdout if file is None else file
        # Stream line by line — avoids building the whole string in memory
        for line in self.iter_lines(root_selector):
            print(line, file=out)
        if show_summary:
            print(self.summary(), file=out)
@@ -0,0 +1,237 @@
1
+ Metadata-Version: 2.4
2
+ Name: htmltree-view
3
+ Version: 0.2.1
4
+ Summary: Visualize HTML DOM structure as a depth-limited, colorized ASCII tree
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/cumulus13/htmltree
7
+ Project-URL: Repository, https://github.com/cumulus13/htmltree
8
+ Project-URL: Issues, https://github.com/cumulus13/htmltree/issues
9
+ Keywords: html,dom,tree,visualizer,beautifulsoup,cli,debug,structure,ascii
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: Markup :: HTML
22
+ Classifier: Topic :: Utilities
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: beautifulsoup4>=4.12
27
+ Provides-Extra: lxml
28
+ Requires-Dist: lxml; extra == "lxml"
29
+ Provides-Extra: html5lib
30
+ Requires-Dist: html5lib; extra == "html5lib"
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7; extra == "dev"
33
+ Requires-Dist: pytest-cov; extra == "dev"
34
+ Requires-Dist: lxml; extra == "dev"
35
+ Requires-Dist: html5lib; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # htmltree-view
39
+
40
+ > Visualize HTML DOM structure as a **depth-limited, colorized ASCII tree** — like the `tree` command, but for HTML files.
41
+
42
+ ```
43
+ <html> lang="en" [ L0 2ch ]
44
+ ├── <head> [ L1 4ch ]
45
+ │ ├── <meta> charset="utf-8" [ L2 empty ]
46
+ │ ├── <meta> name="viewport" content="width=device-width" [ L2 empty ]
47
+ │ ├── <title> [ L2 empty ]
48
+ │ │ └── "My Page"
49
+ │ └── <link> rel="stylesheet" href="style.css" [ L2 empty ]
50
+ └── <body> [ L1 3ch ]
51
+ ├── <header> [ L2 2ch ]
52
+ │ └── … (2 children hidden)
53
+ ├── <main> id="main-content" [ L2 2ch ]
54
+ │ └── … (2 children hidden)
55
+ └── <footer> [ L2 2ch ]
56
+ └── … (2 children hidden)
57
+
58
+ ────────────────────────────────────────────────────
59
+ Tags: 8 Text nodes: 1 Max depth: 2 (capped at 2)
60
+ Top tags: meta×2, html×1, head×1, title×1, link×1
61
+ ```
62
+
63
+ ## Features
64
+
65
+ - **Depth limiting** — `-d N` stops at level N; truncated sub-trees show a `… (X children hidden)` hint
66
+ - **CSS selector zoom** — `-s "#app"` or `-s "body > main"` focuses any sub-tree
67
+ - **Semantic tag colors** — headings in amber, structural in blue, forms in pink, links in cyan, etc.
68
+ - **Depth-cycling pipe colors** — guide lines change shade per nesting level
69
+ - **`[L3 5ch]` badges** — depth level + direct child-tag count on every node
70
+ - **Text nodes** — quoted inline, with `--text-limit` truncation and whitespace collapsing
71
+ - **Attribute filtering** — `--attrs id class href` shows only the attributes you name; a bare `--attrs` (no names) hides all attributes
72
+ - **Attribute value truncation** — `--attr-limit 80` prevents base64/data-URI blowout
73
+ - **HTML comments** — hidden by default, shown with `--show-comments`
74
+ - **URL fetching** — `htmltree https://example.com -d 3`
75
+ - **stdin pipe** — `curl ... | htmltree -` or `echo '<div/>' | htmltree -`
76
+ - **Output to file** — `-o tree.txt` (auto-disables color)
77
+ - **Auto color detection** — ANSI disabled when stdout is not a TTY; respects `NO_COLOR` / `FORCE_COLOR` env vars
78
+ - **Streaming output** — `iter_lines()` yields one line at a time; never builds the full string unless you ask
79
+ - **No recursion** — iterative DFS walk; handles arbitrarily deep HTML without `RecursionError`
80
+ - **Stats summary** — total tags, text nodes, comments, max depth seen, top-5 tag frequencies
81
+
82
+ ## Install
83
+
84
+ ```bash
85
+ pip install htmltree-view
86
+
87
+ # With faster lxml parser:
88
+ pip install "htmltree-view[lxml]"
89
+
90
+ # With html5lib (most spec-accurate):
91
+ pip install "htmltree-view[html5lib]"
92
+ ```
93
+
94
+ ## CLI
95
+
96
+ ```bash
97
+ # Full tree
98
+ htmltree index.html
99
+
100
+ # Limit depth to 3 levels
101
+ htmltree index.html -d 3
102
+
103
+ # Focus on a CSS-selected sub-tree
104
+ htmltree index.html -s "body > main"
105
+ htmltree index.html -s "#app"
106
+ htmltree index.html -s ".container"
107
+
108
+ # Fetch from URL
109
+ htmltree https://example.com -d 4
110
+
111
+ # Read from stdin
112
+ curl https://example.com | htmltree -
113
+ echo '<div><p>hi</p></div>' | htmltree -
114
+
115
+ # Show only id and class attributes
116
+ htmltree index.html --attrs id class
117
+
118
+ # Hide all attributes
119
+ htmltree index.html --attrs
120
+
121
+ # Hide text nodes (structure only)
122
+ htmltree index.html --no-text
123
+
124
+ # Show HTML comments
125
+ htmltree index.html --show-comments
126
+
127
+ # Truncate text/attr at 40 chars
128
+ htmltree index.html --text-limit 40 --attr-limit 40
129
+
130
+ # Save to file (color auto-disabled)
131
+ htmltree index.html -o structure.txt
132
+
133
+ # Pipe to less with color preserved
134
+ htmltree index.html --force-color | less -R
135
+
136
+ # Use lxml backend (faster)
137
+ htmltree index.html --parser lxml
138
+
139
+ # Plain output (no ANSI)
140
+ htmltree index.html --no-color
141
+ ```
142
+
143
+ ## Python API
144
+
145
+ ```python
146
+ from htmltree import HtmlTree
147
+
148
+ html = open("index.html").read()
149
+
150
+ # Basic usage
151
+ tree = HtmlTree(html)
152
+ tree.print()
153
+
154
+ # Limit depth, filter attributes
155
+ tree = HtmlTree(html, max_depth=3, show_attrs=["id", "class"])
156
+ tree.print()
157
+
158
+ # Zoom into a sub-tree
159
+ tree = HtmlTree(html, max_depth=5, show_text=False)
160
+ tree.print(root_selector="body > main")
161
+
162
+ # Render to string
163
+ tree = HtmlTree(html, max_depth=2, force_color=False)
164
+ output = tree.render(root_selector="body")
165
+ print(output)
166
+
167
+ # Stream line by line (memory-efficient for large pages)
168
+ tree = HtmlTree(html, max_depth=4)
169
+ for line in tree.iter_lines(root_selector="#content"):
170
+ print(line)
171
+
172
+ # Access stats after render
173
+ tree.render()
174
+ print(tree.stats.total_tags)
175
+ print(tree.stats.tag_counts) # dict: tag name → count
176
+ print(tree.stats.max_depth_seen)
177
+ print(tree.stats.total_text_nodes)
178
+ print(tree.stats.total_comments)
179
+ ```
180
+
181
+ ## CLI reference
182
+
183
+ | Flag | Default | Description |
184
+ |------|---------|-------------|
185
+ | `SOURCE` | — | HTML file path, http/https URL, or `-` for stdin |
186
+ | `-d N` / `--depth N` | unlimited | Max depth; negatives clamped to 0 |
187
+ | `-s CSS` / `--selector CSS` | `<html>` | CSS selector for tree root |
188
+ | `--attrs [NAME …]` | all | Attributes to show; no names = hide all |
189
+ | `--no-text` | off | Hide text nodes |
190
+ | `--show-comments` | off | Show HTML comment nodes |
191
+ | `--text-limit N` | 60 | Max chars per text node |
192
+ | `--attr-limit N` | 80 | Max chars per attribute value |
193
+ | `--no-color` | off | Disable ANSI colors |
194
+ | `--force-color` | off | Force colors even when piped |
195
+ | `--no-summary` | off | Suppress stats footer |
196
+ | `-o FILE` / `--output FILE` | stdout | Write to file |
197
+ | `--parser BACKEND` | `html.parser` | `html.parser`, `lxml`, `html5lib` |
198
+ | `--version` | — | Print version and exit |
199
+
200
+ ## Tree legend
201
+
202
+ | Symbol | Meaning |
203
+ |--------|---------|
204
+ | `[L3]` | Node is at depth 3 |
205
+ | `[5ch]` | 5 direct tag children |
206
+ | `[empty]` | No direct tag children (a text child may still be listed) |
207
+ | `"text"` | Text node content (may be truncated) |
208
+ | `<!-- … -->` | HTML comment (with `--show-comments`) |
209
+ | `… (N children hidden)` | Sub-tree cut at depth limit |
210
+
211
+ ## Environment variables
212
+
213
+ | Variable | Effect |
214
+ |----------|--------|
215
+ | `NO_COLOR` | Any non-empty value disables ANSI colors (https://no-color.org/) |
216
+ | `FORCE_COLOR` | Any non-empty value forces ANSI colors even when piped |
217
+
218
+ ## Requirements
219
+
220
+ - Python ≥ 3.8
221
+ - `beautifulsoup4 ≥ 4.12`
222
+ - Optional: `lxml`, `html5lib`
223
+
224
+ ## License
225
+
226
+ [MIT](LICENSE)
227
+
228
+ ## 👤 Author
229
+
230
+ [Hadi Cahyadi](mailto:cumulus13@gmail.com)
231
+
232
+
233
+ [![Buy Me a Coffee](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://www.buymeacoffee.com/cumulus13)
234
+
235
+ [![Donate via Ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/cumulus13)
236
+
237
+ [Support me on Patreon](https://www.patreon.com/cumulus13)
@@ -0,0 +1,9 @@
1
+ htmltree/__init__.py,sha256=UEi0In0BwaPX41D97-F70VoArRwqp8EdUDC7lD9mwPk,724
2
+ htmltree/cli.py,sha256=sJxlN604QPyUF0Fdw43l4vQ3p-1VORvgoD3lPLQqE88,9820
3
+ htmltree/core.py,sha256=qu0pukgXY44dIt7tXEdSGsV9zDfyWuDf4sVNyjknuq4,21881
4
+ htmltree_view-0.2.1.dist-info/licenses/LICENSE,sha256=YUZAmTZXLbSJqc2UabCY7nJH-s_jCegXkhfTpHrOSRU,1068
5
+ htmltree_view-0.2.1.dist-info/METADATA,sha256=_OpIV_-E4NPmk8WpabVESOf_god_9n1laEA6q7HQW8k,8252
6
+ htmltree_view-0.2.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ htmltree_view-0.2.1.dist-info/entry_points.txt,sha256=xaR5gaSXseq1n7CN8hJC-hnCa5COQKh7fPMsILecnE8,47
8
+ htmltree_view-0.2.1.dist-info/top_level.txt,sha256=Fr8AeWjZj8vA2mCqzwVhp02VqBZIywXyKv2wZSnG-ro,9
9
+ htmltree_view-0.2.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ htmltree = htmltree.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Hadi Cahyadi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ htmltree