outliner-cli 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/PKG-INFO +35 -5
  2. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/README.md +34 -4
  3. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/pyproject.toml +1 -1
  4. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/cli.py +45 -11
  5. outliner_cli-0.3.0/src/outliner/parsers/__init__.py +65 -0
  6. outliner_cli-0.3.0/src/outliner/parsers/html.py +357 -0
  7. outliner_cli-0.3.0/src/outliner/parsers/json.py +329 -0
  8. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/markdown.py +13 -14
  9. outliner_cli-0.3.0/src/outliner/parsers/xml.py +271 -0
  10. outliner_cli-0.3.0/src/outliner/types.py +30 -0
  11. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/PKG-INFO +35 -5
  12. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/SOURCES.txt +4 -0
  13. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_cli.py +66 -0
  14. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_html.py +171 -38
  15. outliner_cli-0.3.0/tests/test_json.py +405 -0
  16. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_markdown.py +4 -0
  17. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_parsers.py +41 -0
  18. outliner_cli-0.3.0/tests/test_xml.py +125 -0
  19. outliner_cli-0.2.0/src/outliner/parsers/__init__.py +0 -43
  20. outliner_cli-0.2.0/src/outliner/parsers/html.py +0 -217
  21. outliner_cli-0.2.0/src/outliner/types.py +0 -19
  22. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/LICENSE +0 -0
  23. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/setup.cfg +0 -0
  24. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/__init__.py +0 -0
  25. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/asciidoc.py +0 -0
  26. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/c.py +0 -0
  27. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/clojure.py +0 -0
  28. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/csharp.py +0 -0
  29. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/go.py +0 -0
  30. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/java.py +0 -0
  31. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/javascript.py +0 -0
  32. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/orgmode.py +0 -0
  33. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/perl.py +0 -0
  34. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/php.py +0 -0
  35. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/python.py +0 -0
  36. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/rst.py +0 -0
  37. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/ruby.py +0 -0
  38. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/rust.py +0 -0
  39. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/scala.py +0 -0
  40. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/shell.py +0 -0
  41. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/swift.py +0 -0
  42. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/util.py +0 -0
  43. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/zig.py +0 -0
  44. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/dependency_links.txt +0 -0
  45. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/entry_points.txt +0 -0
  46. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/top_level.txt +0 -0
  47. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_asciidoc.py +0 -0
  48. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_c.py +0 -0
  49. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_clojure.py +0 -0
  50. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_csharp.py +0 -0
  51. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_go.py +0 -0
  52. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_java.py +0 -0
  53. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_javascript.py +0 -0
  54. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_orgmode.py +0 -0
  55. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_perl.py +0 -0
  56. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_php.py +0 -0
  57. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_python.py +0 -0
  58. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_rst.py +0 -0
  59. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_ruby.py +0 -0
  60. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_rust.py +0 -0
  61. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_scala.py +0 -0
  62. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_shell.py +0 -0
  63. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_swift.py +0 -0
  64. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_util.py +0 -0
  65. {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_zig.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: outliner-cli
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Print the structural outline of source files for LLM navigation
5
5
  Author: Per Cederberg
6
6
  License-Expression: MIT
@@ -27,6 +27,7 @@ outliner-cli [OPTIONS] [FILE...]
27
27
  | ------------------- | ----------------------------------------------------------------------------- |
28
28
  | `-g, --grep EXPR` | Only show items whose signature matches EXPR (case-insensitive) |
29
29
  | `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous |
30
+ | `-t, --type LANG` | Only include files of this language (repeatable, accepts name or extension) |
30
31
  | `-w, --width COLS` | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
31
32
 
32
33
  Pass a file, a directory (walked recursively), or omit arguments to read stdin.
@@ -57,7 +58,7 @@ methods) and native-format indentation in the signature (indented for code,
57
58
  ## Installation
58
59
 
59
60
  ```sh
60
- pip install outliner
61
+ pip install outliner-cli
61
62
  ```
62
63
 
63
64
  ## Running
@@ -79,9 +80,9 @@ uv run pytest
79
80
 
80
81
  ## Supported Languages
81
82
 
82
- AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript, Markdown,
83
- Org-mode, Perl, PHP, Python, reStructuredText, Ruby, Rust, Scala, Shell, Swift,
84
- and Zig.
83
+ AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
84
+ JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
85
+ Rust, Scala, Shell, Swift, XML, and Zig.
85
86
 
86
87
  ## Example Use Cases
87
88
 
@@ -132,3 +133,32 @@ $ uvx outliner-cli --grep PaymentMethod src/
132
133
  14,12 def charge(method: PaymentMethod, amount: Decimal) -> Receipt
133
134
  88,4 def validate(m: PaymentMethod) -> bool
134
135
  ```
136
+
137
+ **Inspect a dataset without opening it** — JSON/NDJSON files show a schema
138
+ overview with file size, record count, data types, optionality, and truncated
139
+ sample values:
140
+
141
+ ```
142
+ $ uvx outliner-cli titanic.json
143
+ $ 163.9 KB · json · array[891]
144
+ .Age float|int? -- 22
145
+ .Cabin str? -- "C85"
146
+ .Embarked str? -- "S"
147
+ .Fare float|int -- 7.25
148
+ .Name str -- "Braund, Mr. Owen Harris"
149
+ .Survived int -- 0
150
+ ```
151
+
152
+ XML files show an indented structural outline with XML-native node kinds:
153
+
154
+ ```
155
+ $ uvx outliner-cli pubmed26n0001.xml
156
+ / 195.5 MB · xml · sampled 204K elems
157
+ <PubmedArticleSet> elem
158
+ <PubmedArticle> elem+
159
+ <MedlineCitation> elem
160
+ @Status attr -- "MEDLINE"
161
+ <Article> elem
162
+ <ArticleTitle> text -- "Formate assay in body fluids: applica..."
163
+ <Abstract> elem?
164
+ ```
@@ -14,6 +14,7 @@ outliner-cli [OPTIONS] [FILE...]
14
14
  | ------------------- | ----------------------------------------------------------------------------- |
15
15
  | `-g, --grep EXPR` | Only show items whose signature matches EXPR (case-insensitive) |
16
16
  | `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous |
17
+ | `-t, --type LANG` | Only include files of this language (repeatable, accepts name or extension) |
17
18
  | `-w, --width COLS` | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
18
19
 
19
20
  Pass a file, a directory (walked recursively), or omit arguments to read stdin.
@@ -44,7 +45,7 @@ methods) and native-format indentation in the signature (indented for code,
44
45
  ## Installation
45
46
 
46
47
  ```sh
47
- pip install outliner
48
+ pip install outliner-cli
48
49
  ```
49
50
 
50
51
  ## Running
@@ -66,9 +67,9 @@ uv run pytest
66
67
 
67
68
  ## Supported Languages
68
69
 
69
- AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript, Markdown,
70
- Org-mode, Perl, PHP, Python, reStructuredText, Ruby, Rust, Scala, Shell, Swift,
71
- and Zig.
70
+ AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
71
+ JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
72
+ Rust, Scala, Shell, Swift, XML, and Zig.
72
73
 
73
74
  ## Example Use Cases
74
75
 
@@ -119,3 +120,32 @@ $ uvx outliner-cli --grep PaymentMethod src/
119
120
  14,12 def charge(method: PaymentMethod, amount: Decimal) -> Receipt
120
121
  88,4 def validate(m: PaymentMethod) -> bool
121
122
  ```
123
+
124
+ **Inspect a dataset without opening it** — JSON/NDJSON files show a schema
125
+ overview with file size, record count, data types, optionality, and truncated
126
+ sample values:
127
+
128
+ ```
129
+ $ uvx outliner-cli titanic.json
130
+ $ 163.9 KB · json · array[891]
131
+ .Age float|int? -- 22
132
+ .Cabin str? -- "C85"
133
+ .Embarked str? -- "S"
134
+ .Fare float|int -- 7.25
135
+ .Name str -- "Braund, Mr. Owen Harris"
136
+ .Survived int -- 0
137
+ ```
138
+
139
+ XML files show an indented structural outline with XML-native node kinds:
140
+
141
+ ```
142
+ $ uvx outliner-cli pubmed26n0001.xml
143
+ / 195.5 MB · xml · sampled 204K elems
144
+ <PubmedArticleSet> elem
145
+ <PubmedArticle> elem+
146
+ <MedlineCitation> elem
147
+ @Status attr -- "MEDLINE"
148
+ <Article> elem
149
+ <ArticleTitle> text -- "Formate assay in body fluids: applica..."
150
+ <Abstract> elem?
151
+ ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "outliner-cli"
7
- version = "0.2.0"
7
+ version = "0.3.0"
8
8
  description = "Print the structural outline of source files for LLM navigation"
9
9
  authors = [{name = "Per Cederberg"}]
10
10
  license = "MIT"
@@ -10,6 +10,9 @@ import sys
10
10
  from outliner.parsers import NAMES, EXTENSIONS, detect, outline, syntax
11
11
  from outliner.types import OutlineItem
12
12
 
13
+ _TEXT_CONTROLS = "\n\r\t\f\b"
14
+ _BINARY_THRESHOLD = 0.05
15
+
13
16
 
14
17
  def die(msg: str, code: int = 2) -> None:
15
18
  print(f"outliner: {msg}", file=sys.stderr)
@@ -78,9 +81,47 @@ def _format_items(items: list[OutlineItem], grep: re.Pattern | None, line_width:
78
81
  items = [it for it in items if grep.search(it.signature)]
79
82
  if not items:
80
83
  return []
81
- num_width = max(it.num_width for it in items)
82
- num_width = max(num_width, 3)
83
- return [it.format(num_width, line_width) for it in items]
84
+ fmt_width = max(it.fmt_width for it in items)
85
+ fmt_width = max(fmt_width, 3)
86
+ return [it.format(fmt_width, line_width) for it in items]
87
+
88
+
89
+ def _looks_binary(head: str) -> bool:
90
+ if "\0" in head:
91
+ return True
92
+ if head:
93
+ controls = sum(1 for ch in head if ord(ch) < 32 and ch not in _TEXT_CONTROLS)
94
+ replaced = head.count("\ufffd")
95
+ return (controls + replaced) / len(head) > _BINARY_THRESHOLD
96
+ return False
97
+
98
+
99
+ def _format_size(size_bytes: int) -> str:
100
+ if size_bytes >= 1_000_000_000:
101
+ return f"{size_bytes / 1_000_000_000:.1f} GB"
102
+ if size_bytes >= 1_000_000:
103
+ return f"{size_bytes / 1_000_000:.1f} MB"
104
+ if size_bytes >= 1_000:
105
+ return f"{size_bytes / 1_000:.1f} KB"
106
+ return f"{size_bytes} B"
107
+
108
+
109
def _outline_source(src: str, selected: str | None) -> tuple[list[OutlineItem] | None, str | None]:
    """Outline one source and report which syntax was used.

    ``src`` is a file path, or ``"-"`` for standard input. ``selected`` is an
    explicitly requested syntax, or ``None`` to auto-detect.

    Returns ``(items, syntax)``. ``syntax`` is falsy when nothing could be
    detected, in which case ``items`` is ``None``. A file whose first 4096
    characters look binary yields a single "binary file" item carrying the
    formatted file size, with the syntax reported as ``"binary"``.
    """
    if src == "-":
        if selected:
            # With an explicit syntax the stream is handed over unread.
            return outline(selected, sys.stdin), selected
        text = sys.stdin.read()
        match = detect(text)
        items = outline(match, text) if match else None
        return items, match

    with open(src, encoding="utf-8", errors="replace") as fh:
        head = fh.read(4096)
        if _looks_binary(head):
            size = _format_size(os.path.getsize(src))
            return [OutlineItem(locator="binary file", signature=size)], "binary"
        match = selected or guess_syntax(src) or detect(head)
        # Rewind so the parser sees the file from the start, not after the
        # sniffed prefix.
        fh.seek(0)
        if not match:
            return None, match
        return outline(match, fh), match
84
125
 
85
126
 
86
127
  def main(argv: list[str] | None = None) -> int:
@@ -143,25 +184,18 @@ def main(argv: list[str] | None = None) -> int:
143
184
  exit_code = 0
144
185
  for src in sources:
145
186
  try:
146
- if src == "-":
147
- text = sys.stdin.read()
148
- else:
149
- with open(src, encoding="utf-8", errors="replace") as fh:
150
- text = fh.read()
187
+ items, match = _outline_source(src, args.syntax)
151
188
  except OSError as exc:
152
189
  print(f"outliner: {exc}", file=sys.stderr)
153
190
  exit_code = 1
154
191
  continue
155
192
 
156
- match = args.syntax or guess_syntax(src) or detect(text)
157
-
158
193
  if match is None:
159
194
  print(f"outliner: cannot auto-detect syntax for '{src}'; use --syntax",
160
195
  file=sys.stderr)
161
196
  exit_code = 2
162
197
  continue
163
198
 
164
- items = outline(match, text)
165
199
  if items is None:
166
200
  available = ", ".join(NAMES)
167
201
  print(f"outliner: unsupported syntax '{match}'; available: {available}",
@@ -0,0 +1,65 @@
1
+ import io
2
+ import re
3
+ import types
4
+ from typing import TextIO
5
+
6
+ from ..types import OutlineItem
7
+ from . import (
8
+ python, scala, go, java, rust, swift, c, ruby, php, shell, javascript,
9
+ csharp, perl, zig, clojure, html, asciidoc, orgmode, rst, json, xml,
10
+ markdown,
11
+ )
12
+
13
# Registry of parser modules keyed by their SYNTAX name. A parser is any
# module bound in this namespace that lives under this package and exposes
# both SYNTAX and EXTENSIONS attributes.
_MODULES = {
    mod.SYNTAX: mod
    for mod in globals().values()
    if isinstance(mod, types.ModuleType)
    and mod.__name__.startswith(f"{__name__}.")
    and hasattr(mod, "SYNTAX")
    and hasattr(mod, "EXTENSIONS")
}

# Sorted list of all registered syntax names.
NAMES = sorted(_MODULES)

# Reverse lookup: file extension -> syntax name.
EXTENSIONS = {
    ext: name
    for name, mod in _MODULES.items()
    for ext in mod.EXTENSIONS
}

# Front matter at the very start of the content: a '---' or '+++' fence line,
# at most 98 lines, then the matching closing fence line.
_FRONTMATTER_RE = re.compile(r'\A(?:---\n(?:.*\n){0,98}?---\n|\+\+\+\n(?:.*\n){0,98}?\+\+\+\n)')
25
+
26
+
27
def _strip_frontmatter(content: str) -> str:
    """Return *content* without its leading front-matter block, if it has one."""
    found = _FRONTMATTER_RE.match(content)
    if found is None:
        return content
    return content[found.end():]
30
+
31
+
32
def syntax(name: str) -> str | None:
    """Resolve a syntax name or file extension to a registered syntax name.

    ``name`` may be a syntax name, or an extension with or without the
    leading dot. Returns ``None`` when nothing matches.
    """
    if name in _MODULES:
        return name
    dotted = name if name[:1] == "." else f".{name}"
    return EXTENSIONS.get(dotted)
37
+
38
+
39
def detect(content: str) -> str | None:
    """Guess the syntax of *content* by asking each parser module in turn.

    Front matter is removed first and only the first 100 remaining lines are
    offered to the parsers. The first module whose ``detect`` accepts them
    wins; ``None`` is returned when none does.
    """
    head = _strip_frontmatter(content).splitlines()[:100]
    matches = (mod.SYNTAX for mod in _MODULES.values() if mod.detect(head))
    return next(matches, None)
45
+
46
+
47
def outline(syntax: str, content: str | TextIO) -> list[OutlineItem] | None:
    """Produce the outline of *content* using the parser for *syntax*.

    ``content`` may be a string or a readable text stream. Modules that
    define ``read`` receive a stream (a string is wrapped in ``StringIO``);
    all other modules receive the full text. Returns ``None`` when *syntax*
    is not registered.
    """
    mod = _MODULES.get(syntax)
    if not mod:
        return None
    if hasattr(mod, "read"):
        stream = io.StringIO(content) if isinstance(content, str) else content
        return list(mod.read(stream))
    text = content.read() if hasattr(content, "read") else content
    return _outline_text(mod, text)
57
+
58
+
59
def _outline_text(mod, content: str) -> list[OutlineItem]:
    """Run ``mod.parse`` on *content*, skipping any leading front matter.

    When front matter is present the parser only sees the text after it, and
    every item's ``start`` is shifted by the number of skipped lines so that
    it still refers to the original content.
    """
    found = _FRONTMATTER_RE.match(content)
    if not found:
        return list(mod.parse(content))
    skipped = found.group(0).count('\n')
    body = content[found.end():]
    shifted = []
    for item in mod.parse(body):
        shifted.append(OutlineItem(
            start=item.start + skipped,
            count=item.count,
            signature=item.signature,
        ))
    return shifted
@@ -0,0 +1,357 @@
1
+ """HTML outline parser using Python's tokenizing HTMLParser."""
2
+
3
+ import html
4
+ import re
5
+ from collections.abc import Iterator
6
+ from dataclasses import dataclass, field
7
+ from html.parser import HTMLParser
8
+
9
+ from outliner.types import OutlineItem
10
+
11
+ SYNTAX = "html"
12
+ EXTENSIONS = (".html", ".htm", ".xhtml")
13
+
14
+ _DOCTYPE_RE = re.compile(r'^\s*<!DOCTYPE\s+html', re.IGNORECASE)
15
+ _HTML_TAG_RE = re.compile(r'^\s*<html[\s>]', re.IGNORECASE)
16
+ _WS_RE = re.compile(r'\s+')
17
+ _SENTENCE_RE = re.compile(r'(?<=[.!?])\s+')
18
+ _STRUCTURAL = {"head", "body"}
19
+ _LANDMARKS = {"nav", "main", "article", "section", "header"}
20
+ _OUTLINE_TAGS = _STRUCTURAL | _LANDMARKS
21
+ _CONTENT_TAGS = {"body"} | _LANDMARKS
22
+ _TEXT_SKIP_TAGS = {"script", "style", "svg", "noscript", "template"}
23
+ _EXCERPT_LIMIT = 80
24
+ _BORING_EXCERPTS = {
25
+ "advertisement", "close", "menu", "navigation", "open menu",
26
+ "search", "skip advertisement", "skip to content",
27
+ }
28
+ _VOID_TAGS = {
29
+ "area", "base", "br", "col", "embed", "hr", "img", "input",
30
+ "link", "meta", "param", "source", "track", "wbr",
31
+ }
32
+
33
+
34
+ @dataclass
35
+ class _Node:
36
+ tag: str
37
+ start: int
38
+ start_col: int
39
+ attrs: dict[str, str]
40
+ depth: int
41
+ text_parts: list[str] = field(default_factory=list)
42
+ heading_text: str = ""
43
+ end: int | None = None
44
+
45
+ @property
46
+ def signature(self) -> str:
47
+ return _block_sig(
48
+ self.tag,
49
+ self.attrs,
50
+ self.heading_text or "".join(self.text_parts),
51
+ self.depth,
52
+ )
53
+
54
+ @property
55
+ def has_identity(self) -> bool:
56
+ return (
57
+ self.tag in _STRUCTURAL
58
+ or self.tag == "main"
59
+ or bool(self.attrs.get("id"))
60
+ or bool(_clean(self.attrs.get("aria-label", "")))
61
+ or bool(_excerpt(self.heading_text or "".join(self.text_parts)))
62
+ )
63
+
64
+
65
+ @dataclass
66
+ class _Heading:
67
+ tag: str
68
+ level: int
69
+ start: int
70
+ start_col: int
71
+ base_depth: int
72
+ context_key: int
73
+ text_parts: list[str] = field(default_factory=list)
74
+
75
+
76
class _Parser(HTMLParser):
    """Tokenizing parser that collects outline nodes, headings and titles.

    After ``feed()`` and ``close()``:

    * ``nodes`` holds every recorded structural/landmark element,
    * ``headings`` holds ``(line, column, level, signature)`` tuples,
    * ``titles`` holds ``(line, column, OutlineItem)`` tuples for ``<title>``
      elements found inside ``<head>``.
    """

    def __init__(self, line_count: int):
        # Character references stay unconverted so that handle_entityref and
        # handle_charref are invoked for them.
        super().__init__(convert_charrefs=False)
        self.line_count = line_count
        self.nodes: list[_Node] = []
        self.headings: list[tuple[int, int, int, str]] = []
        self.titles: list[tuple[int, int, OutlineItem]] = []
        # Currently open outline elements, outermost first.
        self._stack: list[_Node] = []
        # Heading / title whose text is currently being collected.
        self._heading: _Heading | None = None
        self._title: _Heading | None = None
        # Open text-skip elements; while non-empty, tags and text are ignored.
        self._text_skip: list[str] = []
        # Per context key: stack of (level, depth) for headings seen so far.
        self._heading_stacks: dict[int, list[tuple[int, int]]] = {}

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        tag = tag.lower()
        if tag in _TEXT_SKIP_TAGS:
            self._text_skip.append(tag)
            return
        if self._text_skip:
            return

        line, column = self.getpos()
        attrs_by_name = {name.lower(): value or "" for name, value in attrs}
        if tag in _OUTLINE_TAGS:
            # A structural tag nested inside content is not recorded.
            if tag in _STRUCTURAL and self._inside_content():
                return
            node = _Node(
                tag=tag,
                start=line,
                start_col=column,
                attrs=attrs_by_name,
                depth=self._tag_depth(tag),
            )
            self.nodes.append(node)
            if tag not in _VOID_TAGS:
                self._stack.append(node)
        elif tag in _VOID_TAGS:
            return

        if _is_heading(tag):
            self._heading = _Heading(
                tag=tag,
                level=int(tag[1]),
                start=line,
                start_col=column,
                base_depth=self._heading_base_depth(),
                context_key=self._heading_context_key(),
            )
        elif tag == "title" and self._inside_document_head():
            self._title = _Heading(
                tag=tag,
                level=0,
                start=line,
                start_col=column,
                base_depth=self._tag_depth(tag),
                context_key=0,
            )

    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        tag = tag.lower()
        if self._text_skip or tag not in _OUTLINE_TAGS:
            return
        if tag in _STRUCTURAL and self._inside_content():
            return
        attrs_by_name = {name.lower(): value or "" for name, value in attrs}
        line, column = self.getpos()
        # A self-closing element starts and ends on the same line and is
        # never pushed onto the open-element stack.
        self.nodes.append(_Node(
            tag=tag,
            start=line,
            start_col=column,
            end=line,
            attrs=attrs_by_name,
            depth=self._tag_depth(tag),
        ))

    def handle_endtag(self, tag: str) -> None:
        tag = tag.lower()
        if self._text_skip:
            # Only the closing tag of the innermost skipped element counts.
            if tag == self._text_skip[-1]:
                self._text_skip.pop()
            return

        line = self.getpos()[0]
        if self._heading is not None and tag == self._heading.tag:
            text = self._record_heading()
            # The nearest open landmark that has no heading text yet adopts
            # this heading's text.
            for node in reversed(self._stack):
                if node.tag in _LANDMARKS and not node.heading_text:
                    node.heading_text = text
                    break
        elif self._title is not None and tag == "title":
            self._record_title(line)

        if tag in _OUTLINE_TAGS:
            # Close the innermost open element with this tag and drop
            # everything that was opened inside it.
            for idx in reversed(range(len(self._stack))):
                candidate = self._stack[idx]
                if candidate.tag == tag:
                    candidate.end = line
                    del self._stack[idx:]
                    break

    def handle_data(self, data: str) -> None:
        self._route_text(data, glue=False)

    def handle_entityref(self, name: str) -> None:
        self._route_text(f"&{name};", glue=True)

    def handle_charref(self, name: str) -> None:
        self._route_text(f"&#{name};", glue=True)

    def close(self) -> None:
        super().close()
        # Flush whatever was still open when the input ended.
        if self._heading is not None:
            self._record_heading()
        if self._title is not None:
            self._record_title(self.line_count)
        for node in self._stack:
            node.end = self.line_count

    def _route_text(self, text: str, glue: bool) -> None:
        """Send a text fragment to the open heading, the open title, or a landmark."""
        if self._heading is not None:
            self._heading.text_parts.append(text)
        elif self._title is not None:
            self._title.text_parts.append(text)
        elif not self._text_skip:
            self._add_text(text, glue=glue)

    def _record_heading(self) -> str:
        """Append the open heading to ``headings``, clear it, return its cleaned text."""
        heading = self._heading
        text = _clean("".join(heading.text_parts))
        depth = self._heading_depth(heading)
        # NOTE(review): the indent unit below is copied as it appears in the
        # reviewed text, where whitespace may have been collapsed -- confirm.
        self.headings.append((
            heading.start,
            heading.start_col,
            heading.level,
            f"{' ' * depth}<{heading.tag}>{text}</{heading.tag}>",
        ))
        self._heading = None
        return text

    def _record_title(self, end_line: int) -> None:
        """Append the open title to ``titles`` if it has text, then clear it."""
        title = self._title
        text = _clean("".join(title.text_parts))
        if text:
            # NOTE(review): indent unit copied as shown -- confirm.
            self.titles.append((title.start, title.start_col, OutlineItem(
                start=title.start,
                count=max(1, end_line - title.start + 1),
                signature=f"{' ' * title.base_depth}<title>{text}</title>",
            )))
        self._title = None

    def _inside_content(self) -> bool:
        for node in self._stack:
            if node.tag in _CONTENT_TAGS:
                return True
        return False

    def _inside_document_head(self) -> bool:
        in_head = any(node.tag == "head" for node in self._stack)
        return in_head and not self._inside_content()

    def _outline_depth(self) -> int:
        # Number of open, identifiable, non-structural elements.
        return sum(
            1 for node in self._stack
            if node.has_identity and node.tag not in _STRUCTURAL
        )

    def _tag_depth(self, tag: str) -> int:
        if tag in _STRUCTURAL:
            return 0
        return self._outline_depth() + 1

    def _heading_context_node(self) -> _Node | None:
        # Innermost open outline element that is not a structural tag.
        for node in reversed(self._stack):
            if node.tag not in _STRUCTURAL and node.tag in _OUTLINE_TAGS:
                return node
        return None

    def _heading_base_depth(self) -> int:
        node = self._heading_context_node()
        return 1 if node is None else node.depth + 1

    def _heading_context_key(self) -> int:
        node = self._heading_context_node()
        return 0 if node is None else id(node)

    def _heading_depth(self, heading: _Heading) -> int:
        """Return the display depth for *heading* and record it for later headings.

        Earlier headings of the same or a deeper level in the same context
        are discarded first. The heading then nests one step below the
        remaining one, or sits at its base depth when none remains.
        """
        seen = self._heading_stacks.setdefault(heading.context_key, [])
        while seen and seen[-1][0] >= heading.level:
            seen.pop()
        depth = seen[-1][1] + 1 if seen else heading.base_depth
        seen.append((heading.level, depth))
        return depth

    def _add_text(self, text: str, glue: bool = False) -> None:
        """Attach non-blank text to the innermost open landmark, if any."""
        if not text.strip():
            return
        target = next(
            (node for node in reversed(self._stack) if node.tag in _LANDMARKS),
            None,
        )
        if target is None:
            return
        parts = target.text_parts
        # A separating space is inserted unless this fragment is glued on or
        # the previous fragment ends with ";".
        if parts and not glue and not parts[-1].endswith(";"):
            parts.append(" ")
        parts.append(text)
292
+
293
+
294
+ def _is_heading(tag: str) -> bool:
295
+ return len(tag) == 2 and tag[0] == "h" and tag[1] in "123456"
296
+
297
+
298
def _clean(text: str) -> str:
    """Decode character references and collapse whitespace runs to single spaces."""
    decoded = html.unescape(text)
    collapsed = _WS_RE.sub(" ", decoded)
    return collapsed.strip()
300
+
301
+
302
def _block_sig(tag: str, attrs: dict[str, str], fallback_text: str = "", depth: int = 0) -> str:
    """Build the outline signature for a block element.

    The result is the indented opening tag, with ``#id`` appended when an id
    is present and an ``aria-label`` attribute when a non-blank label is
    present. For landmark tags without a label, an excerpt of
    *fallback_text* follows the closing ``>``.
    """
    # NOTE(review): the indent unit is copied as it appears in the reviewed
    # text, where whitespace may have been collapsed -- confirm.
    pieces = [' ' * depth, "<", tag]
    if attrs.get("id"):
        pieces.append(f"#{attrs['id']}")
    label = _clean(attrs.get("aria-label", ""))
    if label:
        pieces.append(f' aria-label="{label}"')
    pieces.append(">")
    if not label and tag in _LANDMARKS:
        pieces.append(_excerpt(fallback_text))
    return "".join(pieces)
309
+
310
+
311
def _excerpt(text: str) -> str:
    """Return a short excerpt of *text*: its first sentence, length-limited.

    Returns ``""`` when the cleaned text is empty or when its first sentence
    is one of the ``_BORING_EXCERPTS``. Sentences longer than
    ``_EXCERPT_LIMIT`` are cut and end in ``"..."``.
    """
    cleaned = _clean(text)
    if not cleaned:
        return ""
    sentence = _SENTENCE_RE.split(cleaned, maxsplit=1)[0]
    if sentence.lower() in _BORING_EXCERPTS:
        return ""
    if len(sentence) > _EXCERPT_LIMIT:
        # Reserve three characters for the ellipsis.
        return sentence[:_EXCERPT_LIMIT - 3].rstrip() + "..."
    return sentence
319
+
320
+
321
def _items_from_headings(
    headings: list[tuple[int, int, int, str]],
    line_count: int,
) -> Iterator[tuple[int, int, OutlineItem]]:
    """Turn recorded headings into ``(line, column, OutlineItem)`` events.

    A heading's section runs until the next heading of the same or a
    shallower level, or to the end of the document when there is none. The
    item's ``count`` is the number of lines in that section (at least 1).
    """
    for idx, (line, column, level, signature) in enumerate(headings):
        end = next(
            (
                later_line
                for later_line, _, later_level, _ in headings[idx + 1:]
                if later_level <= level
            ),
            line_count + 1,
        )
        item = OutlineItem(start=line, count=max(1, end - line), signature=signature)
        yield (line, column, item)
332
+
333
+
334
def detect(lines: list[str]) -> bool:
    """Return True if one of the first 30 lines starts an HTML document.

    A line qualifies when it begins (after optional whitespace) with an HTML
    doctype declaration or an opening ``<html`` tag.
    """
    return any(
        _DOCTYPE_RE.match(line) or _HTML_TAG_RE.match(line)
        for line in lines[:30]
    )
339
+
340
+
341
def parse(text: str) -> Iterator[OutlineItem]:
    """Yield the outline items of an HTML document in source order.

    Titles, headings and identifiable block elements are gathered from one
    parser pass and emitted sorted by (line, column) of their start tag.
    """
    line_count = len(text.splitlines())
    parser = _Parser(line_count)
    parser.feed(text)
    parser.close()

    events = list(parser.titles)
    events.extend(_items_from_headings(parser.headings, line_count))
    for node in parser.nodes:
        if not node.has_identity:
            continue
        # Elements that were never closed extend to the end of the document.
        last_line = node.end or line_count
        events.append((node.start, node.start_col, OutlineItem(
            start=node.start,
            count=max(1, last_line - node.start + 1),
            signature=node.signature,
        )))

    events.sort(key=lambda event: (event[0], event[1]))
    for _, _, item in events:
        yield item