outliner-cli 0.3.0__tar.gz → 0.4.0__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
Files changed (62)
  1. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/PKG-INFO +20 -17
  2. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/README.md +19 -16
  3. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/pyproject.toml +1 -1
  4. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/cli.py +42 -39
  5. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/__init__.py +1 -1
  6. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/html.py +50 -62
  7. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/javascript.py +69 -17
  8. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/json.py +82 -93
  9. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/markdown.py +23 -3
  10. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/util.py +17 -0
  11. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/xml.py +29 -33
  12. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner_cli.egg-info/PKG-INFO +20 -17
  13. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_cli.py +121 -4
  14. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_html.py +35 -0
  15. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_javascript.py +148 -0
  16. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_json.py +90 -3
  17. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_markdown.py +55 -1
  18. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_util.py +18 -0
  19. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_xml.py +56 -2
  20. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/LICENSE +0 -0
  21. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/setup.cfg +0 -0
  22. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/__init__.py +0 -0
  23. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/asciidoc.py +0 -0
  24. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/c.py +0 -0
  25. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/clojure.py +0 -0
  26. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/csharp.py +0 -0
  27. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/go.py +0 -0
  28. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/java.py +0 -0
  29. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/orgmode.py +0 -0
  30. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/perl.py +0 -0
  31. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/php.py +0 -0
  32. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/python.py +0 -0
  33. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/rst.py +0 -0
  34. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/ruby.py +0 -0
  35. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/rust.py +0 -0
  36. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/scala.py +0 -0
  37. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/shell.py +0 -0
  38. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/swift.py +0 -0
  39. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/parsers/zig.py +0 -0
  40. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner/types.py +0 -0
  41. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner_cli.egg-info/SOURCES.txt +0 -0
  42. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner_cli.egg-info/dependency_links.txt +0 -0
  43. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner_cli.egg-info/entry_points.txt +0 -0
  44. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/src/outliner_cli.egg-info/top_level.txt +0 -0
  45. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_asciidoc.py +0 -0
  46. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_c.py +0 -0
  47. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_clojure.py +0 -0
  48. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_csharp.py +0 -0
  49. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_go.py +0 -0
  50. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_java.py +0 -0
  51. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_orgmode.py +0 -0
  52. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_parsers.py +0 -0
  53. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_perl.py +0 -0
  54. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_php.py +0 -0
  55. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_python.py +0 -0
  56. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_rst.py +0 -0
  57. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_ruby.py +0 -0
  58. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_rust.py +0 -0
  59. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_scala.py +0 -0
  60. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_shell.py +0 -0
  61. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_swift.py +0 -0
  62. {outliner_cli-0.3.0 → outliner_cli-0.4.0}/tests/test_zig.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: outliner-cli
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Print the structural outline of source files for LLM navigation
5
5
  Author: Per Cederberg
6
6
  License-Expression: MIT
@@ -13,7 +13,7 @@ Dynamic: license-file
13
13
 
14
14
  # outliner
15
15
 
16
- Print the structural outline of source files — useful declarations and callable
16
+ Print the structural outline of source files — declarations and callable
17
17
  landmarks with line ranges — so an LLM agent (or human) can navigate a file
18
18
  without reading it whole.
19
19
 
@@ -23,17 +23,20 @@ without reading it whole.
23
23
  outliner-cli [OPTIONS] [FILE...]
24
24
  ```
25
25
 
26
- | Option | Description |
27
- | ------------------- | ----------------------------------------------------------------------------- |
28
- | `-g, --grep EXPR` | Only show items whose signature matches EXPR (case-insensitive) |
29
- | `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous |
30
- | `-t, --type LANG` | Only include files of this language (repeatable, accepts name or extension) |
31
- | `-w, --width COLS` | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
26
+ | Option | Description |
27
+ | -------------------- | ---------------------------------------------------- |
28
+ | `-g, --grep EXPR` | Only show items whose signature matches EXPR |
29
+ | `-s, --syntax LANG` | Override syntax auto-detection when ambiguous |
30
+ | `-t, --type LANG` | Only include files of this language (repeatable) |
31
+ | `-w, --width COLS` | Truncate lines (`0`=off, `auto`=fit, default `120`) |
32
+ | `-x, --exclude GLOB` | Exclude files from directory walks (gitignore-style) |
32
33
 
33
34
  Pass a file, a directory (walked recursively), or omit arguments to read stdin.
34
- Use `-` to read stdin explicitly. `--syntax` is only needed when content
35
- auto-detection cannot identify the language (e.g. an ambiguous extensionless
36
- script piped on stdin).
35
+ Use `-` to read stdin explicitly. Directory walks honor `.gitignore` and skip
36
+ hidden directories; all other files are listed, with binary and unrecognized
37
+ files shown as one-line `binary file` / `unsupported file` summaries. `--syntax`
38
+ is only needed when content auto-detection cannot identify the language (e.g. an
39
+ ambiguous extensionless script piped on stdin).
37
40
 
38
41
  ## Output
39
42
 
@@ -47,7 +50,7 @@ Each line: `<start>,<count> <signature>`
47
50
 
48
51
  - `start` — 1-based line number, right-aligned
49
52
  - `count` — number of lines covered by the item (including doc-comments above)
50
- - `signature` — first non-comment line of the declaration; multi-line signatures
53
+ - `signature` — first non-comment line of a declaration; multi-line signatures
51
54
  are merged into one line; lines longer than the output width are truncated
52
55
  with `...`
53
56
 
@@ -80,13 +83,13 @@ uv run pytest
80
83
 
81
84
  ## Supported Languages
82
85
 
83
- AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
84
- JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
85
- Rust, Scala, Shell, Swift, XML, and Zig.
86
+ AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript (incl.
87
+ Svelte, Vue, and Astro components), JSON/NDJSON, Markdown, Org-mode, Perl, PHP,
88
+ Python, reStructuredText, Ruby, Rust, Scala, Shell, Swift, XML, and Zig.
86
89
 
87
90
  ## Example Use Cases
88
91
 
89
- **Structural overview** — Run on a directory to see all declarations across many
92
+ **Structural overview** — Run on a directory to see declarations across all
90
93
  files before reading anything:
91
94
 
92
95
  ```
@@ -159,6 +162,6 @@ $ uvx outliner-cli pubmed26n0001.xml
159
162
  <MedlineCitation> elem
160
163
  @Status attr -- "MEDLINE"
161
164
  <Article> elem
162
- <ArticleTitle> text -- "Formate assay in body fluids: applica..."
165
+ <ArticleTitle> text -- "Formate assay in body fluids..."
163
166
  <Abstract> elem?
164
167
  ```
@@ -1,6 +1,6 @@
1
1
  # outliner
2
2
 
3
- Print the structural outline of source files — useful declarations and callable
3
+ Print the structural outline of source files — declarations and callable
4
4
  landmarks with line ranges — so an LLM agent (or human) can navigate a file
5
5
  without reading it whole.
6
6
 
@@ -10,17 +10,20 @@ without reading it whole.
10
10
  outliner-cli [OPTIONS] [FILE...]
11
11
  ```
12
12
 
13
- | Option | Description |
14
- | ------------------- | ----------------------------------------------------------------------------- |
15
- | `-g, --grep EXPR` | Only show items whose signature matches EXPR (case-insensitive) |
16
- | `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous |
17
- | `-t, --type LANG` | Only include files of this language (repeatable, accepts name or extension) |
18
- | `-w, --width COLS` | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
13
+ | Option | Description |
14
+ | -------------------- | ---------------------------------------------------- |
15
+ | `-g, --grep EXPR` | Only show items whose signature matches EXPR |
16
+ | `-s, --syntax LANG` | Override syntax auto-detection when ambiguous |
17
+ | `-t, --type LANG` | Only include files of this language (repeatable) |
18
+ | `-w, --width COLS` | Truncate lines (`0`=off, `auto`=fit, default `120`) |
19
+ | `-x, --exclude GLOB` | Exclude files from directory walks (gitignore-style) |
19
20
 
20
21
  Pass a file, a directory (walked recursively), or omit arguments to read stdin.
21
- Use `-` to read stdin explicitly. `--syntax` is only needed when content
22
- auto-detection cannot identify the language (e.g. an ambiguous extensionless
23
- script piped on stdin).
22
+ Use `-` to read stdin explicitly. Directory walks honor `.gitignore` and skip
23
+ hidden directories; all other files are listed, with binary and unrecognized
24
+ files shown as one-line `binary file` / `unsupported file` summaries. `--syntax`
25
+ is only needed when content auto-detection cannot identify the language (e.g. an
26
+ ambiguous extensionless script piped on stdin).
24
27
 
25
28
  ## Output
26
29
 
@@ -34,7 +37,7 @@ Each line: `<start>,<count> <signature>`
34
37
 
35
38
  - `start` — 1-based line number, right-aligned
36
39
  - `count` — number of lines covered by the item (including doc-comments above)
37
- - `signature` — first non-comment line of the declaration; multi-line signatures
40
+ - `signature` — first non-comment line of a declaration; multi-line signatures
38
41
  are merged into one line; lines longer than the output width are truncated
39
42
  with `...`
40
43
 
@@ -67,13 +70,13 @@ uv run pytest
67
70
 
68
71
  ## Supported Languages
69
72
 
70
- AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
71
- JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
72
- Rust, Scala, Shell, Swift, XML, and Zig.
73
+ AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript (incl.
74
+ Svelte, Vue, and Astro components), JSON/NDJSON, Markdown, Org-mode, Perl, PHP,
75
+ Python, reStructuredText, Ruby, Rust, Scala, Shell, Swift, XML, and Zig.
73
76
 
74
77
  ## Example Use Cases
75
78
 
76
- **Structural overview** — Run on a directory to see all declarations across many
79
+ **Structural overview** — Run on a directory to see declarations across all
77
80
  files before reading anything:
78
81
 
79
82
  ```
@@ -146,6 +149,6 @@ $ uvx outliner-cli pubmed26n0001.xml
146
149
  <MedlineCitation> elem
147
150
  @Status attr -- "MEDLINE"
148
151
  <Article> elem
149
- <ArticleTitle> text -- "Formate assay in body fluids: applica..."
152
+ <ArticleTitle> text -- "Formate assay in body fluids..."
150
153
  <Abstract> elem?
151
154
  ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "outliner-cli"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "Print the structural outline of source files for LLM navigation"
9
9
  authors = [{name = "Per Cederberg"}]
10
10
  license = "MIT"
@@ -8,6 +8,7 @@ import shutil
8
8
  import sys
9
9
 
10
10
  from outliner.parsers import NAMES, EXTENSIONS, detect, outline, syntax
11
+ from outliner.parsers.util import format_count, format_size
11
12
  from outliner.types import OutlineItem
12
13
 
13
14
  _TEXT_CONTROLS = "\n\r\t\f\b"
@@ -56,22 +57,27 @@ def _is_ignored(name: str, root: str, gi: dict[str, list[str]], is_dir: bool) ->
56
57
  return False
57
58
 
58
59
 
59
- def _expand_sources(sources: list[str], types: set[str] | None = None) -> list[str]:
60
+ def _expand_sources(
61
+ sources: list[str],
62
+ types: set[str] | None = None,
63
+ excludes: list[str] | None = None,
64
+ ) -> list[str]:
60
65
  result = []
61
66
  for src in sources:
62
67
  if src == "-" or not os.path.isdir(src):
63
68
  result.append(src)
64
69
  continue
65
- gi: dict[str, list[str]] = {}
70
+ # CLI excludes behave like a .gitignore in the walk root
71
+ gi: dict[str, list[str]] = {os.path.normpath(src): list(excludes)} if excludes else {}
66
72
  for root, dirs, files in os.walk(src):
67
73
  pats = _load_gitignore(root)
68
74
  if pats:
69
- gi[root] = pats
70
- dirs[:] = sorted(d for d in dirs if not _is_ignored(d, root, gi, True))
75
+ gi[root] = gi.get(root, []) + pats
76
+ dirs[:] = sorted(d for d in dirs
77
+ if not d.startswith(".") and not _is_ignored(d, root, gi, True))
71
78
  for name in sorted(files):
72
- match = guess_syntax(name)
73
- supported = match and (not types or match in types)
74
- if supported and not _is_ignored(name, root, gi, False):
79
+ wanted = not types or guess_syntax(name) in types
80
+ if wanted and not _is_ignored(name, root, gi, False):
75
81
  result.append(os.path.join(root, name))
76
82
  return result
77
83
 
@@ -96,32 +102,40 @@ def _looks_binary(head: str) -> bool:
96
102
  return False
97
103
 
98
104
 
99
- def _format_size(size_bytes: int) -> str:
100
- if size_bytes >= 1_000_000_000:
101
- return f"{size_bytes / 1_000_000_000:.1f} GB"
102
- if size_bytes >= 1_000_000:
103
- return f"{size_bytes / 1_000_000:.1f} MB"
104
- if size_bytes >= 1_000:
105
- return f"{size_bytes / 1_000:.1f} KB"
106
- return f"{size_bytes} B"
105
+ def _unsupported_items(size: int, line_count: int) -> list[OutlineItem]:
106
+ plural = "s" if line_count != 1 else ""
107
+ sig = f"{format_size(size)} \u00b7 {format_count(line_count)} line{plural}"
108
+ return [OutlineItem(locator="unsupported file", signature=sig)]
107
109
 
108
110
 
109
- def _outline_source(src: str, selected: str | None) -> tuple[list[OutlineItem] | None, str | None]:
111
+ def _outline_source(src: str, selected: str | None) -> tuple[list[OutlineItem] | None, str]:
110
112
  if src == "-":
111
113
  if selected:
112
114
  return outline(selected, sys.stdin), selected
113
- text = sys.stdin.read()
114
- match = selected or detect(text)
115
- return (outline(match, text) if match else None), match
116
-
117
- with open(src, encoding="utf-8", errors="replace") as fh:
115
+ text = sys.stdin.read().removeprefix("\ufeff")
116
+ match = detect(text)
117
+ if match:
118
+ return outline(match, text), match
119
+ if not text.strip():
120
+ return [], "unsupported"
121
+ return _unsupported_items(len(text), len(text.splitlines())), "unsupported"
122
+
123
+ with open(src, encoding="utf-8-sig", errors="replace") as fh:
118
124
  head = fh.read(4096)
119
125
  if _looks_binary(head):
120
- size = _format_size(os.path.getsize(src))
126
+ size = format_size(os.path.getsize(src))
121
127
  return [OutlineItem(locator="binary file", signature=size)], "binary"
122
128
  match = selected or guess_syntax(src) or detect(head)
123
- fh.seek(0)
124
- return (outline(match, fh) if match else None), match
129
+ if match:
130
+ fh.seek(0)
131
+ return outline(match, fh), match
132
+ line_count, tail = head.count("\n"), head
133
+ while chunk := fh.read(1 << 20):
134
+ line_count += chunk.count("\n")
135
+ tail = chunk
136
+ if tail and not tail.endswith("\n"):
137
+ line_count += 1
138
+ return _unsupported_items(os.path.getsize(src), line_count), "unsupported"
125
139
 
126
140
 
127
141
  def main(argv: list[str] | None = None) -> int:
@@ -139,6 +153,8 @@ def main(argv: list[str] | None = None) -> int:
139
153
  help="Only include files of this language or extension (repeatable)")
140
154
  ap.add_argument("-w", "--width", metavar="COLS", default="120",
141
155
  help="Truncate output lines to COLS (0=unlimited, auto=terminal width, default=120)")
156
+ ap.add_argument("-x", "--exclude", action="append", metavar="PATTERN",
157
+ help="Exclude matching files from directory walks, like .gitignore (repeatable)")
142
158
  args = ap.parse_args(argv)
143
159
 
144
160
  grep_re: re.Pattern | None = None
@@ -178,31 +194,18 @@ def main(argv: list[str] | None = None) -> int:
178
194
  if sources == ["-"] and sys.stdin.isatty():
179
195
  ap.print_help()
180
196
  return 0
181
- sources = _expand_sources(sources, types)
197
+ sources = _expand_sources(sources, types, args.exclude)
182
198
  multi = len(sources) > 1
183
199
 
184
200
  exit_code = 0
185
201
  for src in sources:
186
202
  try:
187
- items, match = _outline_source(src, args.syntax)
203
+ items, _ = _outline_source(src, args.syntax)
188
204
  except OSError as exc:
189
205
  print(f"outliner: {exc}", file=sys.stderr)
190
206
  exit_code = 1
191
207
  continue
192
208
 
193
- if match is None:
194
- print(f"outliner: cannot auto-detect syntax for '{src}'; use --syntax",
195
- file=sys.stderr)
196
- exit_code = 2
197
- continue
198
-
199
- if items is None:
200
- available = ", ".join(NAMES)
201
- print(f"outliner: unsupported syntax '{match}'; available: {available}",
202
- file=sys.stderr)
203
- exit_code = 2
204
- continue
205
-
206
209
  output_lines = _format_items(items, grep_re, line_width)
207
210
 
208
211
  if output_lines:
@@ -57,7 +57,7 @@ def outline(syntax: str, content: str | TextIO) -> list[OutlineItem] | None:
57
57
 
58
58
 
59
59
  def _outline_text(mod, content: str) -> list[OutlineItem]:
60
- m = _FRONTMATTER_RE.match(content)
60
+ m = _FRONTMATTER_RE.match(content) if getattr(mod, "STRIP_FRONTMATTER", True) else None
61
61
  if not m:
62
62
  return list(mod.parse(content))
63
63
  offset = m.group(0).count('\n')
@@ -25,10 +25,6 @@ _BORING_EXCERPTS = {
25
25
  "advertisement", "close", "menu", "navigation", "open menu",
26
26
  "search", "skip advertisement", "skip to content",
27
27
  }
28
- _VOID_TAGS = {
29
- "area", "base", "br", "col", "embed", "hr", "img", "input",
30
- "link", "meta", "param", "source", "track", "wbr",
31
- }
32
28
 
33
29
 
34
30
  @dataclass
@@ -109,12 +105,10 @@ class _Parser(HTMLParser):
109
105
  depth=self._tag_depth(tag),
110
106
  )
111
107
  self.nodes.append(node)
112
- if tag not in _VOID_TAGS:
113
- self._stack.append(node)
114
- elif tag in _VOID_TAGS:
115
- return
108
+ self._stack.append(node)
116
109
 
117
110
  if _is_heading(tag):
111
+ self._flush_heading()
118
112
  self._heading = _Heading(
119
113
  tag=tag,
120
114
  level=int(tag[1]),
@@ -123,7 +117,7 @@ class _Parser(HTMLParser):
123
117
  base_depth=self._heading_base_depth(),
124
118
  context_key=self._heading_context_key(),
125
119
  )
126
- elif tag == "title" and self._inside_document_head():
120
+ elif tag == "title" and not self._inside_content():
127
121
  self._title = _Heading(
128
122
  tag=tag,
129
123
  level=0,
@@ -160,95 +154,89 @@ class _Parser(HTMLParser):
160
154
 
161
155
  line = self.getpos()[0]
162
156
  if self._heading and tag == self._heading.tag:
163
- heading = self._heading
164
- text = _clean("".join(heading.text_parts))
165
- depth = self._heading_depth(heading)
166
- self.headings.append((
167
- heading.start,
168
- heading.start_col,
169
- heading.level,
170
- f"{' ' * depth}<{heading.tag}>{text}</{heading.tag}>",
171
- ))
172
- for node in reversed(self._stack):
173
- if node.tag in _LANDMARKS and not node.heading_text:
174
- node.heading_text = text
175
- break
176
- self._heading = None
157
+ self._flush_heading()
177
158
  elif self._title and tag == "title":
178
- title = self._title
179
- text = _clean("".join(title.text_parts))
180
- if text:
181
- self.titles.append((title.start, title.start_col, OutlineItem(
182
- start=title.start,
183
- count=max(1, line - title.start + 1),
184
- signature=f"{' ' * title.base_depth}<title>{text}</title>",
185
- )))
186
- self._title = None
159
+ self._flush_title(line)
187
160
 
188
161
  if tag in _OUTLINE_TAGS:
189
162
  for idx in range(len(self._stack) - 1, -1, -1):
190
- node = self._stack[idx]
191
- if node.tag == tag:
192
- node.end = line
163
+ if self._stack[idx].tag == tag:
164
+ for node in self._stack[idx:]:
165
+ node.end = line
193
166
  del self._stack[idx:]
194
167
  break
195
168
 
196
169
  def handle_data(self, data: str) -> None:
170
+ if self._text_skip:
171
+ return
197
172
  if self._heading:
198
173
  self._heading.text_parts.append(data)
199
174
  elif self._title:
200
175
  self._title.text_parts.append(data)
201
- elif not self._text_skip:
176
+ else:
202
177
  self._add_text(data)
203
178
 
204
179
  def handle_entityref(self, name: str) -> None:
180
+ if self._text_skip:
181
+ return
205
182
  if self._heading:
206
183
  self._heading.text_parts.append(f"&{name};")
207
184
  elif self._title:
208
185
  self._title.text_parts.append(f"&{name};")
209
- elif not self._text_skip:
186
+ else:
210
187
  self._add_text(f"&{name};", glue=True)
211
188
 
212
189
  def handle_charref(self, name: str) -> None:
190
+ if self._text_skip:
191
+ return
213
192
  if self._heading:
214
193
  self._heading.text_parts.append(f"&#{name};")
215
194
  elif self._title:
216
195
  self._title.text_parts.append(f"&#{name};")
217
- elif not self._text_skip:
196
+ else:
218
197
  self._add_text(f"&#{name};", glue=True)
219
198
 
220
199
  def close(self) -> None:
221
200
  super().close()
222
- if self._heading:
223
- heading = self._heading
224
- text = _clean("".join(heading.text_parts))
225
- depth = self._heading_depth(heading)
226
- self.headings.append((
227
- heading.start,
228
- heading.start_col,
229
- heading.level,
230
- f"{' ' * depth}<{heading.tag}>{text}</{heading.tag}>",
231
- ))
232
- self._heading = None
233
- if self._title:
234
- title = self._title
235
- text = _clean("".join(title.text_parts))
236
- if text:
237
- self.titles.append((title.start, title.start_col, OutlineItem(
238
- start=title.start,
239
- count=max(1, self.line_count - title.start + 1),
240
- signature=f"{' ' * title.base_depth}<title>{text}</title>",
241
- )))
242
- self._title = None
201
+ self._flush_heading()
202
+ self._flush_title(self.line_count)
243
203
  for node in self._stack:
244
204
  node.end = self.line_count
245
205
 
206
+ def _flush_heading(self) -> None:
207
+ heading = self._heading
208
+ if not heading:
209
+ return
210
+ text = _clean("".join(heading.text_parts))
211
+ depth = self._heading_depth(heading)
212
+ self.headings.append((
213
+ heading.start,
214
+ heading.start_col,
215
+ heading.level,
216
+ f"{' ' * depth}<{heading.tag}>{text}</{heading.tag}>",
217
+ ))
218
+ for node in reversed(self._stack):
219
+ if node.tag in _LANDMARKS and not node.heading_text:
220
+ node.heading_text = text
221
+ break
222
+ self._heading = None
223
+
224
+ def _flush_title(self, end_line: int) -> None:
225
+ title = self._title
226
+ if not title:
227
+ return
228
+ text = _clean("".join(title.text_parts))
229
+ if text:
230
+ self.titles.append((title.start, title.start_col, OutlineItem(
231
+ start=title.start,
232
+ count=max(1, end_line - title.start + 1),
233
+ signature=f"{' ' * title.base_depth}<title>{text}</title>",
234
+ )))
235
+ self._title = None
236
+
246
237
  def _inside_content(self) -> bool:
247
238
  return any(node.tag in _CONTENT_TAGS for node in self._stack)
248
239
 
249
- def _inside_document_head(self) -> bool:
250
- return any(node.tag == "head" for node in self._stack) and not self._inside_content()
251
-
252
240
  def _outline_depth(self) -> int:
253
241
  return len([
254
242
  node for node in self._stack