outliner-cli 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/PKG-INFO +35 -5
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/README.md +34 -4
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/pyproject.toml +1 -1
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/cli.py +45 -11
- outliner_cli-0.3.0/src/outliner/parsers/__init__.py +65 -0
- outliner_cli-0.3.0/src/outliner/parsers/html.py +357 -0
- outliner_cli-0.3.0/src/outliner/parsers/json.py +329 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/markdown.py +13 -14
- outliner_cli-0.3.0/src/outliner/parsers/xml.py +271 -0
- outliner_cli-0.3.0/src/outliner/types.py +30 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/PKG-INFO +35 -5
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/SOURCES.txt +4 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_cli.py +66 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_html.py +171 -38
- outliner_cli-0.3.0/tests/test_json.py +405 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_markdown.py +4 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_parsers.py +41 -0
- outliner_cli-0.3.0/tests/test_xml.py +125 -0
- outliner_cli-0.2.0/src/outliner/parsers/__init__.py +0 -43
- outliner_cli-0.2.0/src/outliner/parsers/html.py +0 -217
- outliner_cli-0.2.0/src/outliner/types.py +0 -19
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/LICENSE +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/setup.cfg +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/__init__.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/asciidoc.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/c.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/clojure.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/csharp.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/go.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/java.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/javascript.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/orgmode.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/perl.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/php.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/python.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/rst.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/ruby.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/rust.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/scala.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/shell.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/swift.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/util.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/parsers/zig.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/dependency_links.txt +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/entry_points.txt +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner_cli.egg-info/top_level.txt +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_asciidoc.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_c.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_clojure.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_csharp.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_go.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_java.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_javascript.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_orgmode.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_perl.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_php.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_python.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_rst.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_ruby.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_rust.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_scala.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_shell.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_swift.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_util.py +0 -0
- {outliner_cli-0.2.0 → outliner_cli-0.3.0}/tests/test_zig.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: outliner-cli
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Print the structural outline of source files for LLM navigation
|
|
5
5
|
Author: Per Cederberg
|
|
6
6
|
License-Expression: MIT
|
|
@@ -27,6 +27,7 @@ outliner-cli [OPTIONS] [FILE...]
|
|
|
27
27
|
| ------------------- | ----------------------------------------------------------------------------- |
|
|
28
28
|
| `-g, --grep EXPR` | Only show items whose signature matches EXPR (case-insensitive) |
|
|
29
29
|
| `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous |
|
|
30
|
+
| `-t, --type LANG` | Only include files of this language (repeatable, accepts name or extension) |
|
|
30
31
|
| `-w, --width COLS` | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
|
|
31
32
|
|
|
32
33
|
Pass a file, a directory (walked recursively), or omit arguments to read stdin.
|
|
@@ -57,7 +58,7 @@ methods) and native-format indentation in the signature (indented for code,
|
|
|
57
58
|
## Installation
|
|
58
59
|
|
|
59
60
|
```sh
|
|
60
|
-
pip install outliner
|
|
61
|
+
pip install outliner-cli
|
|
61
62
|
```
|
|
62
63
|
|
|
63
64
|
## Running
|
|
@@ -79,9 +80,9 @@ uv run pytest
|
|
|
79
80
|
|
|
80
81
|
## Supported Languages
|
|
81
82
|
|
|
82
|
-
AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
|
|
83
|
-
Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
|
|
84
|
-
and Zig.
|
|
83
|
+
AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
|
|
84
|
+
JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
|
|
85
|
+
Rust, Scala, Shell, Swift, XML, and Zig.
|
|
85
86
|
|
|
86
87
|
## Example Use Cases
|
|
87
88
|
|
|
@@ -132,3 +133,32 @@ $ uvx outliner-cli --grep PaymentMethod src/
|
|
|
132
133
|
14,12 def charge(method: PaymentMethod, amount: Decimal) -> Receipt
|
|
133
134
|
88,4 def validate(m: PaymentMethod) -> bool
|
|
134
135
|
```
|
|
136
|
+
|
|
137
|
+
**Inspect a dataset without opening it** — JSON/NDJSON files show a schema
|
|
138
|
+
overview with file size, record count, data types, optionality, and truncated
|
|
139
|
+
sample values:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
$ uvx outliner-cli titanic.json
|
|
143
|
+
$ 163.9 KB · json · array[891]
|
|
144
|
+
.Age float|int? -- 22
|
|
145
|
+
.Cabin str? -- "C85"
|
|
146
|
+
.Embarked str? -- "S"
|
|
147
|
+
.Fare float|int -- 7.25
|
|
148
|
+
.Name str -- "Braund, Mr. Owen Harris"
|
|
149
|
+
.Survived int -- 0
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
XML files show an indented structural outline with XML-native node kinds:
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
$ uvx outliner-cli pubmed26n0001.xml
|
|
156
|
+
/ 195.5 MB · xml · sampled 204K elems
|
|
157
|
+
<PubmedArticleSet> elem
|
|
158
|
+
<PubmedArticle> elem+
|
|
159
|
+
<MedlineCitation> elem
|
|
160
|
+
@Status attr -- "MEDLINE"
|
|
161
|
+
<Article> elem
|
|
162
|
+
<ArticleTitle> text -- "Formate assay in body fluids: applica..."
|
|
163
|
+
<Abstract> elem?
|
|
164
|
+
```
|
|
@@ -14,6 +14,7 @@ outliner-cli [OPTIONS] [FILE...]
|
|
|
14
14
|
| ------------------- | ----------------------------------------------------------------------------- |
|
|
15
15
|
| `-g, --grep EXPR` | Only show items whose signature matches EXPR (case-insensitive) |
|
|
16
16
|
| `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous |
|
|
17
|
+
| `-t, --type LANG` | Only include files of this language (repeatable, accepts name or extension) |
|
|
17
18
|
| `-w, --width COLS` | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
|
|
18
19
|
|
|
19
20
|
Pass a file, a directory (walked recursively), or omit arguments to read stdin.
|
|
@@ -44,7 +45,7 @@ methods) and native-format indentation in the signature (indented for code,
|
|
|
44
45
|
## Installation
|
|
45
46
|
|
|
46
47
|
```sh
|
|
47
|
-
pip install outliner
|
|
48
|
+
pip install outliner-cli
|
|
48
49
|
```
|
|
49
50
|
|
|
50
51
|
## Running
|
|
@@ -66,9 +67,9 @@ uv run pytest
|
|
|
66
67
|
|
|
67
68
|
## Supported Languages
|
|
68
69
|
|
|
69
|
-
AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
|
|
70
|
-
Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
|
|
71
|
-
and Zig.
|
|
70
|
+
AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
|
|
71
|
+
JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
|
|
72
|
+
Rust, Scala, Shell, Swift, XML, and Zig.
|
|
72
73
|
|
|
73
74
|
## Example Use Cases
|
|
74
75
|
|
|
@@ -119,3 +120,32 @@ $ uvx outliner-cli --grep PaymentMethod src/
|
|
|
119
120
|
14,12 def charge(method: PaymentMethod, amount: Decimal) -> Receipt
|
|
120
121
|
88,4 def validate(m: PaymentMethod) -> bool
|
|
121
122
|
```
|
|
123
|
+
|
|
124
|
+
**Inspect a dataset without opening it** — JSON/NDJSON files show a schema
|
|
125
|
+
overview with file size, record count, data types, optionality, and truncated
|
|
126
|
+
sample values:
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
$ uvx outliner-cli titanic.json
|
|
130
|
+
$ 163.9 KB · json · array[891]
|
|
131
|
+
.Age float|int? -- 22
|
|
132
|
+
.Cabin str? -- "C85"
|
|
133
|
+
.Embarked str? -- "S"
|
|
134
|
+
.Fare float|int -- 7.25
|
|
135
|
+
.Name str -- "Braund, Mr. Owen Harris"
|
|
136
|
+
.Survived int -- 0
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
XML files show an indented structural outline with XML-native node kinds:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
$ uvx outliner-cli pubmed26n0001.xml
|
|
143
|
+
/ 195.5 MB · xml · sampled 204K elems
|
|
144
|
+
<PubmedArticleSet> elem
|
|
145
|
+
<PubmedArticle> elem+
|
|
146
|
+
<MedlineCitation> elem
|
|
147
|
+
@Status attr -- "MEDLINE"
|
|
148
|
+
<Article> elem
|
|
149
|
+
<ArticleTitle> text -- "Formate assay in body fluids: applica..."
|
|
150
|
+
<Abstract> elem?
|
|
151
|
+
```
|
|
@@ -10,6 +10,9 @@ import sys
|
|
|
10
10
|
from outliner.parsers import NAMES, EXTENSIONS, detect, outline, syntax
|
|
11
11
|
from outliner.types import OutlineItem
|
|
12
12
|
|
|
13
|
+
_TEXT_CONTROLS = "\n\r\t\f\b"
|
|
14
|
+
_BINARY_THRESHOLD = 0.05
|
|
15
|
+
|
|
13
16
|
|
|
14
17
|
def die(msg: str, code: int = 2) -> None:
|
|
15
18
|
print(f"outliner: {msg}", file=sys.stderr)
|
|
@@ -78,9 +81,47 @@ def _format_items(items: list[OutlineItem], grep: re.Pattern | None, line_width:
|
|
|
78
81
|
items = [it for it in items if grep.search(it.signature)]
|
|
79
82
|
if not items:
|
|
80
83
|
return []
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
return [it.format(
|
|
84
|
+
fmt_width = max(it.fmt_width for it in items)
|
|
85
|
+
fmt_width = max(fmt_width, 3)
|
|
86
|
+
return [it.format(fmt_width, line_width) for it in items]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _looks_binary(head: str) -> bool:
|
|
90
|
+
if "\0" in head:
|
|
91
|
+
return True
|
|
92
|
+
if head:
|
|
93
|
+
controls = sum(1 for ch in head if ord(ch) < 32 and ch not in _TEXT_CONTROLS)
|
|
94
|
+
replaced = head.count("\ufffd")
|
|
95
|
+
return (controls + replaced) / len(head) > _BINARY_THRESHOLD
|
|
96
|
+
return False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _format_size(size_bytes: int) -> str:
|
|
100
|
+
if size_bytes >= 1_000_000_000:
|
|
101
|
+
return f"{size_bytes / 1_000_000_000:.1f} GB"
|
|
102
|
+
if size_bytes >= 1_000_000:
|
|
103
|
+
return f"{size_bytes / 1_000_000:.1f} MB"
|
|
104
|
+
if size_bytes >= 1_000:
|
|
105
|
+
return f"{size_bytes / 1_000:.1f} KB"
|
|
106
|
+
return f"{size_bytes} B"
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _outline_source(src: str, selected: str | None) -> tuple[list[OutlineItem] | None, str | None]:
|
|
110
|
+
if src == "-":
|
|
111
|
+
if selected:
|
|
112
|
+
return outline(selected, sys.stdin), selected
|
|
113
|
+
text = sys.stdin.read()
|
|
114
|
+
match = selected or detect(text)
|
|
115
|
+
return (outline(match, text) if match else None), match
|
|
116
|
+
|
|
117
|
+
with open(src, encoding="utf-8", errors="replace") as fh:
|
|
118
|
+
head = fh.read(4096)
|
|
119
|
+
if _looks_binary(head):
|
|
120
|
+
size = _format_size(os.path.getsize(src))
|
|
121
|
+
return [OutlineItem(locator="binary file", signature=size)], "binary"
|
|
122
|
+
match = selected or guess_syntax(src) or detect(head)
|
|
123
|
+
fh.seek(0)
|
|
124
|
+
return (outline(match, fh) if match else None), match
|
|
84
125
|
|
|
85
126
|
|
|
86
127
|
def main(argv: list[str] | None = None) -> int:
|
|
@@ -143,25 +184,18 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
143
184
|
exit_code = 0
|
|
144
185
|
for src in sources:
|
|
145
186
|
try:
|
|
146
|
-
|
|
147
|
-
text = sys.stdin.read()
|
|
148
|
-
else:
|
|
149
|
-
with open(src, encoding="utf-8", errors="replace") as fh:
|
|
150
|
-
text = fh.read()
|
|
187
|
+
items, match = _outline_source(src, args.syntax)
|
|
151
188
|
except OSError as exc:
|
|
152
189
|
print(f"outliner: {exc}", file=sys.stderr)
|
|
153
190
|
exit_code = 1
|
|
154
191
|
continue
|
|
155
192
|
|
|
156
|
-
match = args.syntax or guess_syntax(src) or detect(text)
|
|
157
|
-
|
|
158
193
|
if match is None:
|
|
159
194
|
print(f"outliner: cannot auto-detect syntax for '{src}'; use --syntax",
|
|
160
195
|
file=sys.stderr)
|
|
161
196
|
exit_code = 2
|
|
162
197
|
continue
|
|
163
198
|
|
|
164
|
-
items = outline(match, text)
|
|
165
199
|
if items is None:
|
|
166
200
|
available = ", ".join(NAMES)
|
|
167
201
|
print(f"outliner: unsupported syntax '{match}'; available: {available}",
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import re
|
|
3
|
+
import types
|
|
4
|
+
from typing import TextIO
|
|
5
|
+
|
|
6
|
+
from ..types import OutlineItem
|
|
7
|
+
from . import (
|
|
8
|
+
python, scala, go, java, rust, swift, c, ruby, php, shell, javascript,
|
|
9
|
+
csharp, perl, zig, clojure, html, asciidoc, orgmode, rst, json, xml,
|
|
10
|
+
markdown,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
_MODULES = {
|
|
14
|
+
mod.SYNTAX: mod
|
|
15
|
+
for mod in globals().values() if (
|
|
16
|
+
isinstance(mod, types.ModuleType)
|
|
17
|
+
and mod.__name__.startswith(f"{__name__}.")
|
|
18
|
+
and hasattr(mod, "SYNTAX")
|
|
19
|
+
and hasattr(mod, "EXTENSIONS")
|
|
20
|
+
)
|
|
21
|
+
}
|
|
22
|
+
NAMES = sorted(_MODULES)
|
|
23
|
+
EXTENSIONS = {ext: syntax for syntax, mod in _MODULES.items() for ext in mod.EXTENSIONS}
|
|
24
|
+
_FRONTMATTER_RE = re.compile(r'\A(?:---\n(?:.*\n){0,98}?---\n|\+\+\+\n(?:.*\n){0,98}?\+\+\+\n)')
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _strip_frontmatter(content: str) -> str:
|
|
28
|
+
m = _FRONTMATTER_RE.match(content)
|
|
29
|
+
return content[m.end():] if m else content
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def syntax(name: str) -> str | None:
|
|
33
|
+
if name in _MODULES:
|
|
34
|
+
return name
|
|
35
|
+
ext = name if name.startswith(".") else "." + name
|
|
36
|
+
return EXTENSIONS.get(ext)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def detect(content: str) -> str | None:
|
|
40
|
+
lines = _strip_frontmatter(content).splitlines()[:100]
|
|
41
|
+
for mod in _MODULES.values():
|
|
42
|
+
if mod.detect(lines):
|
|
43
|
+
return mod.SYNTAX
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def outline(syntax: str, content: str | TextIO) -> list[OutlineItem] | None:
|
|
48
|
+
mod = _MODULES.get(syntax)
|
|
49
|
+
if not mod:
|
|
50
|
+
return None
|
|
51
|
+
elif hasattr(mod, "read"):
|
|
52
|
+
fh = io.StringIO(content) if isinstance(content, str) else content
|
|
53
|
+
return list(mod.read(fh))
|
|
54
|
+
else:
|
|
55
|
+
text = content.read() if hasattr(content, "read") else content
|
|
56
|
+
return _outline_text(mod, text)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _outline_text(mod, content: str) -> list[OutlineItem]:
|
|
60
|
+
m = _FRONTMATTER_RE.match(content)
|
|
61
|
+
if not m:
|
|
62
|
+
return list(mod.parse(content))
|
|
63
|
+
offset = m.group(0).count('\n')
|
|
64
|
+
return [OutlineItem(start=it.start + offset, count=it.count, signature=it.signature)
|
|
65
|
+
for it in mod.parse(content[m.end():])]
|
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""HTML outline parser using Python's tokenizing HTMLParser."""
|
|
2
|
+
|
|
3
|
+
import html
|
|
4
|
+
import re
|
|
5
|
+
from collections.abc import Iterator
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from html.parser import HTMLParser
|
|
8
|
+
|
|
9
|
+
from outliner.types import OutlineItem
|
|
10
|
+
|
|
11
|
+
SYNTAX = "html"
|
|
12
|
+
EXTENSIONS = (".html", ".htm", ".xhtml")
|
|
13
|
+
|
|
14
|
+
_DOCTYPE_RE = re.compile(r'^\s*<!DOCTYPE\s+html', re.IGNORECASE)
|
|
15
|
+
_HTML_TAG_RE = re.compile(r'^\s*<html[\s>]', re.IGNORECASE)
|
|
16
|
+
_WS_RE = re.compile(r'\s+')
|
|
17
|
+
_SENTENCE_RE = re.compile(r'(?<=[.!?])\s+')
|
|
18
|
+
_STRUCTURAL = {"head", "body"}
|
|
19
|
+
_LANDMARKS = {"nav", "main", "article", "section", "header"}
|
|
20
|
+
_OUTLINE_TAGS = _STRUCTURAL | _LANDMARKS
|
|
21
|
+
_CONTENT_TAGS = {"body"} | _LANDMARKS
|
|
22
|
+
_TEXT_SKIP_TAGS = {"script", "style", "svg", "noscript", "template"}
|
|
23
|
+
_EXCERPT_LIMIT = 80
|
|
24
|
+
_BORING_EXCERPTS = {
|
|
25
|
+
"advertisement", "close", "menu", "navigation", "open menu",
|
|
26
|
+
"search", "skip advertisement", "skip to content",
|
|
27
|
+
}
|
|
28
|
+
_VOID_TAGS = {
|
|
29
|
+
"area", "base", "br", "col", "embed", "hr", "img", "input",
|
|
30
|
+
"link", "meta", "param", "source", "track", "wbr",
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
|
|
35
|
+
class _Node:
|
|
36
|
+
tag: str
|
|
37
|
+
start: int
|
|
38
|
+
start_col: int
|
|
39
|
+
attrs: dict[str, str]
|
|
40
|
+
depth: int
|
|
41
|
+
text_parts: list[str] = field(default_factory=list)
|
|
42
|
+
heading_text: str = ""
|
|
43
|
+
end: int | None = None
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def signature(self) -> str:
|
|
47
|
+
return _block_sig(
|
|
48
|
+
self.tag,
|
|
49
|
+
self.attrs,
|
|
50
|
+
self.heading_text or "".join(self.text_parts),
|
|
51
|
+
self.depth,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
@property
|
|
55
|
+
def has_identity(self) -> bool:
|
|
56
|
+
return (
|
|
57
|
+
self.tag in _STRUCTURAL
|
|
58
|
+
or self.tag == "main"
|
|
59
|
+
or bool(self.attrs.get("id"))
|
|
60
|
+
or bool(_clean(self.attrs.get("aria-label", "")))
|
|
61
|
+
or bool(_excerpt(self.heading_text or "".join(self.text_parts)))
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
|
|
66
|
+
class _Heading:
|
|
67
|
+
tag: str
|
|
68
|
+
level: int
|
|
69
|
+
start: int
|
|
70
|
+
start_col: int
|
|
71
|
+
base_depth: int
|
|
72
|
+
context_key: int
|
|
73
|
+
text_parts: list[str] = field(default_factory=list)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class _Parser(HTMLParser):
|
|
77
|
+
def __init__(self, line_count: int):
|
|
78
|
+
super().__init__(convert_charrefs=False)
|
|
79
|
+
self.line_count = line_count
|
|
80
|
+
self.nodes: list[_Node] = []
|
|
81
|
+
self.headings: list[tuple[int, int, int, str]] = []
|
|
82
|
+
self.titles: list[tuple[int, int, OutlineItem]] = []
|
|
83
|
+
self._stack: list[_Node] = []
|
|
84
|
+
self._heading: _Heading | None = None
|
|
85
|
+
self._title: _Heading | None = None
|
|
86
|
+
self._text_skip: list[str] = []
|
|
87
|
+
self._heading_stacks: dict[int, list[tuple[int, int]]] = {}
|
|
88
|
+
|
|
89
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
90
|
+
tag = tag.lower()
|
|
91
|
+
if self._text_skip:
|
|
92
|
+
if tag in _TEXT_SKIP_TAGS:
|
|
93
|
+
self._text_skip.append(tag)
|
|
94
|
+
return
|
|
95
|
+
if tag in _TEXT_SKIP_TAGS:
|
|
96
|
+
self._text_skip.append(tag)
|
|
97
|
+
return
|
|
98
|
+
|
|
99
|
+
line, column = self.getpos()
|
|
100
|
+
attrs_by_name = {name.lower(): value or "" for name, value in attrs}
|
|
101
|
+
if tag in _OUTLINE_TAGS:
|
|
102
|
+
if tag in _STRUCTURAL and self._inside_content():
|
|
103
|
+
return
|
|
104
|
+
node = _Node(
|
|
105
|
+
tag=tag,
|
|
106
|
+
start=line,
|
|
107
|
+
start_col=column,
|
|
108
|
+
attrs=attrs_by_name,
|
|
109
|
+
depth=self._tag_depth(tag),
|
|
110
|
+
)
|
|
111
|
+
self.nodes.append(node)
|
|
112
|
+
if tag not in _VOID_TAGS:
|
|
113
|
+
self._stack.append(node)
|
|
114
|
+
elif tag in _VOID_TAGS:
|
|
115
|
+
return
|
|
116
|
+
|
|
117
|
+
if _is_heading(tag):
|
|
118
|
+
self._heading = _Heading(
|
|
119
|
+
tag=tag,
|
|
120
|
+
level=int(tag[1]),
|
|
121
|
+
start=line,
|
|
122
|
+
start_col=column,
|
|
123
|
+
base_depth=self._heading_base_depth(),
|
|
124
|
+
context_key=self._heading_context_key(),
|
|
125
|
+
)
|
|
126
|
+
elif tag == "title" and self._inside_document_head():
|
|
127
|
+
self._title = _Heading(
|
|
128
|
+
tag=tag,
|
|
129
|
+
level=0,
|
|
130
|
+
start=line,
|
|
131
|
+
start_col=column,
|
|
132
|
+
base_depth=self._tag_depth(tag),
|
|
133
|
+
context_key=0,
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
137
|
+
tag = tag.lower()
|
|
138
|
+
if self._text_skip:
|
|
139
|
+
return
|
|
140
|
+
if tag in _OUTLINE_TAGS:
|
|
141
|
+
if tag in _STRUCTURAL and self._inside_content():
|
|
142
|
+
return
|
|
143
|
+
attrs_by_name = {name.lower(): value or "" for name, value in attrs}
|
|
144
|
+
line, column = self.getpos()
|
|
145
|
+
self.nodes.append(_Node(
|
|
146
|
+
tag=tag,
|
|
147
|
+
start=line,
|
|
148
|
+
start_col=column,
|
|
149
|
+
end=line,
|
|
150
|
+
attrs=attrs_by_name,
|
|
151
|
+
depth=self._tag_depth(tag),
|
|
152
|
+
))
|
|
153
|
+
|
|
154
|
+
def handle_endtag(self, tag: str) -> None:
|
|
155
|
+
tag = tag.lower()
|
|
156
|
+
if self._text_skip:
|
|
157
|
+
if tag == self._text_skip[-1]:
|
|
158
|
+
self._text_skip.pop()
|
|
159
|
+
return
|
|
160
|
+
|
|
161
|
+
line = self.getpos()[0]
|
|
162
|
+
if self._heading and tag == self._heading.tag:
|
|
163
|
+
heading = self._heading
|
|
164
|
+
text = _clean("".join(heading.text_parts))
|
|
165
|
+
depth = self._heading_depth(heading)
|
|
166
|
+
self.headings.append((
|
|
167
|
+
heading.start,
|
|
168
|
+
heading.start_col,
|
|
169
|
+
heading.level,
|
|
170
|
+
f"{' ' * depth}<{heading.tag}>{text}</{heading.tag}>",
|
|
171
|
+
))
|
|
172
|
+
for node in reversed(self._stack):
|
|
173
|
+
if node.tag in _LANDMARKS and not node.heading_text:
|
|
174
|
+
node.heading_text = text
|
|
175
|
+
break
|
|
176
|
+
self._heading = None
|
|
177
|
+
elif self._title and tag == "title":
|
|
178
|
+
title = self._title
|
|
179
|
+
text = _clean("".join(title.text_parts))
|
|
180
|
+
if text:
|
|
181
|
+
self.titles.append((title.start, title.start_col, OutlineItem(
|
|
182
|
+
start=title.start,
|
|
183
|
+
count=max(1, line - title.start + 1),
|
|
184
|
+
signature=f"{' ' * title.base_depth}<title>{text}</title>",
|
|
185
|
+
)))
|
|
186
|
+
self._title = None
|
|
187
|
+
|
|
188
|
+
if tag in _OUTLINE_TAGS:
|
|
189
|
+
for idx in range(len(self._stack) - 1, -1, -1):
|
|
190
|
+
node = self._stack[idx]
|
|
191
|
+
if node.tag == tag:
|
|
192
|
+
node.end = line
|
|
193
|
+
del self._stack[idx:]
|
|
194
|
+
break
|
|
195
|
+
|
|
196
|
+
def handle_data(self, data: str) -> None:
|
|
197
|
+
if self._heading:
|
|
198
|
+
self._heading.text_parts.append(data)
|
|
199
|
+
elif self._title:
|
|
200
|
+
self._title.text_parts.append(data)
|
|
201
|
+
elif not self._text_skip:
|
|
202
|
+
self._add_text(data)
|
|
203
|
+
|
|
204
|
+
def handle_entityref(self, name: str) -> None:
|
|
205
|
+
if self._heading:
|
|
206
|
+
self._heading.text_parts.append(f"&{name};")
|
|
207
|
+
elif self._title:
|
|
208
|
+
self._title.text_parts.append(f"&{name};")
|
|
209
|
+
elif not self._text_skip:
|
|
210
|
+
self._add_text(f"&{name};", glue=True)
|
|
211
|
+
|
|
212
|
+
def handle_charref(self, name: str) -> None:
|
|
213
|
+
if self._heading:
|
|
214
|
+
self._heading.text_parts.append(f"&#{name};")
|
|
215
|
+
elif self._title:
|
|
216
|
+
self._title.text_parts.append(f"&#{name};")
|
|
217
|
+
elif not self._text_skip:
|
|
218
|
+
self._add_text(f"&#{name};", glue=True)
|
|
219
|
+
|
|
220
|
+
def close(self) -> None:
|
|
221
|
+
super().close()
|
|
222
|
+
if self._heading:
|
|
223
|
+
heading = self._heading
|
|
224
|
+
text = _clean("".join(heading.text_parts))
|
|
225
|
+
depth = self._heading_depth(heading)
|
|
226
|
+
self.headings.append((
|
|
227
|
+
heading.start,
|
|
228
|
+
heading.start_col,
|
|
229
|
+
heading.level,
|
|
230
|
+
f"{' ' * depth}<{heading.tag}>{text}</{heading.tag}>",
|
|
231
|
+
))
|
|
232
|
+
self._heading = None
|
|
233
|
+
if self._title:
|
|
234
|
+
title = self._title
|
|
235
|
+
text = _clean("".join(title.text_parts))
|
|
236
|
+
if text:
|
|
237
|
+
self.titles.append((title.start, title.start_col, OutlineItem(
|
|
238
|
+
start=title.start,
|
|
239
|
+
count=max(1, self.line_count - title.start + 1),
|
|
240
|
+
signature=f"{' ' * title.base_depth}<title>{text}</title>",
|
|
241
|
+
)))
|
|
242
|
+
self._title = None
|
|
243
|
+
for node in self._stack:
|
|
244
|
+
node.end = self.line_count
|
|
245
|
+
|
|
246
|
+
def _inside_content(self) -> bool:
|
|
247
|
+
return any(node.tag in _CONTENT_TAGS for node in self._stack)
|
|
248
|
+
|
|
249
|
+
def _inside_document_head(self) -> bool:
|
|
250
|
+
return any(node.tag == "head" for node in self._stack) and not self._inside_content()
|
|
251
|
+
|
|
252
|
+
def _outline_depth(self) -> int:
|
|
253
|
+
return len([
|
|
254
|
+
node for node in self._stack
|
|
255
|
+
if node.has_identity and node.tag not in _STRUCTURAL
|
|
256
|
+
])
|
|
257
|
+
|
|
258
|
+
def _tag_depth(self, tag: str) -> int:
|
|
259
|
+
return 0 if tag in _STRUCTURAL else self._outline_depth() + 1
|
|
260
|
+
|
|
261
|
+
def _heading_context_node(self) -> _Node | None:
|
|
262
|
+
for node in reversed(self._stack):
|
|
263
|
+
if node.tag not in _STRUCTURAL and node.tag in _OUTLINE_TAGS:
|
|
264
|
+
return node
|
|
265
|
+
return None
|
|
266
|
+
|
|
267
|
+
def _heading_base_depth(self) -> int:
|
|
268
|
+
node = self._heading_context_node()
|
|
269
|
+
return node.depth + 1 if node else 1
|
|
270
|
+
|
|
271
|
+
def _heading_context_key(self) -> int:
|
|
272
|
+
node = self._heading_context_node()
|
|
273
|
+
return id(node) if node else 0
|
|
274
|
+
|
|
275
|
+
def _heading_depth(self, heading: _Heading) -> int:
|
|
276
|
+
stack = self._heading_stacks.setdefault(heading.context_key, [])
|
|
277
|
+
while stack and stack[-1][0] >= heading.level:
|
|
278
|
+
stack.pop()
|
|
279
|
+
depth = stack[-1][1] + 1 if stack else heading.base_depth
|
|
280
|
+
stack.append((heading.level, depth))
|
|
281
|
+
return depth
|
|
282
|
+
|
|
283
|
+
def _add_text(self, text: str, glue: bool = False) -> None:
|
|
284
|
+
if not text.strip():
|
|
285
|
+
return
|
|
286
|
+
for node in reversed(self._stack):
|
|
287
|
+
if node.tag in _LANDMARKS:
|
|
288
|
+
if node.text_parts and not (glue or node.text_parts[-1].endswith(";")):
|
|
289
|
+
node.text_parts.append(" ")
|
|
290
|
+
node.text_parts.append(text)
|
|
291
|
+
break
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _is_heading(tag: str) -> bool:
|
|
295
|
+
return len(tag) == 2 and tag[0] == "h" and tag[1] in "123456"
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _clean(text: str) -> str:
|
|
299
|
+
return _WS_RE.sub(" ", html.unescape(text)).strip()
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _block_sig(tag: str, attrs: dict[str, str], fallback_text: str = "", depth: int = 0) -> str:
|
|
303
|
+
ident = f"#{attrs['id']}" if attrs.get("id") else ""
|
|
304
|
+
label = _clean(attrs.get("aria-label", ""))
|
|
305
|
+
label_attr = f' aria-label="{label}"' if label else ""
|
|
306
|
+
excerpt = _excerpt(fallback_text) if tag in _LANDMARKS else ""
|
|
307
|
+
text = excerpt if excerpt and not label else ""
|
|
308
|
+
return f"{' ' * depth}<{tag}{ident}{label_attr}>{text}"
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _excerpt(text: str) -> str:
|
|
312
|
+
text = _clean(text)
|
|
313
|
+
if not text:
|
|
314
|
+
return ""
|
|
315
|
+
first = _SENTENCE_RE.split(text, maxsplit=1)[0]
|
|
316
|
+
if first.lower() in _BORING_EXCERPTS:
|
|
317
|
+
return ""
|
|
318
|
+
return first if len(first) <= _EXCERPT_LIMIT else first[:_EXCERPT_LIMIT - 3].rstrip() + "..."
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _items_from_headings(
|
|
322
|
+
headings: list[tuple[int, int, int, str]],
|
|
323
|
+
line_count: int,
|
|
324
|
+
) -> Iterator[tuple[int, int, OutlineItem]]:
|
|
325
|
+
for idx, (line, column, level, signature) in enumerate(headings):
|
|
326
|
+
end = line_count + 1
|
|
327
|
+
for future_line, _, future_level, _ in headings[idx + 1:]:
|
|
328
|
+
if future_level <= level:
|
|
329
|
+
end = future_line
|
|
330
|
+
break
|
|
331
|
+
yield (line, column, OutlineItem(start=line, count=max(1, end - line), signature=signature))
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def detect(lines: list[str]) -> bool:
|
|
335
|
+
for line in lines[:30]:
|
|
336
|
+
if _DOCTYPE_RE.match(line) or _HTML_TAG_RE.match(line):
|
|
337
|
+
return True
|
|
338
|
+
return False
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def parse(text: str) -> Iterator[OutlineItem]:
|
|
342
|
+
line_count = len(text.splitlines())
|
|
343
|
+
parser = _Parser(line_count)
|
|
344
|
+
parser.feed(text)
|
|
345
|
+
parser.close()
|
|
346
|
+
|
|
347
|
+
block_items = (
|
|
348
|
+
(node.start, node.start_col, OutlineItem(
|
|
349
|
+
start=node.start,
|
|
350
|
+
count=max(1, (node.end or line_count) - node.start + 1),
|
|
351
|
+
signature=node.signature,
|
|
352
|
+
))
|
|
353
|
+
for node in parser.nodes if node.has_identity
|
|
354
|
+
)
|
|
355
|
+
events = [*parser.titles, *_items_from_headings(parser.headings, line_count), *block_items]
|
|
356
|
+
for _, _, item in sorted(events, key=lambda event: (event[0], event[1])):
|
|
357
|
+
yield item
|