decant-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- decant/__init__.py +0 -0
- decant/cli/__init__.py +0 -0
- decant/cli/main.py +158 -0
- decant/core/__init__.py +0 -0
- decant/core/constants.py +65 -0
- decant/core/content_selector.py +77 -0
- decant/core/degradation.py +147 -0
- decant/core/model.py +139 -0
- decant/core/parser.py +1073 -0
- decant/core/renderer.py +578 -0
- decant/core/sanitizer.py +58 -0
- decant/io/__init__.py +0 -0
- decant/io/reader.py +31 -0
- decant/io/writer.py +26 -0
- decant_cli-0.1.0.dist-info/METADATA +63 -0
- decant_cli-0.1.0.dist-info/RECORD +20 -0
- decant_cli-0.1.0.dist-info/WHEEL +5 -0
- decant_cli-0.1.0.dist-info/entry_points.txt +2 -0
- decant_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- decant_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main content selection from sanitized DOM.
|
|
3
|
+
|
|
4
|
+
Implements deterministic content selection: main -> article -> body.
|
|
5
|
+
Also provides mode detection for routing between transform and extract pipelines.
|
|
6
|
+
See decisions.md section 4 for selection rules.
|
|
7
|
+
"""
|
|
8
|
+
from bs4 import BeautifulSoup, Tag
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def detect_mode(
    html: str,
    *,
    script_threshold: int = 10,
    nav_threshold: int = 5,
) -> str:
    """
    Detect whether input HTML needs extraction (boilerplate removal) or
    can go straight to transform (fidelity-first parsing).

    Routing rules (in order):
    1. Force extract if <script> + <iframe> elements >= script_threshold
    2. Force extract if nav/aside/footer/header elements >= nav_threshold
    3. Default to transform

    The default thresholds were derived from analysis of the fixture corpus.
    simple_article.html (clean developer HTML) scores: scripts=0, nav=0
    Real-world pages (Wikipedia, NHS, BBC etc) score well above thresholds.

    Args:
        html: Raw HTML string
        script_threshold: Minimum combined count of <script> and <iframe>
            elements that forces extract mode
        nav_threshold: Minimum combined count of <nav>, <aside>, <footer>
            and <header> elements that forces extract mode

    Returns:
        "transform" or "extract"
    """
    soup = BeautifulSoup(html, "lxml")

    # Iframes are counted together with scripts: both are treated as
    # embedded, non-content markup for routing purposes.
    embedded = len(soup.find_all(["script", "iframe"]))
    if embedded >= script_threshold:
        return "extract"

    # Only scan for page chrome when rule 1 did not already decide.
    chrome = len(soup.find_all(["nav", "aside", "footer", "header"]))
    if chrome >= nav_threshold:
        return "extract"

    return "transform"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def select_main_content(soup: BeautifulSoup) -> Tag:
    """
    Pick the subtree that holds the page's main content.

    Candidates are tried in a fixed priority order and the first hit wins:
    1. First <main> element
    2. First <article> element
    3. <body> element

    Choosing one of these subtrees is what leaves navigation, headers,
    footers, and sidebars out of the result.

    Args:
        soup: BeautifulSoup parsed DOM tree

    Returns:
        Tag object representing the main content subtree

    Raises:
        ValueError: If no body element exists (malformed HTML)
    """
    for tag_name in ("main", "article", "body"):
        candidate = soup.find(tag_name)
        if candidate:
            return candidate

    raise ValueError("No body element found in HTML")
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Degradation rules for unsupported HTML elements.
|
|
3
|
+
|
|
4
|
+
Converts tables, images, forms, and other unsupported elements into
|
|
5
|
+
placeholder model objects. See decisions.md section 7 for rules.
|
|
6
|
+
"""
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from bs4 import Tag
|
|
10
|
+
from decant.core.model import Image, Paragraph, Text, Table, TableRow, TableCell
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _is_simple_table(element: Tag) -> bool:
    """
    Decide whether a table is within the simple-table boundary.

    A table is simple when it has between 1 and 10 rows, contains no
    nested table, has no cell carrying a colspan or rowspan attribute
    value, and holds more than one cell in total.

    Args:
        element: BeautifulSoup Tag for <table>

    Returns:
        True if the table is simple, False otherwise
    """
    # A table inside the table disqualifies it immediately.
    if element.find("table"):
        return False

    table_rows = element.find_all("tr")
    if not 1 <= len(table_rows) <= 10:
        return False

    cell_count = 0
    for tr in table_rows:
        for td in tr.find_all(["td", "th"]):
            # Any truthy span attribute marks the table as complex.
            if td.get("colspan") or td.get("rowspan"):
                return False
            cell_count += 1

    # A single-cell table is not treated as tabular data.
    return cell_count > 1
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def degrade_table(element: Tag) -> Paragraph | Table:
    """
    Turn a <table> into a Table model, or a placeholder if it is complex.

    Tables that pass the simple-table check (<= 10 rows, no
    colspan/rowspan/nesting, > 1 cell) are parsed into a Table model.
    Everything else is replaced by a one-line placeholder reporting the
    row count and the width of the widest row.

    Args:
        element: BeautifulSoup Tag for <table>

    Returns:
        Table for simple tables, Paragraph placeholder for complex
    """
    if _is_simple_table(element):
        return _parse_simple_table(element)

    table_rows = element.find_all("tr")
    # Column count is the widest row; 0 when the table has no rows.
    widest = max(
        (len(tr.find_all(["td", "th"])) for tr in table_rows),
        default=0,
    )

    placeholder = f"[Table omitted - {len(table_rows)} rows, {widest} columns]"
    return Paragraph(inlines=[Text(text=placeholder)])
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _parse_simple_table(element: Tag) -> Table:
    """
    Build a Table model from a simple <table> element.

    Every <tr> is visited in document order; each <td>/<th> inside it
    becomes a TableCell whose inline content comes from parse_inlines.
    <th> cells are flagged as headers. Rows without any cell are dropped.

    Args:
        element: BeautifulSoup Tag for <table>

    Returns:
        Table holding one TableRow per non-empty <tr>
    """
    # Imported at call time rather than module level -- presumably to avoid
    # a circular import with decant.core.parser (confirm before hoisting).
    from decant.core.parser import parse_inlines

    parsed_rows: list[TableRow] = []
    for row in element.find_all("tr"):
        row_cells = [
            TableCell(inlines=parse_inlines(td), is_header=(td.name == "th"))
            for td in row.find_all(["td", "th"])
        ]
        if row_cells:
            parsed_rows.append(TableRow(cells=row_cells))

    return Table(rows=parsed_rows)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def degrade_image(element: Tag) -> Image | Text:
    """
    Preserve image when src is http/https, otherwise placeholder.

    Per decisions.md section 7: images with external URLs are rendered
    as <img> tags. Images without valid src degrade to WARN placeholder.
    A src that cannot be parsed as a URL is treated the same as a
    missing src.

    Args:
        element: BeautifulSoup Tag for <img>

    Returns:
        Image when src is http/https, Text placeholder otherwise
    """
    # "or ''" guards against an attribute that is present but has no value.
    src = (element.get("src", "") or "").strip()
    alt = (element.get("alt", "") or "").strip()

    if src:
        try:
            scheme = urlparse(src).scheme.lower()
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. an unbalanced
            # "[" in the host). Input HTML is untrusted, so degrade to the
            # placeholder instead of aborting the whole conversion.
            scheme = ""
        if scheme in ("http", "https"):
            return Image(src=src, alt=alt)

    # Fallback: WARN placeholder
    if alt:
        text = f"[Image: {alt}]"
    else:
        text = "[Image not included]"

    return Text(text=text)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def degrade_form(element: Tag) -> Paragraph:
    """
    Replace a form with a fixed placeholder paragraph.

    Forms are interactive and incompatible with static readable output.
    Will not be supported in future versions.

    Args:
        element: BeautifulSoup Tag for form element (not inspected; the
            placeholder is the same for every form)

    Returns:
        Paragraph with static placeholder
    """
    placeholder = Text(text="[Form omitted]")
    return Paragraph(inlines=[placeholder])
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def degrade_hr() -> Paragraph:
    """
    Represent a horizontal rule as a textual separator paragraph.

    v2 consideration: Could render as actual CSS border for visual clarity.

    Returns:
        Paragraph with separator token
    """
    separator = Text(text="[-]")
    return Paragraph(inlines=[separator])
|
decant/core/model.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Internal document model classes.
|
|
3
|
+
|
|
4
|
+
Represents parsed HTML as a tree of Python objects.
|
|
5
|
+
Parser builds these from DOM. Renderer consumes them to generate HTML.
|
|
6
|
+
No HTML strings or DOM references allowed in the model.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# === Inline elements ===
|
|
13
|
+
|
|
14
|
+
@dataclass
class Text:
    """Inline node holding a run of plain text."""
    # The literal characters of the run.
    text: str
|
|
18
|
+
|
|
19
|
+
@dataclass
class Emphasis:
    """Inline node for emphasized (italic) content; may nest other inlines."""
    # Inline nodes wrapped by the emphasis.
    children: list[Inline]
|
|
23
|
+
|
|
24
|
+
@dataclass
class Strong:
    """Inline node for strongly emphasized (bold) content; may nest other inlines."""
    # Inline nodes wrapped by the strong emphasis.
    children: list[Inline]
|
|
28
|
+
|
|
29
|
+
@dataclass
class Code:
    """Inline code span. Holds plain text only; it cannot contain other inlines."""
    # The code text.
    text: str
|
|
33
|
+
|
|
34
|
+
@dataclass
class Link:
    """Inline hyperlink: a target plus the inline nodes that form its label."""
    # Link target.
    href: str
    # Inline nodes displayed as the link content.
    children: list[Inline]
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True)
class LineBreak:
    """
    Line break (br tag). No fields - marker type only.

    Declared as a dataclass like the other model nodes so that two line
    breaks compare equal; otherwise structurally identical trees that
    contain a line break would compare unequal. frozen=True keeps
    instances hashable, as they were before this became a dataclass.
    """
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Union of every node type that counts as an inline element.
Inline = (
    Text
    | Emphasis
    | Strong
    | Code
    | Link
    | LineBreak
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# === Block elements ===
|
|
50
|
+
|
|
51
|
+
@dataclass
class Paragraph:
    """Block node for a paragraph made up of inline elements."""
    # Inline content of the paragraph, in order.
    inlines: list[Inline]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class ListItem:
    """
    One entry of a list.

    Nested lists hang off the item through the children field.
    """
    # Inline content of the item itself.
    inlines: list[Inline]
    # Nested lists (0..n)
    children: list[ListBlock]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class ListBlock:
    """Block node for a list, either ordered or unordered."""
    # True for an ordered list, False for an unordered one.
    ordered: bool
    # Entries of the list, in order.
    items: list[ListItem]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class Quote:
    """Block node for a blockquote; holds block elements, so quotes can nest."""
    # Block content of the quote, in order.
    blocks: list[Block]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
class Preformatted:
    """Block node for <pre> content, kept verbatim."""
    # The preformatted text.
    text: str
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class Image:
    """Image kept in the output by reference to its external URL. No raw HTML."""
    # Image URL.
    src: str
    # Alternative text.
    alt: str
    # Optional caption; empty string when there is none.
    caption: str = ""
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
class TableCell:
    """One table cell, either a header cell (th) or a data cell (td)."""
    # Inline content of the cell.
    inlines: list[Inline]
    # True for a header (th) cell; defaults to a data cell.
    is_header: bool = False
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass
class TableRow:
    """One row of a table."""
    # Cells of the row, in order.
    cells: list[TableCell]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class Table:
    """Simple data table: rows of cells, each cell carrying inline content."""
    # Rows of the table, in order.
    rows: list[TableRow]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Union of every node type that counts as a block element.
Block = (
    Paragraph
    | ListBlock
    | Quote
    | Preformatted
    | Image
    | Table
)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# === Document structure ===
|
|
119
|
+
|
|
120
|
+
@dataclass
class Heading:
    """Heading of a section: a level plus inline content."""
    # Heading level, 1..6
    level: int
    # Inline content of the heading.
    inlines: list[Inline]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@dataclass
class Section:
    """A section of the document: its heading and the blocks under it."""
    # Heading that introduces the section.
    heading: Heading
    # Block content of the section, in order.
    blocks: list[Block]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class Document:
    """Root of the document model."""
    # Document title.
    title: str
    # Sections of the document, in order.
    sections: list[Section]
    # Source URL of the document; empty string when not set.
    source_url: str = ""
|