decant-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ """
2
+ Main content selection from sanitized DOM.
3
+
4
+ Implements deterministic content selection: main -> article -> body.
5
+ Also provides mode detection for routing between transform and extract pipelines.
6
+ See decisions.md section 4 for selection rules.
7
+ """
8
+ from bs4 import BeautifulSoup, Tag
9
+
10
+
11
def detect_mode(
    html: str,
    *,
    script_threshold: int = 10,
    nav_threshold: int = 5,
) -> str:
    """
    Decide which pipeline a raw HTML document should be routed to.

    Routing rules (evaluated in order):
    1. "extract" if the number of <script> and <iframe> elements combined
       is >= script_threshold
    2. "extract" if the number of <nav>/<aside>/<footer>/<header> elements
       is >= nav_threshold
    3. otherwise "transform"

    The default thresholds (10 and 5) were derived from analysis of the
    fixture corpus: simple_article.html (clean developer HTML) scores
    scripts=0, nav=0, while real-world pages (Wikipedia, NHS, BBC etc)
    score well above the thresholds.

    The document is parsed with BeautifulSoup's "lxml" parser.

    Args:
        html: Raw HTML string
        script_threshold: Minimum combined <script>/<iframe> count that
            forces extraction
        nav_threshold: Minimum nav/aside/footer/header count that forces
            extraction

    Returns:
        "transform" or "extract"
    """
    soup = BeautifulSoup(html, "lxml")

    # <iframe> embeds are counted together with <script> tags.
    embed_count = len(soup.find_all(["script", "iframe"]))
    if embed_count >= script_threshold:
        return "extract"

    chrome_count = len(soup.find_all(["nav", "aside", "footer", "header"]))
    if chrome_count >= nav_threshold:
        return "extract"

    return "transform"
43
+
44
+
45
def select_main_content(soup: BeautifulSoup) -> Tag:
    """
    Pick the subtree that holds the page's main content.

    Candidates are tried in a fixed order and the first hit wins:
    1. First <main> element
    2. First <article> element
    3. <body> element

    Choosing <main> or <article> leaves out any page chrome located
    outside that element; the <body> fallback keeps everything in the body.

    Args:
        soup: BeautifulSoup parsed DOM tree

    Returns:
        Tag object representing the main content subtree

    Raises:
        ValueError: If none of the candidates, including <body>, is found
    """
    for tag_name in ("main", "article", "body"):
        candidate = soup.find(tag_name)
        if candidate:
            return candidate

    raise ValueError("No body element found in HTML")
@@ -0,0 +1,147 @@
1
+ """
2
+ Degradation rules for unsupported HTML elements.
3
+
4
+ Converts tables, images, forms, and other unsupported elements into
5
+ placeholder model objects. See decisions.md section 7 for rules.
6
+ """
7
+ from urllib.parse import urlparse
8
+
9
+ from bs4 import Tag
10
+ from decant.core.model import Image, Paragraph, Text, Table, TableRow, TableCell
11
+
12
+
13
def _is_simple_table(element: Tag) -> bool:
    """
    Report whether a table falls inside the simple-table boundary.

    A table is simple when it has between 1 and 10 rows, contains no
    nested table, has no cell with a colspan or rowspan attribute value,
    and has more than one cell in total.
    """
    # A table nested anywhere inside disqualifies the outer table.
    if element.find("table"):
        return False

    table_rows = element.find_all("tr")
    if not 1 <= len(table_rows) <= 10:
        return False

    all_cells = [
        cell
        for table_row in table_rows
        for cell in table_row.find_all(["td", "th"])
    ]

    # Spanning cells are outside the simple-table boundary.
    if any(cell.get("colspan") or cell.get("rowspan") for cell in all_cells):
        return False

    return len(all_cells) > 1
37
+
38
+
39
def degrade_table(element: Tag) -> Paragraph | Table:
    """
    Turn a <table> into a Table model, or a placeholder if it is complex.

    Tables accepted by _is_simple_table (<= 10 rows, no
    colspan/rowspan/nesting, > 1 cell) are parsed into a Table. Anything
    else becomes a one-line placeholder paragraph reporting the row count
    and the widest row's cell count.

    Args:
        element: BeautifulSoup Tag for <table>

    Returns:
        Table for simple tables, Paragraph placeholder for complex
    """
    if _is_simple_table(element):
        return _parse_simple_table(element)

    table_rows = element.find_all("tr")
    # Column count is the cell count of the widest row (0 when no rows).
    widest = max(
        (len(table_row.find_all(["td", "th"])) for table_row in table_rows),
        default=0,
    )

    placeholder = f"[Table omitted - {len(table_rows)} rows, {widest} columns]"
    return Paragraph(inlines=[Text(text=placeholder)])
66
+
67
+
68
def _parse_simple_table(element: Tag) -> Table:
    """
    Build a Table model from a simple <table> element.

    Each <tr> becomes a TableRow and each <td>/<th> inside it a TableCell
    whose inline content comes from parse_inlines. <th> cells are flagged
    as headers. Rows that contain no cells are dropped.
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import with the parser - confirm).
    from decant.core.parser import parse_inlines

    parsed_rows: list[TableRow] = []
    for tr in element.find_all("tr"):
        row_cells = [
            TableCell(inlines=parse_inlines(cell), is_header=(cell.name == "th"))
            for cell in tr.find_all(["td", "th"])
        ]
        if row_cells:
            parsed_rows.append(TableRow(cells=row_cells))

    return Table(rows=parsed_rows)
90
+
91
+
92
def degrade_image(element: Tag) -> Image | Text:
    """
    Preserve image when src is http/https, otherwise placeholder.

    Per decisions.md section 7: images with external URLs are rendered
    as <img> tags. Images without a usable src degrade to a WARN
    placeholder. A src that urlparse cannot parse at all is treated the
    same as a missing src rather than raising.

    Args:
        element: BeautifulSoup Tag for <img>

    Returns:
        Image when src is http/https, Text placeholder otherwise
    """
    src = element.get("src", "").strip()
    alt = element.get("alt", "").strip()

    if src:
        try:
            scheme = urlparse(src).scheme.lower()
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. an unbalanced "["
            # in the host). The input is untrusted HTML, so fall through
            # to the placeholder instead of aborting the conversion.
            scheme = ""
        if scheme in ("http", "https"):
            return Image(src=src, alt=alt)

    # Fallback: WARN placeholder
    text = f"[Image: {alt}]" if alt else "[Image not included]"
    return Text(text=text)
120
+
121
+
122
def degrade_form(element: Tag) -> Paragraph:
    """
    Replace a form with a fixed placeholder paragraph.

    Forms are interactive and incompatible with static readable output;
    support is not planned for future versions. The element itself is
    not inspected.

    Args:
        element: BeautifulSoup Tag for form element

    Returns:
        Paragraph with static placeholder
    """
    placeholder = Text(text="[Form omitted]")
    return Paragraph(inlines=[placeholder])
136
+
137
+
138
def degrade_hr() -> Paragraph:
    """
    Represent a horizontal rule as a textual separator.

    v2 consideration: Could render as actual CSS border for visual clarity.

    Returns:
        Paragraph with separator token
    """
    separator = Text(text="[-]")
    return Paragraph(inlines=[separator])
decant/core/model.py ADDED
@@ -0,0 +1,139 @@
1
+ """
2
+ Internal document model classes.
3
+
4
+ Represents parsed HTML as a tree of Python objects.
5
+ Parser builds these from DOM. Renderer consumes them to generate HTML.
6
+ No HTML strings or DOM references allowed in the model.
7
+ """
8
+ from __future__ import annotations
9
+ from dataclasses import dataclass
10
+
11
+
12
+ # === Inline elements ===
13
+
14
@dataclass
class Text:
    """A run of plain text."""

    # The literal text content.
    text: str
18
+
19
@dataclass
class Emphasis:
    """Italic-style emphasis wrapping other inline nodes."""

    # Nested inline nodes rendered inside the emphasis.
    children: list[Inline]
23
+
24
@dataclass
class Strong:
    """Bold-style strong emphasis wrapping other inline nodes."""

    # Nested inline nodes rendered inside the strong span.
    children: list[Inline]
28
+
29
@dataclass
class Code:
    """Inline code span. Holds plain text only; nothing nests inside it."""

    # The code text.
    text: str
33
+
34
@dataclass
class Link:
    """Hyperlink: a target href plus the inline nodes it wraps."""

    # Link target.
    href: str
    # Inline nodes that make up the link's content.
    children: list[Inline]
39
+
40
@dataclass(frozen=True)
class LineBreak:
    """
    Line break (br tag). No fields - marker type only.

    Declared as a dataclass like the other model nodes so that two
    LineBreak instances compare equal and have a readable repr. It is
    frozen so that instances stay hashable.
    """
43
+
44
+
45
# Union of every inline node type defined above.
Inline = (
    Text
    | Emphasis
    | Strong
    | Code
    | Link
    | LineBreak
)
47
+
48
+
49
+ # === Block elements ===
50
+
51
@dataclass
class Paragraph:
    """A paragraph made up of inline nodes."""

    # Inline content of the paragraph, in order.
    inlines: list[Inline]
55
+
56
+
57
@dataclass
class ListItem:
    """
    One entry of a list.

    Nested lists hang off the children field.
    """

    # Inline content of the item itself.
    inlines: list[Inline]
    # Nested lists (0..n).
    children: list[ListBlock]
66
+
67
+
68
@dataclass
class ListBlock:
    """A list, either ordered or unordered."""

    # True for an ordered list, False for an unordered one.
    ordered: bool
    # The list's entries, in order.
    items: list[ListItem]
73
+
74
+
75
@dataclass
class Quote:
    """Blockquote. Holds block nodes, so quotes can nest recursively."""

    # Block content inside the quote.
    blocks: list[Block]
79
+
80
+
81
@dataclass
class Preformatted:
    """Verbatim text taken from a <pre> element."""

    # The preformatted content, kept as-is.
    text: str
85
+
86
+
87
@dataclass
class Image:
    """An image kept in the output, referenced by external URL. No raw HTML."""

    # Image URL.
    src: str
    # Alternative text.
    alt: str
    # Optional caption; empty when there is none.
    caption: str = ""
93
+
94
+
95
@dataclass
class TableCell:
    """One table cell, either a header (th) or a data cell (td)."""

    # Inline content of the cell.
    inlines: list[Inline]
    # True for a header (th) cell; data (td) is the default.
    is_header: bool = False
100
+
101
+
102
@dataclass
class TableRow:
    """One row of a table."""

    # The row's cells, in order.
    cells: list[TableCell]
106
+
107
+
108
@dataclass
class Table:
    """Simple data table: rows of cells that hold inline content."""

    # The table's rows, in order.
    rows: list[TableRow]
112
+
113
+
114
# Union of every block node type defined above.
Block = (
    Paragraph
    | ListBlock
    | Quote
    | Preformatted
    | Image
    | Table
)
116
+
117
+
118
+ # === Document structure ===
119
+
120
@dataclass
class Heading:
    """A section heading: its level and its inline content."""

    # Heading level, 1..6.
    level: int
    # Inline content of the heading.
    inlines: list[Inline]
125
+
126
+
127
@dataclass
class Section:
    """A document section: one heading followed by its blocks."""

    # The section's heading.
    heading: Heading
    # Block content under the heading.
    blocks: list[Block]
132
+
133
+
134
@dataclass
class Document:
    """Root of the document model."""

    # Document title.
    title: str
    # The document's sections, in order.
    sections: list[Section]
    # Source URL; empty when not given.
    source_url: str = ""