decant-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- decant/__init__.py +0 -0
- decant/cli/__init__.py +0 -0
- decant/cli/main.py +158 -0
- decant/core/__init__.py +0 -0
- decant/core/constants.py +65 -0
- decant/core/content_selector.py +77 -0
- decant/core/degradation.py +147 -0
- decant/core/model.py +139 -0
- decant/core/parser.py +1073 -0
- decant/core/renderer.py +578 -0
- decant/core/sanitizer.py +58 -0
- decant/io/__init__.py +0 -0
- decant/io/reader.py +31 -0
- decant/io/writer.py +26 -0
- decant_cli-0.1.0.dist-info/METADATA +63 -0
- decant_cli-0.1.0.dist-info/RECORD +20 -0
- decant_cli-0.1.0.dist-info/WHEEL +5 -0
- decant_cli-0.1.0.dist-info/entry_points.txt +2 -0
- decant_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- decant_cli-0.1.0.dist-info/top_level.txt +1 -0
decant/core/parser.py
ADDED
|
@@ -0,0 +1,1073 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTML to Document model parser.
|
|
3
|
+
|
|
4
|
+
Converts sanitized DOM tree to internal model representation.
|
|
5
|
+
Pipeline: Step 4 (after sanitization and content selection)
|
|
6
|
+
See decisions.md sections 5-8 for parsing rules.
|
|
7
|
+
"""
|
|
8
|
+
import re
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
import trafilatura
|
|
12
|
+
|
|
13
|
+
# ExtractionMode controls which Trafilatura parameter set is used.
|
|
14
|
+
# "baseline" is and must remain the current production behavior.
|
|
15
|
+
ExtractionMode = Literal["baseline", "precision", "recall"]
|
|
16
|
+
|
|
17
|
+
from bs4 import BeautifulSoup, Tag, NavigableString
|
|
18
|
+
|
|
19
|
+
from decant.core.model import (
|
|
20
|
+
Document, Section, Heading, Block, Inline,
|
|
21
|
+
Paragraph, ListBlock, ListItem, Quote, Preformatted, Image, Table,
|
|
22
|
+
Text, Emphasis, Strong, Code, Link, LineBreak
|
|
23
|
+
)
|
|
24
|
+
from decant.core.sanitizer import sanitize
|
|
25
|
+
from decant.core.content_selector import select_main_content
|
|
26
|
+
from decant.core.degradation import (
|
|
27
|
+
degrade_table, degrade_image, degrade_form, degrade_hr
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ValidationError(Exception):
    """Signals that the input HTML lacks the semantic structure the parser requires."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def harvest_captions(html: str) -> dict[str, str]:
    """
    Scan raw HTML before Trafilatura runs and build a map of
    img src URL -> figcaption plain text.

    Rules:
    - Only considers <figure> elements containing exactly one <img>
      with an http/https src and a non-empty <figcaption>.
    - Ambiguous figures (multiple http images, no http image, data URIs)
      are skipped.
    - A src that urlparse() cannot parse is treated as non-http and
      ignored rather than raising.

    Args:
        html: Raw HTML string.

    Returns:
        Dict keyed by the whitespace-stripped img src, valued by the
        figcaption text (child strings joined with single spaces).
    """
    from urllib.parse import urlparse

    def _http_src(img) -> str | None:
        """Return the stripped src if it is an http(s) URL, else None."""
        src = (img.get("src") or "").strip()
        if not src:
            return None
        try:
            scheme = urlparse(src).scheme
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. unmatched square
            # brackets in the netloc). Skip them instead of propagating.
            return None
        return src if scheme in ("http", "https") else None

    soup = BeautifulSoup(html, "lxml")
    result: dict[str, str] = {}
    for figure in soup.find_all("figure"):
        figcaption = figure.find("figcaption")
        if not figcaption:
            continue
        caption_text = figcaption.get_text(separator=" ", strip=True)
        if not caption_text:
            continue
        http_srcs = [
            src for src in map(_http_src, figure.find_all("img"))
            if src is not None
        ]
        # Exactly one http(s) image is required for an unambiguous mapping.
        if len(http_srcs) != 1:
            continue
        result[http_srcs[0]] = caption_text
    return result
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Site-specific anchor strings. These are an escape hatch for CMS
|
|
71
|
+
# footer phrases that don't match any structural pattern in
|
|
72
|
+
# trim_trailing_noise(). The generic mechanisms (noise patterns,
|
|
73
|
+
# heading detection) do the heavy lifting. Add strings here only
|
|
74
|
+
# when a specific site's boilerplate cannot be caught structurally.
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
_TAIL_BOILERPLATE_ANCHORS = frozenset([
|
|
78
|
+
"Follow Cleveland Clinic",
|
|
79
|
+
"Learn more about the Health Library",
|
|
80
|
+
"Got a story we should hear",
|
|
81
|
+
"Back to top",
|
|
82
|
+
"Educate your inbox",
|
|
83
|
+
])
|
|
84
|
+
|
|
85
|
+
_BOILERPLATE_SECTION_HEADINGS = frozenset([
|
|
86
|
+
"educate your inbox",
|
|
87
|
+
"newsletter",
|
|
88
|
+
"subscribe",
|
|
89
|
+
"comments",
|
|
90
|
+
"related stories",
|
|
91
|
+
"related articles",
|
|
92
|
+
"more stories",
|
|
93
|
+
"recommended",
|
|
94
|
+
"advertisement",
|
|
95
|
+
"sponsored content",
|
|
96
|
+
])
|
|
97
|
+
|
|
98
|
+
_TAIL_SCAN_LIMIT = 10 # never scan further than this from the end
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _inline_to_text(inline: "Inline") -> str:
    """
    Flatten one Inline node to plain text for pattern matching.

    Text and Code contribute their ``text``; Emphasis, Strong and Link
    contribute the concatenated text of their children (recursively).
    Anything else, including LineBreak, contributes an empty string.
    """
    def _children_text(node) -> str:
        return "".join(map(_inline_to_text, node.children))

    if isinstance(inline, Text):
        return inline.text
    if isinstance(inline, (Emphasis, Strong)):
        return _children_text(inline)
    if isinstance(inline, Code):
        return inline.text
    if isinstance(inline, Link):
        return _children_text(inline)
    # LineBreak and unknown inline kinds carry no text.
    return ""
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _paragraph_plain_text(para: Paragraph) -> str:
    """Flatten a Paragraph's inlines to whitespace-stripped plain text for boilerplate matching."""
    fragments = [_inline_to_text(node) for node in para.inlines]
    return "".join(fragments).strip()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def trim_trailing_boilerplate(sections: list[Section]) -> list[Section]:
    """
    Strip trailing CMS footer paragraphs from the final section.

    Only the last _TAIL_SCAN_LIMIT blocks of the final section are
    inspected. The earliest Paragraph in that window whose plain text
    contains any _TAIL_BOILERPLATE_ANCHORS string marks the cut: that block
    and everything after it are removed; blocks before it are kept.

    End-anchored: content outside the tail window is never removed.
    The input list is not mutated; a new list is returned when a cut occurs.
    """
    if not sections:
        return sections

    final = sections[-1]
    tail = final.blocks
    if not tail:
        return sections

    def _hits_anchor(candidate) -> bool:
        if not isinstance(candidate, Paragraph):
            return False
        plain = _paragraph_plain_text(candidate)
        return any(anchor in plain for anchor in _TAIL_BOILERPLATE_ANCHORS)

    window_start = max(0, len(tail) - _TAIL_SCAN_LIMIT)
    cut_at = next(
        (idx for idx in range(window_start, len(tail)) if _hits_anchor(tail[idx])),
        None,
    )
    if cut_at is None:
        return sections

    trimmed = list(sections)  # do not mutate the input list
    trimmed[-1] = Section(heading=final.heading, blocks=list(tail[:cut_at]))
    return trimmed
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
_TRAILING_NOISE_DATE_RE = re.compile(
|
|
158
|
+
r'^(?:\d{4}[-/]\d{2}[-/]\d{2}'
|
|
159
|
+
r'|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]*\s+\d{1,2},?\s+\d{4}'
|
|
160
|
+
r'|\d{1,2}/\d{1,2}/\d{4})$',
|
|
161
|
+
re.IGNORECASE,
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _is_trailing_noise(text: str) -> bool:
|
|
166
|
+
"""Return True if a paragraph's plain text matches a trailing noise pattern."""
|
|
167
|
+
t = text.strip()
|
|
168
|
+
if not t:
|
|
169
|
+
return False
|
|
170
|
+
tl = t.lower()
|
|
171
|
+
|
|
172
|
+
# Photo/image/visual/credit prefix
|
|
173
|
+
if tl.startswith(("image:", "photo:", "visual:", "credit:")):
|
|
174
|
+
return True
|
|
175
|
+
|
|
176
|
+
# Copyright symbol or (c)
|
|
177
|
+
if t.startswith("\u00a9") or tl.startswith("(c)"):
|
|
178
|
+
return True
|
|
179
|
+
|
|
180
|
+
# License boilerplate
|
|
181
|
+
if "cc by" in tl or "creative commons" in tl:
|
|
182
|
+
return True
|
|
183
|
+
|
|
184
|
+
# Trivial noise (3 chars or fewer)
|
|
185
|
+
if len(t) <= 3:
|
|
186
|
+
return True
|
|
187
|
+
|
|
188
|
+
# Bare date stamp
|
|
189
|
+
if _TRAILING_NOISE_DATE_RE.match(t):
|
|
190
|
+
return True
|
|
191
|
+
|
|
192
|
+
return False
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def trim_trailing_noise(sections: list[Section]) -> list[Section]:
    """
    Strip trailing noise paragraphs from the final section.

    Walks backwards from the last block of the final section, removing
    Paragraphs whose plain text satisfies _is_trailing_noise() (photo
    credits, license text, date stamps, trivial noise). The walk stops at
    the first non-Paragraph block or non-noise paragraph (end-anchored).
    The input list is not mutated; a new list is returned when trimming occurs.
    """
    if not sections:
        return sections

    final = sections[-1]
    tail = final.blocks
    if not tail:
        return sections

    # `keep` is the number of leading blocks that survive.
    keep = len(tail)
    while keep > 0:
        candidate = tail[keep - 1]
        if not isinstance(candidate, Paragraph):
            break
        if not _is_trailing_noise(_paragraph_plain_text(candidate)):
            break
        keep -= 1

    if keep == len(tail):
        return sections

    trimmed = list(sections)
    trimmed[-1] = Section(heading=final.heading, blocks=list(tail[:keep]))
    return trimmed
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def drop_trailing_orphan_section(sections: list[Section]) -> list[Section]:
    """
    Drop the final section when it is an end-of-document artefact.

    The last section is removed if any of the following holds:

    1. It has zero content blocks (bare heading stub).
    2. Every block satisfies _is_placeholder_paragraph(), i.e. is a
       single-inline bracketed token such as '[Form omitted]' or
       '[Image: ...]'.
    3. Its normalized heading text is listed in
       _BOILERPLATE_SECTION_HEADINGS.

    End-anchored: only the last section is inspected, so earlier content is
    never affected. Returns a new list (slice) when the section is dropped,
    otherwise the input list itself.
    """
    if not sections:
        return sections

    final = sections[-1]
    is_orphan = (
        len(final.blocks) == 0
        or all(_is_placeholder_paragraph(block) for block in final.blocks)
        or _normalize_heading_text(final.heading) in _BOILERPLATE_SECTION_HEADINGS
    )
    return sections[:-1] if is_orphan else sections
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _normalize_str(s: str) -> str:
|
|
261
|
+
"""Collapse whitespace, strip, and lowercase a plain text string."""
|
|
262
|
+
return re.sub(r'\s+', ' ', s).strip().lower()
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _normalize_heading_text(heading) -> str:
    """
    Return the heading's plain text in normalized form.

    The heading's inlines are flattened with _inline_to_text(), joined, and
    passed through _normalize_str() (whitespace collapsed, stripped,
    lowercased). Used for heading comparisons such as consecutive
    duplicate detection.
    """
    fragments = [_inline_to_text(node) for node in heading.inlines]
    return _normalize_str("".join(fragments))
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def drop_duplicate_consecutive_sections(sections: list[Section]) -> list[Section]:
    """
    Collapse runs of adjacent sections that share the same normalized
    heading text down to a single section.

    Only adjacent (consecutive) duplicates are collapsed. Non-consecutive
    duplicate headings are left untouched. Structural only — no
    site-specific strings.

    Within a run the first section is kept, except when it has no blocks
    and a later duplicate does: then the section with content takes its
    place. Otherwise a bare heading followed by the same heading with the
    real content would keep the empty section and discard the content.

    Called after collapse_consecutive_placeholder_blocks(), before render.

    Args:
        sections: Sections in document order.

    Returns:
        A new list with consecutive duplicates collapsed (the input list
        itself when it is empty).
    """
    if not sections:
        return sections

    result = [sections[0]]
    # Normalized heading of result[-1]; cached so each heading is
    # normalized once rather than on every comparison.
    kept_key = _normalize_heading_text(sections[0].heading)
    for section in sections[1:]:
        key = _normalize_heading_text(section.heading)
        if key != kept_key:
            result.append(section)
            kept_key = key
        elif not result[-1].blocks and section.blocks:
            # Same heading, but the kept section is empty: prefer the
            # duplicate that actually carries content.
            result[-1] = section
    return result
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def drop_empty_sections(sections: list[Section]) -> list[Section]:
    """
    Return a new list containing only the sections that have content blocks.

    A heading with nothing beneath it is treated as structural noise —
    for example a title-duplicate artifact, or a source heading whose
    content was lost in extraction.

    Applied to every section, not just the last one; a section with zero
    blocks carries no prose or media, so nothing readable is discarded.
    """
    def _has_blocks(section) -> bool:
        return len(section.blocks) > 0

    return list(filter(_has_blocks, sections))
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _is_placeholder_paragraph(block) -> bool:
    """
    Report whether *block* looks like a placeholder token paragraph.

    True only for a Paragraph holding exactly one inline, that inline being
    a Text node whose text starts with '[' and ends with ']' (e.g.
    '[Form omitted]' or '[Image: <alt>]'). The check is purely on the
    bracket shape; the token wording itself is not inspected.
    """
    from decant.core.model import Paragraph, Text as TextInline

    if not isinstance(block, Paragraph) or len(block.inlines) != 1:
        return False

    sole = block.inlines[0]
    if not isinstance(sole, TextInline):
        return False

    content = sole.text
    return content.startswith("[") and content.endswith("]")
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def collapse_consecutive_placeholder_blocks(sections: list[Section]) -> list[Section]:
    """
    Reduce each run of adjacent identical placeholder Paragraphs to one.

    Within every section's block list, a placeholder paragraph is skipped
    when the block kept immediately before it is also a placeholder with
    exactly the same text (so six consecutive '[Form omitted]' blocks become
    one). Non-placeholder blocks and single placeholders are left untouched.

    Every section is rebuilt as a new Section with the same heading.
    Called after drop_trailing_orphan_section(), before render.
    """
    def _repeats_previous(kept: list, candidate) -> bool:
        # True when `candidate` duplicates the placeholder kept just before it.
        if not kept or not _is_placeholder_paragraph(candidate):
            return False
        previous = kept[-1]
        return (
            _is_placeholder_paragraph(previous)
            and previous.inlines[0].text == candidate.inlines[0].text
        )

    rebuilt = []
    for section in sections:
        kept: list = []
        for block in section.blocks:
            if not _repeats_previous(kept, block):
                kept.append(block)
        rebuilt.append(Section(heading=section.heading, blocks=kept))
    return rebuilt
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
# Word count a single paragraph must reach to count as article prose.
_ARTICLE_BODY_MIN_WORDS = 20


def _has_article_body(sections: list[Section]) -> bool:
    """
    Report whether any section holds a real prose paragraph.

    A block qualifies when it is a Paragraph, is not a bracketed placeholder
    (per _is_placeholder_paragraph), and its flattened text splits into at
    least _ARTICLE_BODY_MIN_WORDS whitespace-separated words. Headings and
    non-Paragraph blocks (lists etc.) are not counted.

    Used to detect extraction failure where navigation or boilerplate was
    captured instead of article content.
    """
    from decant.core.model import Paragraph as ParagraphModel

    def _is_prose(block) -> bool:
        if not isinstance(block, ParagraphModel):
            return False
        if _is_placeholder_paragraph(block):
            return False
        words = "".join(_inline_to_text(il) for il in block.inlines).split()
        return len(words) >= _ARTICLE_BODY_MIN_WORDS

    return any(
        _is_prose(block)
        for section in sections
        for block in section.blocks
    )
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
# Minimum prose characters (after stripping tags) required to accept a
# headingless Trafilatura extraction instead of falling back to the full
# original HTML. Both guardian (32 K) and theringer (18 K) clear this by a
# wide margin; a genuine empty/navigation-only extraction is well below it.
_MIN_PROSE_CHARS = 2000
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _extract_title_string(html: str) -> str:
|
|
393
|
+
"""
|
|
394
|
+
Extract and normalise the text content of the <title> tag from a raw
|
|
395
|
+
HTML string. Returns an empty string if no <title> is found.
|
|
396
|
+
|
|
397
|
+
Used to supply a synthetic heading when Trafilatura produces headingless
|
|
398
|
+
prose output (see extract_with_trafilatura).
|
|
399
|
+
"""
|
|
400
|
+
m = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
|
|
401
|
+
if not m:
|
|
402
|
+
return ""
|
|
403
|
+
return re.sub(r'\s+', ' ', m.group(1)).strip()
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _strip_title_branding(title: str | None) -> str | None:
|
|
407
|
+
"""
|
|
408
|
+
Strip site-branding suffix from a page title string.
|
|
409
|
+
|
|
410
|
+
Tries delimiters in priority order: ' | ', ' - ', ' -- '.
|
|
411
|
+
For the first delimiter found, splits on its last (rightmost) occurrence
|
|
412
|
+
and returns the left part, whitespace-stripped.
|
|
413
|
+
Returns None for None/empty input; returns the original stripped string
|
|
414
|
+
if no delimiter matches.
|
|
415
|
+
"""
|
|
416
|
+
if not title:
|
|
417
|
+
return None
|
|
418
|
+
for delim in (" | ", " - ", " -- "):
|
|
419
|
+
if delim in title:
|
|
420
|
+
return title.rsplit(delim, 1)[0].strip()
|
|
421
|
+
return title.strip()
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _first_heading_text(html_string: str) -> str | None:
|
|
425
|
+
"""
|
|
426
|
+
Return the plain text of the first h1-h6 element in an HTML string, or
|
|
427
|
+
None if no heading is found or the text is empty/whitespace-only.
|
|
428
|
+
|
|
429
|
+
Used by the H1-injection guard to compare the candidate synthetic title
|
|
430
|
+
against the first real heading already present in Trafilatura output.
|
|
431
|
+
"""
|
|
432
|
+
m = re.search(r'<h[1-6][^>]*>(.*?)</h[1-6]>', html_string, re.IGNORECASE | re.DOTALL)
|
|
433
|
+
if not m:
|
|
434
|
+
return None
|
|
435
|
+
text = re.sub(r'<[^>]+>', '', m.group(1)).strip()
|
|
436
|
+
return text if text else None
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def extract_with_trafilatura(
    html: str,
    extraction_mode: ExtractionMode = "baseline",
    original_title: str | None = None,
) -> str:
    """
    Extract main content from HTML using Trafilatura (extract mode).

    Returns the extracted HTML when Trafilatura succeeds and falls back to
    the original HTML only when extraction genuinely fails (None, empty, or
    headingless output with too little prose).

    Acceptance rules (in order):
    1. Extracted content contains a heading (h1-h6) -> accepted. If it has
       no <h1>, a synthetic <h1> built from the page title is prepended,
       unless that title equals the first real heading after normalization.
    2. Extracted content is headingless but has >= _MIN_PROSE_CHARS
       characters once tags are stripped -> a synthetic <h1> is prepended so
       validate_structure() and build_sections() can process it. This
       catches prose articles whose headings are absent from Trafilatura's
       HTML output.
    3. Otherwise -> the original ``html`` is returned unchanged.

    In every accepted case, paragraphs consisting solely of the word
    "Advertisement" are removed from the extracted HTML.

    The synthetic heading text is, in order of preference:
    - ``original_title`` if supplied, else the raw <title> parsed from
      ``html`` — either one with site branding stripped;
    - the literal string "Article".

    Known limitations in extract mode (documented, not bugs):
    - <pre> code blocks are converted to <blockquote>
    - Some inline spacing may be lost around inline elements

    Args:
        html: Raw HTML string.
        extraction_mode: Trafilatura parameter set (default "baseline").
            "baseline"  - favor_precision=True, no_fallback=False.
            "precision" - favor_precision=True, no_fallback=True.
            "recall"    - favor_precision=False, favor_recall=True,
                          no_fallback=False.
            Any other value is treated as "baseline".
        original_title: Optional page title string for the synthetic <h1>.
            When omitted, the title is read from ``html``.

    Returns:
        Extracted HTML string (possibly with an injected <h1>), or the
        original HTML if extraction failed.
    """
    # Trafilatura kwargs per extraction mode. The final branch is the
    # baseline production behavior and must not change.
    if extraction_mode == "precision":
        # no_fallback=True: skip secondary extraction methods on failure.
        # Trafilatura>=1.8.0 supports this parameter.
        traf_kwargs: dict = {"favor_precision": True, "no_fallback": True}
    elif extraction_mode == "recall":
        # favor_recall=True: more inclusive extraction, at the cost of more
        # boilerplate leakage.
        traf_kwargs = {
            "favor_precision": False,
            "favor_recall": True,
            "no_fallback": False,
        }
    else:
        traf_kwargs = {"favor_precision": True, "no_fallback": False}

    extracted = trafilatura.extract(
        html,
        output_format="html",
        include_formatting=True,
        include_links=True,
        include_images=True,
        include_comments=False,
        include_tables=False,
        **traf_kwargs,
    )

    # Nothing usable came back: genuine extraction failure.
    if not extracted:
        return html

    def _synthetic_h1() -> tuple[str | None, str]:
        """Return (branding-stripped title, <h1> markup to prepend)."""
        page_title = _strip_title_branding(original_title or _extract_title_string(html))
        markup = f"<h1>{page_title}</h1>\n" if page_title else "<h1>Article</h1>\n"
        return page_title, markup

    def _without_ad_paragraphs(markup: str) -> str:
        return re.sub(r'<p>\s*Advertisement\s*</p>', '', markup)

    if any(f"<h{level}" in extracted for level in range(1, 7)):
        cleaned = _without_ad_paragraphs(extracted)
        if "<h1" in extracted:
            return cleaned

        title, h1 = _synthetic_h1()
        # Guard: skip injection if the title duplicates the first real
        # heading (same normalization as duplicate-section detection).
        first_h = _first_heading_text(cleaned)
        if (
            first_h is not None
            and title
            and _normalize_str(title) == _normalize_str(first_h)
        ):
            return cleaned
        return h1 + cleaned

    # Headingless extraction: accept only if prose-sufficient.
    prose_chars = len(re.sub(r'<[^>]+>', '', extracted).strip())
    if prose_chars < _MIN_PROSE_CHARS:
        return html

    cleaned = _without_ad_paragraphs(extracted)
    _title, h1 = _synthetic_h1()
    return h1 + cleaned
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def parse(html: str, original_title=None, require_article_body: bool = False,
          caption_map: dict[str, str] | None = None,
          source_url: str = "") -> Document:
    """
    Parse HTML string to Document model.

    Full pipeline: sanitize -> parse DOM -> select content -> validate ->
    build sections -> trim/dedupe sections -> extract title.

    In transform mode, call parse() directly with raw HTML.
    In extract mode, call extract_with_trafilatura() first, then parse()
    with the original_title captured before extraction.

    Args:
        html: Raw HTML string (pre-extracted if in extract mode)
        original_title: Optional BeautifulSoup Tag for <title>, captured
                        before Trafilatura strips <head>. Forwarded to
                        extract_title() to preserve the document title.
        require_article_body: If True, raise ValidationError when no paragraph
                        with >= _ARTICLE_BODY_MIN_WORDS words of
                        non-placeholder prose is found. Set to True in
                        extract mode to detect extraction failure.
                        Default False to preserve backward compatibility.
        caption_map: Optional mapping forwarded unchanged to
                        build_sections(); presumably the img src -> caption
                        map produced by harvest_captions() (confirm against
                        the caller).
        source_url: Stored as-is on the returned Document (default "").

    Returns:
        Document model with title, sections and source_url

    Raises:
        ValidationError: If validate_structure() or preflight_scope_check()
            rejects the content, or if require_article_body is set and no
            article body is found.
    """
    # Step 1: Sanitize
    clean_html = sanitize(html)

    # Step 1.5: Convert trafilatura <graphic> to <img> before lxml parsing.
    # lxml doesn't recognise <graphic> as void, so it nests subsequent
    # content inside it. <img> is a known void element.
    clean_html = re.sub(
        r'<graphic\b([^>]*)(?:/>|>\s*</graphic>|>)',
        r'<img\1>',
        clean_html,
    )

    # Step 2: Parse DOM
    soup = BeautifulSoup(clean_html, "lxml")

    # Step 3: Select main content
    content = select_main_content(soup)

    # Step 4: Validate structure
    validate_structure(content)

    # Step 4.5: Preflight scope check — reject table/form/reference-dominant pages
    preflight_scope_check(content)

    # Step 5: Build sections
    sections = build_sections(content, caption_map=caption_map)

    # Step 5.5: Trim trailing CMS boilerplate paragraphs (end-anchored)
    sections = trim_trailing_boilerplate(sections)

    # Step 5.55: Trim trailing noise paragraphs (end-anchored)
    sections = trim_trailing_noise(sections)

    # Step 5.6: Drop final orphan section (empty, placeholder-only, or boilerplate heading)
    sections = drop_trailing_orphan_section(sections)

    # Step 5.7: Collapse consecutive identical placeholder blocks
    sections = collapse_consecutive_placeholder_blocks(sections)

    # Step 5.8: Drop consecutive duplicate-heading sections
    sections = drop_duplicate_consecutive_sections(sections)

    # Step 5.85: Drop all empty sections (heading-only, no content blocks)
    sections = drop_empty_sections(sections)

    # Step 5.9: Guard against extraction failure — no article body (extract mode only)
    if require_article_body and not _has_article_body(sections):
        # The threshold in the message comes from the same constant
        # _has_article_body() checks, so the two cannot drift apart.
        raise ValidationError(
            "No article body detected: document contains no paragraph with "
            f"{_ARTICLE_BODY_MIN_WORDS} or more words of prose text. "
            "Extraction may have captured "
            "navigation or boilerplate instead of article content."
        )

    # Step 6: Extract title
    title = extract_title(soup, content, original_title=original_title)

    return Document(title=title, sections=sections, source_url=source_url)
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def extract_title(soup: BeautifulSoup, content: Tag, original_title=None) -> str:
    """
    Extract document title from <title> tag or first <h1>.

    Per decisions.md section 5:
    - If <title> exists, use its text
    - Else if first <h1> in content exists, use its text
    - Else empty string

    Args:
        soup: Full DOM tree (for <title> access)
        content: Selected content subtree
        original_title: Optional title captured before Trafilatura stripped
                        <head>. May be a BeautifulSoup Tag or a plain
                        title string; when truthy it takes precedence over
                        the <title> found in ``soup``.

    Returns:
        Title string with whitespace runs collapsed and ends stripped
        (may be empty)
    """
    # Use original title if provided (extract mode - Trafilatura strips <head>)
    title_source = original_title or soup.find("title")
    if title_source:
        # A plain string has no get_text(); accept it directly so callers can
        # pass the same title string they give extract_with_trafilatura().
        if isinstance(title_source, str):
            raw_title = title_source
        else:
            raw_title = title_source.get_text()
        return re.sub(r'\s+', ' ', raw_title).strip()

    # Fall back to first <h1> in content
    h1 = content.find("h1")
    if h1:
        return re.sub(r'\s+', ' ', h1.get_text()).strip()

    return ""
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def validate_structure(content: Tag) -> None:
    """
    Check that the content carries the minimum semantic structure.

    Per decisions.md section 3, the content must contain:
    - at least one h1, h2, or h3, and
    - at least one p, ul, or ol.

    Args:
        content: Selected content subtree

    Raises:
        ValidationError: If either requirement is not met
    """
    missing_heading = not content.find(["h1", "h2", "h3"])
    missing_body = not content.find(["p", "ul", "ol"])

    if missing_heading or missing_body:
        raise ValidationError(
            "Input HTML lacks semantic structure "
            "(requires at least one h1-h3 and body content in p/ul/ol)."
        )
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
def preflight_scope_check(content: Tag) -> None:
    """
    Preflight: reject table/form/reference-dominant pages before model build.

    Three structural signatures mark content as out of scope:
    1. Form/tool pages: at least one <form> and fewer than 250 words of
       <p> text.
    2. Table/reference pages: at least two <table> elements and fewer than
       250 words of <p> text.
    3. Navigation/reference pages: between 1 and 499 words of <p> text, and
       the words inside all <a> elements exceed half of that count.

    Thresholds are calibrated against the eval20 corpus:
    - Lowest in-scope ACCEPT p_text_words: 241 (cdc)
    - Highest in-scope ACCEPT link/p ratio: 0.18 (theconversation)

    Called unconditionally after validate_structure() so that it applies in
    both transform and extract mode.

    Raises:
        ValidationError: If any of the three signatures matches.
    """
    def _word_total(tag_name: str) -> int:
        # Whitespace-separated word count summed over every <tag_name>.
        return sum(
            len(node.get_text().split())
            for node in content.find_all(tag_name)
        )

    form_count = len(content.find_all("form"))
    table_count = len(content.find_all("table"))
    p_text_words = _word_total("p")
    link_words = _word_total("a")

    sparse_prose = p_text_words < 250

    if form_count >= 1 and sparse_prose:
        raise ValidationError("Out of scope: tool/form page.")

    if table_count >= 2 and sparse_prose:
        raise ValidationError("Out of scope: table/reference page.")

    if 0 < p_text_words < 500 and (link_words / p_text_words) > 0.5:
        raise ValidationError("Out of scope: navigation/reference page.")
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
def build_sections(content: Tag, caption_map: dict[str, str] | None = None) -> list[Section]:
    """
    Build sections from headings in content.

    Per decisions.md section 5:
    - Drop all content before first heading
    - Each heading starts a new section
    - Section continues until next heading (any level)

    Uses a flatten-then-assign approach: all headings and block-level
    elements are collected in document order, then each block is assigned
    to the most recent heading. This avoids content duplication when
    headings are nested at different DOM depths (e.g. h2 -> div -> h3).

    Each element is parsed exactly once: the flatten pass keeps the parsed
    Heading/Block object, so the assign pass never has to re-parse it.

    Args:
        content: Selected content subtree
        caption_map: Optional map of img src -> caption text, forwarded to
            parse_block()

    Returns:
        List of Section objects (empty if content has no headings)
    """
    heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
    headings = content.find_all(heading_tags)

    if not headings:
        # No headings found - will fail validation later
        return []

    heading_ids = {id(h) for h in headings}

    # Flatten: walk content descendants in document order, collecting
    # parsed headings and parsed blocks. Any element with an
    # already-collected ancestor is skipped (prevents duplication).
    ordered: list[tuple[str, Heading | Block]] = []  # ("heading"|"block", parsed)
    collected_ids: set[int] = set()

    for elem in content.descendants:
        if not isinstance(elem, Tag):
            continue

        # Skip elements inside an already-collected heading or block
        if any(id(parent) in collected_ids for parent in elem.parents):
            continue

        if id(elem) in heading_ids:
            ordered.append(("heading", parse_heading(elem)))
            collected_ids.add(id(elem))
        elif elem.name not in heading_tags:
            block = parse_block(elem, caption_map=caption_map)
            if block:
                ordered.append(("block", block))
                collected_ids.add(id(elem))

    # Assign: each block goes to the most recent heading.
    sections: list[Section] = []
    current_heading: Heading | None = None
    current_blocks: list[Block] = []

    for kind, parsed in ordered:
        if kind == "heading":
            # Flush previous section; headings without text are dropped
            # together with the blocks collected under them.
            if current_heading is not None and _heading_has_text(current_heading):
                sections.append(Section(heading=current_heading, blocks=current_blocks))
            current_heading = parsed
            current_blocks = []
        elif current_heading is not None:
            # Blocks seen before the first heading are discarded.
            current_blocks.append(parsed)

    # Flush last section
    if current_heading is not None and _heading_has_text(current_heading):
        sections.append(Section(heading=current_heading, blocks=current_blocks))

    return sections
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
def _heading_has_text(heading: "Heading") -> bool:
    """Report whether the heading's concatenated inline text is non-blank."""
    flattened = "".join(map(_inline_to_text, heading.inlines))
    return flattened.strip() != ""
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
def parse_heading(element: Tag) -> Heading:
    """
    Convert an h1-h6 element into a Heading model.

    The level is read from the second character of the tag name
    (the digit in "h3"); the content is parsed as inlines.

    Args:
        element: BeautifulSoup Tag for h1-h6

    Returns:
        Heading with level and inline content
    """
    return Heading(
        level=int(element.name[1]),
        inlines=parse_inlines(element),
    )
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def _listblock_has_content(block: "ListBlock") -> bool:
|
|
820
|
+
"""Return True if the list has at least one item with non-empty content.
|
|
821
|
+
|
|
822
|
+
Filters two structural artifact cases:
|
|
823
|
+
- Zero items: <ol></ol> or <ul></ul> (no <li> elements at all)
|
|
824
|
+
- All-empty items: every <li> has no inline text and no nested lists
|
|
825
|
+
(e.g. <ul><li></li><li></li></ul>)
|
|
826
|
+
Both are unambiguously extraction artifacts; legitimate lists always have
|
|
827
|
+
at least one item with visible content.
|
|
828
|
+
"""
|
|
829
|
+
return bool(block.items) and any(
|
|
830
|
+
item.inlines or item.children for item in block.items
|
|
831
|
+
)
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def parse_figure(element: Tag, caption_map: dict[str, str] | None = None) -> Block | None:
    """
    Parse <figure> element to Block model.

    If the figure contains an image, the first one is degraded via
    degrade_image(). When that yields an Image, its caption is taken from
    the <figcaption> text; if there is no figcaption text, the caption
    falls back to ``caption_map`` keyed by the image src (the same lookup
    parse_block() performs for bare images). A non-Image result
    (placeholder text) is wrapped in a Paragraph.

    If there is no image but figcaption text exists, the caption is
    returned as a Paragraph. A figure with neither yields None.

    Args:
        element: BeautifulSoup Tag for <figure>
        caption_map: Optional map of img src -> caption text

    Returns:
        Image, Paragraph, or None if the figure has nothing to keep
    """
    imgs = element.find_all(["img", "graphic"])
    figcaption = element.find("figcaption")
    caption_text = figcaption.get_text(separator=" ", strip=True) if figcaption else ""

    if not imgs:
        if caption_text:
            return Paragraph(inlines=[Text(text=caption_text)])
        return None

    result = degrade_image(imgs[0])
    if not isinstance(result, Image):
        # Placeholder Text - wrap in Paragraph
        return Paragraph(inlines=[result])

    if caption_text:
        # An explicit figcaption always wins over the external map.
        result.caption = caption_text
    elif caption_map and result.src in caption_map:
        result.caption = caption_map[result.src]
    return result
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def parse_block(element: Tag, caption_map: dict[str, str] | None = None) -> Block | None:
    """
    Dispatch a block-level element to its parser by tag name.

    Tags without a handler produce None (silently skipped in v1).
    v2: Add warning logging for skipped elements.

    Paragraphs without inlines and lists without content also produce None.

    Args:
        element: BeautifulSoup Tag for block element
        caption_map: Optional map of img src -> caption text (extract mode)

    Returns:
        Block model object or None if unsupported
    """
    kind = element.name

    if kind == "p":
        paragraph = parse_paragraph(element)
        if paragraph.inlines:
            return paragraph
        return None

    if kind in ("ul", "ol"):
        listing = parse_list(element)
        if _listblock_has_content(listing):
            return listing
        return None

    if kind == "blockquote":
        return parse_quote(element)

    if kind == "pre":
        return parse_preformatted(element)

    if kind == "table":
        return degrade_table(element)

    if kind == "figure":
        return parse_figure(element, caption_map=caption_map)

    if kind in ("img", "graphic"):
        degraded = degrade_image(element)
        if not isinstance(degraded, Image):
            # Placeholder Text — wrap in Paragraph
            return Paragraph(inlines=[degraded])
        if caption_map and degraded.src in caption_map:
            degraded.caption = caption_map[degraded.src]
        return degraded

    if kind in ("form", "input", "textarea", "select", "button"):
        return degrade_form(element)

    if kind == "hr":
        return degrade_hr()

    # Unknown block - skip silently in v1
    return None
|
|
906
|
+
|
|
907
|
+
|
|
908
|
+
def parse_paragraph(element: Tag) -> Paragraph:
    """
    Convert a <p> element into a Paragraph model.

    Args:
        element: BeautifulSoup Tag for <p>

    Returns:
        Paragraph holding the element's parsed inline content
    """
    return Paragraph(inlines=parse_inlines(element))
|
|
920
|
+
|
|
921
|
+
|
|
922
|
+
def parse_list(element: Tag) -> ListBlock:
    """
    Convert a <ul>/<ol> element into a ListBlock model.

    Only direct <li> children become items. For each item, nested lists
    that are direct children of the <li> are parsed recursively into
    ListItem.children and are kept out of the item's inline content.

    Args:
        element: BeautifulSoup Tag for <ul> or <ol>

    Returns:
        ListBlock with items; ``ordered`` is True only for <ol>
    """
    entries = []

    for li in element.find_all("li", recursive=False):
        # Work on a copy so removing the nested lists for inline parsing
        # leaves the original tree intact for the recursive step below.
        text_only = li.__copy__()
        for sublist in text_only.find_all(["ul", "ol"], recursive=False):
            sublist.decompose()
        item_inlines = parse_inlines(text_only)

        item_children = [
            parse_list(sublist)
            for sublist in li.find_all(["ul", "ol"], recursive=False)
        ]

        entries.append(ListItem(inlines=item_inlines, children=item_children))

    return ListBlock(ordered=(element.name == "ol"), items=entries)
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def parse_quote(element: Tag) -> Quote:
    """
    Convert a <blockquote> element into a Quote model (recursive).

    Each direct child Tag is passed to parse_block(); children that yield
    no block are dropped. Non-Tag children (bare text nodes directly under
    the blockquote) are not examined.

    Args:
        element: BeautifulSoup Tag for <blockquote>

    Returns:
        Quote containing the parsed child blocks
    """
    parsed_children = (
        parse_block(child)
        for child in element.children
        if isinstance(child, Tag)
    )
    return Quote(blocks=[block for block in parsed_children if block])
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
def parse_preformatted(element: Tag) -> Preformatted:
    """
    Convert a <pre> element into a Preformatted model.

    The element's text is taken as-is, with no whitespace normalization.

    Args:
        element: BeautifulSoup Tag for <pre>

    Returns:
        Preformatted with verbatim text
    """
    verbatim = element.get_text()
    return Preformatted(text=verbatim)
|
|
987
|
+
|
|
988
|
+
|
|
989
|
+
def collapse_whitespace(text: str) -> str:
    """
    Squeeze every run of whitespace down to one space.

    A single leading and/or trailing space is kept when the input started
    and/or ended with whitespace, following HTML whitespace collapsing
    rules (decisions.md section 8). Empty and whitespace-only input
    both produce the empty string.
    """
    body = re.sub(r"\s+", " ", text.strip()) if text else ""
    if not body:
        return ""

    # body is non-empty here, so text has at least one character.
    prefix = " " if text[0].isspace() else ""
    suffix = " " if text[-1].isspace() else ""
    return f"{prefix}{body}{suffix}"
|
|
1005
|
+
|
|
1006
|
+
|
|
1007
|
+
def parse_inlines(element: Tag) -> list[Inline]:
    """
    Parse the direct children of an element into inline models.

    Text nodes are whitespace-normalized with collapse_whitespace() and
    dropped when nothing remains. Tag children are delegated to
    parse_inline_element(), which recurses for nested inline markup; a list
    result is spliced in, a single truthy result is appended, anything
    else is skipped.

    Args:
        element: BeautifulSoup Tag containing inline content

    Returns:
        List of Inline model objects
    """
    collected = []

    for node in element.children:
        if isinstance(node, NavigableString):
            normalized = collapse_whitespace(str(node))
            if normalized:
                collected.append(Text(text=normalized))
            continue

        if not isinstance(node, Tag):
            continue

        parsed = parse_inline_element(node)
        if isinstance(parsed, list):
            collected += parsed
        elif parsed:
            collected.append(parsed)

    return collected
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
def parse_inline_element(element: Tag) -> Inline | list[Inline] | None:
    """
    Parse single inline element.

    Returns Inline object, list of Inlines, or None when the element
    contributes nothing.

    Handling by tag:
    - br: LineBreak
    - em / i: Emphasis wrapping the parsed children
    - strong / b: Strong wrapping the parsed children
    - code: Code with the element's text taken as-is
    - a: Link with the href attribute ("" when absent) and parsed children
    - img / graphic: degraded via degrade_image(); an Image is reduced to
      its alt text (or None without alt), any other result is returned
      unchanged
    - anything else: the element's text as a single Text, normalized with
      collapse_whitespace() so it follows the same whitespace rules as
      plain text nodes (internal runs collapsed, boundary spaces kept so
      adjacent words do not merge)

    Args:
        element: BeautifulSoup Tag for inline element

    Returns:
        Inline model object(s) or None
    """
    tag_name = element.name

    if tag_name == "br":
        return LineBreak()
    elif tag_name in ("em", "i"):
        return Emphasis(children=parse_inlines(element))
    elif tag_name in ("strong", "b"):
        return Strong(children=parse_inlines(element))
    elif tag_name == "code":
        return Code(text=element.get_text())
    elif tag_name == "a":
        href = element.get("href", "")
        return Link(href=href, children=parse_inlines(element))
    elif tag_name in ("img", "graphic"):
        result = degrade_image(element)
        if isinstance(result, Image):
            # Image is block-level; in inline context, use alt text
            return Text(text=result.alt) if result.alt else None
        return result
    else:
        # Unknown inline element - extract its text. Normalize it the same
        # way text nodes are normalized; a bare strip() would leave internal
        # whitespace runs untouched and glue neighbouring words together.
        text = collapse_whitespace(element.get_text())
        if text:
            return Text(text=text)
        return None
|