decant-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
decant/core/parser.py ADDED
@@ -0,0 +1,1073 @@
1
+ """
2
+ HTML to Document model parser.
3
+
4
+ Converts sanitized DOM tree to internal model representation.
5
+ Pipeline: Step 4 (after sanitization and content selection)
6
+ See decisions.md sections 5-8 for parsing rules.
7
+ """
8
+ import re
9
+ from typing import Literal
10
+
11
+ import trafilatura
12
+
13
# Names the Trafilatura parameter set chosen by extract_with_trafilatura().
# "baseline" is the current production behavior and must not change.
ExtractionMode = Literal[
    "baseline",
    "precision",
    "recall",
]
16
+
17
+ from bs4 import BeautifulSoup, Tag, NavigableString
18
+
19
+ from decant.core.model import (
20
+ Document, Section, Heading, Block, Inline,
21
+ Paragraph, ListBlock, ListItem, Quote, Preformatted, Image, Table,
22
+ Text, Emphasis, Strong, Code, Link, LineBreak
23
+ )
24
+ from decant.core.sanitizer import sanitize
25
+ from decant.core.content_selector import select_main_content
26
+ from decant.core.degradation import (
27
+ degrade_table, degrade_image, degrade_form, degrade_hr
28
+ )
29
+
30
+
31
class ValidationError(Exception):
    """Signals that the input HTML is missing the semantic structure the parser requires."""
34
+
35
+
36
def harvest_captions(html: str) -> dict[str, str]:
    """
    Scan raw HTML before Trafilatura runs and build a map of
    img src URL -> figcaption plain text.

    Rules:
    - A <figure> contributes an entry only when it has a non-empty
      <figcaption> and exactly one <img> whose src is an http/https URL.
    - Images whose src is empty, uses another scheme (e.g. data URIs) or
      cannot be parsed as a URL are ignored when counting; a figure left
      with zero or several candidate images is skipped as ambiguous.
    - Keys are the src attribute values with surrounding whitespace removed.
    - A malformed src never raises; that image is simply not counted.

    Args:
        html: Raw HTML string.

    Returns:
        Mapping of image src -> caption text (possibly empty).
    """
    from urllib.parse import urlparse

    def _http_src(img) -> str:
        """Return the stripped src if it is an http(s) URL, else ''."""
        src = (img.get("src") or "").strip()
        if not src:
            return ""
        try:
            scheme = urlparse(src).scheme
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. "http://[bad");
            # treat those like any other non-http image.
            return ""
        return src if scheme in ("http", "https") else ""

    soup = BeautifulSoup(html, "lxml")
    result: dict[str, str] = {}
    for figure in soup.find_all("figure"):
        figcaption = figure.find("figcaption")
        if not figcaption:
            continue
        caption_text = figcaption.get_text(separator=" ", strip=True)
        if not caption_text:
            continue
        http_srcs = [src for src in map(_http_src, figure.find_all("img")) if src]
        if len(http_srcs) != 1:
            continue
        result[http_srcs[0]] = caption_text
    return result
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Site-specific anchor strings. These are an escape hatch for CMS
71
+ # footer phrases that don't match any structural pattern in
72
+ # trim_trailing_noise(). The generic mechanisms (noise patterns,
73
+ # heading detection) do the heavy lifting. Add strings here only
74
+ # when a specific site's boilerplate cannot be caught structurally.
75
+ # ---------------------------------------------------------------------------
76
+
77
+ _TAIL_BOILERPLATE_ANCHORS = frozenset([
78
+ "Follow Cleveland Clinic",
79
+ "Learn more about the Health Library",
80
+ "Got a story we should hear",
81
+ "Back to top",
82
+ "Educate your inbox",
83
+ ])
84
+
85
+ _BOILERPLATE_SECTION_HEADINGS = frozenset([
86
+ "educate your inbox",
87
+ "newsletter",
88
+ "subscribe",
89
+ "comments",
90
+ "related stories",
91
+ "related articles",
92
+ "more stories",
93
+ "recommended",
94
+ "advertisement",
95
+ "sponsored content",
96
+ ])
97
+
98
+ _TAIL_SCAN_LIMIT = 10 # never scan further than this from the end
99
+
100
+
101
def _inline_to_text(inline: "Inline") -> str:
    """Flatten one Inline node into plain text for pattern matching."""
    # Leaf nodes carry their text directly.
    if isinstance(inline, (Text, Code)):
        return inline.text
    # Container nodes: concatenate the text of their children, depth-first.
    if isinstance(inline, (Emphasis, Strong, Link)):
        pieces = [_inline_to_text(child) for child in inline.children]
        return "".join(pieces)
    # LineBreak and any unrecognised inline type contribute nothing.
    return ""
112
+
113
+
114
def _paragraph_plain_text(para: Paragraph) -> str:
    """Join the text of every inline in *para* and strip the ends (for boilerplate matching)."""
    fragments = [_inline_to_text(node) for node in para.inlines]
    return "".join(fragments).strip()
117
+
118
+
119
def trim_trailing_boilerplate(sections: list[Section]) -> list[Section]:
    """
    Cut CMS footer content off the end of the final section.

    Only the last _TAIL_SCAN_LIMIT blocks of the last section are examined.
    The earliest Paragraph in that window whose plain text contains one of
    the _TAIL_BOILERPLATE_ANCHORS strings marks the cut: that block and all
    blocks after it are dropped, everything before it is kept.

    End-anchored: earlier sections, and blocks outside the scan window, are
    never touched. The input list is not mutated; when nothing matches it is
    returned as-is, otherwise a new list with a rebuilt last Section is
    returned.
    """
    if not sections:
        return sections

    final = sections[-1]
    tail_blocks = final.blocks
    if not tail_blocks:
        return sections

    def _is_anchor(candidate) -> bool:
        # Only paragraphs can act as anchors; other block types are skipped.
        if not isinstance(candidate, Paragraph):
            return False
        plain = _paragraph_plain_text(candidate)
        return any(anchor in plain for anchor in _TAIL_BOILERPLATE_ANCHORS)

    window_start = max(0, len(tail_blocks) - _TAIL_SCAN_LIMIT)
    cut_at = next(
        (
            idx
            for idx in range(window_start, len(tail_blocks))
            if _is_anchor(tail_blocks[idx])
        ),
        None,
    )
    if cut_at is None:
        return sections

    trimmed = Section(heading=final.heading, blocks=list(tail_blocks[:cut_at]))
    return [*sections[:-1], trimmed]
155
+
156
+
157
+ _TRAILING_NOISE_DATE_RE = re.compile(
158
+ r'^(?:\d{4}[-/]\d{2}[-/]\d{2}'
159
+ r'|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]*\s+\d{1,2},?\s+\d{4}'
160
+ r'|\d{1,2}/\d{1,2}/\d{4})$',
161
+ re.IGNORECASE,
162
+ )
163
+
164
+
165
+ def _is_trailing_noise(text: str) -> bool:
166
+ """Return True if a paragraph's plain text matches a trailing noise pattern."""
167
+ t = text.strip()
168
+ if not t:
169
+ return False
170
+ tl = t.lower()
171
+
172
+ # Photo/image/visual/credit prefix
173
+ if tl.startswith(("image:", "photo:", "visual:", "credit:")):
174
+ return True
175
+
176
+ # Copyright symbol or (c)
177
+ if t.startswith("\u00a9") or tl.startswith("(c)"):
178
+ return True
179
+
180
+ # License boilerplate
181
+ if "cc by" in tl or "creative commons" in tl:
182
+ return True
183
+
184
+ # Trivial noise (3 chars or fewer)
185
+ if len(t) <= 3:
186
+ return True
187
+
188
+ # Bare date stamp
189
+ if _TRAILING_NOISE_DATE_RE.match(t):
190
+ return True
191
+
192
+ return False
193
+
194
+
195
def trim_trailing_noise(sections: list[Section]) -> list[Section]:
    """
    Strip noise paragraphs from the end of the last section.

    Walks the final section's blocks from the end towards the start and
    discards each Paragraph whose plain text _is_trailing_noise() accepts
    (photo credits, license text, date stamps, trivial fragments). The walk
    stops at the first block that is not a Paragraph or is not noise, so
    nothing before that block is ever removed.

    The input list is not mutated; it is returned unchanged when nothing is
    trimmed, otherwise a new list with a rebuilt last Section is returned.
    """
    if not sections:
        return sections

    final = sections[-1]
    tail = final.blocks
    if not tail:
        return sections

    # `keep` is the number of leading blocks that survive.
    keep = len(tail)
    while keep > 0:
        candidate = tail[keep - 1]
        if not isinstance(candidate, Paragraph):
            break
        if not _is_trailing_noise(_paragraph_plain_text(candidate)):
            break
        keep -= 1

    if keep == len(tail):
        return sections

    rebuilt = Section(heading=final.heading, blocks=list(tail[:keep]))
    return [*sections[:-1], rebuilt]
229
+
230
+
231
def drop_trailing_orphan_section(sections: list[Section]) -> list[Section]:
    """
    Drop the final section when it carries no article content.

    The last section is removed if any one of these holds:

    1. It has zero content blocks (bare heading stub).
    2. Every block is a placeholder paragraph as judged by
       _is_placeholder_paragraph() (e.g. '[Form omitted]').
    3. Its normalized heading text is listed in
       _BOILERPLATE_SECTION_HEADINGS.

    End-anchored: only the last section is inspected, so content earlier in
    the document is never affected. Returns the input unchanged when the
    last section is kept, otherwise a copy without it.
    """
    if not sections:
        return sections

    final = sections[-1]
    is_orphan = (
        len(final.blocks) == 0
        or all(_is_placeholder_paragraph(blk) for blk in final.blocks)
        or _normalize_heading_text(final.heading) in _BOILERPLATE_SECTION_HEADINGS
    )
    return sections[:-1] if is_orphan else sections
258
+
259
+
260
+ def _normalize_str(s: str) -> str:
261
+ """Collapse whitespace, strip, and lowercase a plain text string."""
262
+ return re.sub(r'\s+', ' ', s).strip().lower()
263
+
264
+
265
def _normalize_heading_text(heading) -> str:
    """
    Produce the comparison key for a heading: the plain text of all its
    inlines, whitespace-collapsed, stripped and lowercased.
    """
    pieces = [_inline_to_text(node) for node in heading.inlines]
    return _normalize_str("".join(pieces))
272
+
273
+
274
def drop_duplicate_consecutive_sections(sections: list[Section]) -> list[Section]:
    """
    Collapse runs of adjacent sections that share the same normalized
    heading text into a single section.

    Only adjacent (consecutive) duplicates are collapsed. Non-consecutive
    duplicate headings are left untouched. Structural only — no
    site-specific strings.

    Normally the first section of a run is the one kept. If the kept
    section has no blocks and a later duplicate in the same run does, the
    duplicate with content replaces it; otherwise the heading stub would
    win and the content would be discarded (and the stub itself removed by
    a later empty-section pass).

    Args:
        sections: Sections in document order.

    Returns:
        The input unchanged when it is empty, otherwise a new list.
    """
    if not sections:
        return sections

    result = [sections[0]]
    # Normalize each heading once instead of re-normalizing result[-1] on
    # every comparison.
    kept_key = _normalize_heading_text(sections[0].heading)
    for section in sections[1:]:
        key = _normalize_heading_text(section.heading)
        if key != kept_key:
            result.append(section)
            kept_key = key
        elif not result[-1].blocks and section.blocks:
            # Same heading, but the kept copy is an empty stub: prefer the
            # copy that actually carries content.
            result[-1] = section
    return result
292
+
293
+
294
def drop_empty_sections(sections: list[Section]) -> list[Section]:
    """
    Return a new list containing only the sections that have at least one
    content block.

    Unlike the end-anchored passes, this filter applies to every section in
    the document. A section with no blocks is a bare heading and holds no
    prose or media, so removing it loses no content.
    """
    populated: list[Section] = []
    for section in sections:
        if len(section.blocks) > 0:
            populated.append(section)
    return populated
307
+
308
+
309
def _is_placeholder_paragraph(block) -> bool:
    """
    Report whether *block* looks like a degradation placeholder.

    True only for a Paragraph holding exactly one inline, where that inline
    is a Text node whose text starts with '[' and ends with ']' (for
    example '[Form omitted]'). The check is purely on the brackets; the
    text between them is not inspected.
    """
    from decant.core.model import Paragraph, Text as TextInline

    if not isinstance(block, Paragraph) or len(block.inlines) != 1:
        return False
    sole_inline = block.inlines[0]
    if not isinstance(sole_inline, TextInline):
        return False
    token = sole_inline.text
    return token.startswith("[") and token.endswith("]")
327
+
328
+
329
def collapse_consecutive_placeholder_blocks(sections: list[Section]) -> list[Section]:
    """
    Reduce each run of adjacent, identical placeholder paragraphs to one.

    Within every section, a placeholder paragraph is dropped when the block
    kept immediately before it is also a placeholder with exactly the same
    text (so six consecutive '[Form omitted]' blocks become one).
    Non-placeholder blocks, and placeholders with differing text, are kept.

    Always returns a new list of newly built Section objects.
    """

    def _dedupe(blocks) -> list:
        kept: list = []
        for blk in blocks:
            previous = kept[-1] if kept else None
            is_repeat = (
                previous is not None
                and _is_placeholder_paragraph(blk)
                and _is_placeholder_paragraph(previous)
                and previous.inlines[0].text == blk.inlines[0].text
            )
            if not is_repeat:
                kept.append(blk)
        return kept

    return [
        Section(heading=section.heading, blocks=_dedupe(section.blocks))
        for section in sections
    ]
355
+
356
+
357
# Word count a single paragraph must reach to count as article prose.
_ARTICLE_BODY_MIN_WORDS = 20


def _has_article_body(sections: list[Section]) -> bool:
    """
    Tell whether any section contains a real prose paragraph.

    A block qualifies when it is a Paragraph, is not a placeholder
    (see _is_placeholder_paragraph), and its plain text splits into at
    least _ARTICLE_BODY_MIN_WORDS whitespace-separated words. Headings and
    non-paragraph blocks (lists, images, ...) are never counted.

    Returns True as soon as one qualifying paragraph is found, else False.
    """
    from decant.core.model import Paragraph as ParagraphModel

    def _is_prose(block) -> bool:
        if not isinstance(block, ParagraphModel):
            return False
        if _is_placeholder_paragraph(block):
            return False
        words = "".join(_inline_to_text(node) for node in block.inlines).split()
        return len(words) >= _ARTICLE_BODY_MIN_WORDS

    return any(
        _is_prose(block)
        for section in sections
        for block in section.blocks
    )
383
+
384
+
385
+ # Minimum prose characters (after stripping tags) to accept a headingless
386
+ # Trafilatura extraction instead of falling back to the full original HTML.
387
+ # Both guardian (32 K) and theringer (18 K) clear this by a wide margin;
388
+ # a genuine empty/navigation-only extraction is well below it.
389
+ _MIN_PROSE_CHARS = 2000
390
+
391
+
392
+ def _extract_title_string(html: str) -> str:
393
+ """
394
+ Extract and normalise the text content of the <title> tag from a raw
395
+ HTML string. Returns an empty string if no <title> is found.
396
+
397
+ Used to supply a synthetic heading when Trafilatura produces headingless
398
+ prose output (see extract_with_trafilatura).
399
+ """
400
+ m = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
401
+ if not m:
402
+ return ""
403
+ return re.sub(r'\s+', ' ', m.group(1)).strip()
404
+
405
+
406
+ def _strip_title_branding(title: str | None) -> str | None:
407
+ """
408
+ Strip site-branding suffix from a page title string.
409
+
410
+ Tries delimiters in priority order: ' | ', ' - ', ' -- '.
411
+ For the first delimiter found, splits on its last (rightmost) occurrence
412
+ and returns the left part, whitespace-stripped.
413
+ Returns None for None/empty input; returns the original stripped string
414
+ if no delimiter matches.
415
+ """
416
+ if not title:
417
+ return None
418
+ for delim in (" | ", " - ", " -- "):
419
+ if delim in title:
420
+ return title.rsplit(delim, 1)[0].strip()
421
+ return title.strip()
422
+
423
+
424
+ def _first_heading_text(html_string: str) -> str | None:
425
+ """
426
+ Return the plain text of the first h1-h6 element in an HTML string, or
427
+ None if no heading is found or the text is empty/whitespace-only.
428
+
429
+ Used by the H1-injection guard to compare the candidate synthetic title
430
+ against the first real heading already present in Trafilatura output.
431
+ """
432
+ m = re.search(r'<h[1-6][^>]*>(.*?)</h[1-6]>', html_string, re.IGNORECASE | re.DOTALL)
433
+ if not m:
434
+ return None
435
+ text = re.sub(r'<[^>]+>', '', m.group(1)).strip()
436
+ return text if text else None
437
+
438
+
439
def extract_with_trafilatura(
    html: str,
    extraction_mode: ExtractionMode = "baseline",
    original_title: str | None = None,
) -> str:
    """
    Extract main content from HTML using Trafilatura.

    Used in extract mode (real-world pages with boilerplate).
    Returns extracted HTML if Trafilatura succeeds. Falls back to the
    original HTML only when extraction genuinely fails (None/empty output,
    or headingless output that is too short).

    Acceptance rules (in order):
    1. Extracted content contains an <h1> -> returned after ad clean-up.
    2. Extracted content contains only h2-h6 headings -> a synthetic <h1>
       built from the page title is prepended, unless the title equals the
       first existing heading (compared via _normalize_str on
       entity-decoded text), in which case nothing is injected.
    3. Extracted content is headingless but has at least _MIN_PROSE_CHARS
       characters after stripping tags -> a synthetic <h1> is prepended so
       that later stages which require a heading can process the content.
    4. Otherwise -> the original ``html`` is returned unchanged.

    In cases 1-3, paragraphs consisting only of the word "Advertisement"
    are removed from the extracted HTML.

    The synthetic heading text is sourced from:
    - ``original_title`` if supplied by the caller, else
    - the raw ``<title>`` element found in ``html``, else
    - the literal string "Article".
    Site branding is removed with _strip_title_branding(). The text is
    entity-decoded and then HTML-escaped before being placed inside
    ``<h1>``, so a title containing ``<`` or ``&`` cannot inject markup and
    an already-escaped title is not double-escaped.

    Known limitations in extract mode (carried over from the original
    notes, not verifiable from this function):
    - <pre> code blocks are converted to <blockquote>
    - Some inline spacing may be lost around inline elements

    Args:
        html: Raw HTML string.
        extraction_mode: Trafilatura parameter set (default "baseline").
            "baseline"  - favor_precision=True, no_fallback=False.
            "precision" - favor_precision=True, no_fallback=True.
            "recall"    - favor_precision=False, favor_recall=True,
                          no_fallback=False.
            Any other value is treated as "baseline".
        original_title: Optional page title string for the synthetic <h1>.
            When omitted, the title is read from ``html``.

    Returns:
        Extracted HTML string (possibly with an injected <h1>), or the
        original HTML if extraction failed.
    """
    # The ``html`` parameter shadows the stdlib module of the same name, so
    # its helpers are imported under explicit local aliases.
    from html import escape as _escape_html, unescape as _unescape_html

    # Trafilatura kwargs per extraction mode. The "baseline" values must stay
    # identical to the historical hardcoded call.
    if extraction_mode == "precision":
        traf_kwargs: dict = dict(favor_precision=True, no_fallback=True)
    elif extraction_mode == "recall":
        traf_kwargs = dict(favor_precision=False, favor_recall=True, no_fallback=False)
    else:
        traf_kwargs = dict(favor_precision=True, no_fallback=False)

    extracted = trafilatura.extract(
        html,
        output_format="html",
        include_formatting=True,
        include_links=True,
        include_images=True,
        include_comments=False,
        include_tables=False,
        **traf_kwargs,
    )

    if not extracted:
        return html

    has_h1 = "<h1" in extracted
    has_any_heading = has_h1 or any(f"<h{i}" in extracted for i in range(2, 7))

    if not has_any_heading:
        # Headingless output is only trusted when it carries enough prose.
        prose_chars = len(re.sub(r'<[^>]+>', '', extracted).strip())
        if prose_chars < _MIN_PROSE_CHARS:
            return html

    cleaned = re.sub(r'<p>\s*Advertisement\s*</p>', '', extracted)
    if has_h1:
        return cleaned

    raw_title = _strip_title_branding(original_title or _extract_title_string(html))
    # Decode first so that titles taken from markup ("A &amp; B") and plain
    # text titles ("A & B") end up in the same form.
    title_text = _unescape_html(raw_title).strip() if raw_title else ""

    if has_any_heading and title_text:
        # Guard: skip injection if the title duplicates the first real heading.
        first_h = _first_heading_text(cleaned)
        if first_h is not None and (
            _normalize_str(title_text) == _normalize_str(_unescape_html(first_h))
        ):
            return cleaned

    heading_text = _escape_html(title_text, quote=False) if title_text else "Article"
    return f"<h1>{heading_text}</h1>\n" + cleaned
539
+
540
+
541
def parse(html: str, original_title=None, require_article_body: bool = False,
          caption_map: dict[str, str] | None = None,
          source_url: str = "") -> Document:
    """
    Turn an HTML string into a Document model.

    Stages, in order: sanitize -> rewrite <graphic> as <img> -> parse DOM ->
    select main content -> validate structure -> preflight scope check ->
    build sections -> section clean-up passes -> optional article-body
    guard -> title extraction.

    In transform mode, call parse() directly with raw HTML.
    In extract mode, call extract_with_trafilatura() first, then parse()
    with the original_title captured before extraction.

    Args:
        html: Raw HTML string (pre-extracted if in extract mode).
        original_title: Optional <title> captured before extraction;
                        forwarded to extract_title(), which prefers it
                        over any <title> in the parsed DOM.
        require_article_body: If True, raise ValidationError when
                        _has_article_body() finds no qualifying prose
                        paragraph after clean-up.
        caption_map: Optional img src -> caption map, forwarded to
                        build_sections().
        source_url: Stored on the returned Document as ``source_url``.

    Returns:
        Document with title, sections and source_url.

    Raises:
        ValidationError: From validate_structure(), from
            preflight_scope_check(), or from the article-body guard.
    """
    markup = sanitize(html)

    # lxml does not treat trafilatura's <graphic> as a void element and
    # would nest the following content inside it, so rewrite it to <img>
    # before parsing.
    markup = re.sub(
        r'<graphic\b([^>]*)(?:/>|>\s*</graphic>|>)',
        r'<img\1>',
        markup,
    )

    soup = BeautifulSoup(markup, "lxml")
    content = select_main_content(soup)

    # Both checks raise ValidationError on failure.
    validate_structure(content)
    preflight_scope_check(content)

    sections = build_sections(content, caption_map=caption_map)

    # Clean-up passes; each takes and returns the section list. The order
    # is significant and matches the numbered pipeline steps 5.5 - 5.85.
    cleanup_passes = (
        trim_trailing_boilerplate,
        trim_trailing_noise,
        drop_trailing_orphan_section,
        collapse_consecutive_placeholder_blocks,
        drop_duplicate_consecutive_sections,
        drop_empty_sections,
    )
    for cleanup in cleanup_passes:
        sections = cleanup(sections)

    # Extract-mode guard: the result must contain real article prose.
    if require_article_body and not _has_article_body(sections):
        raise ValidationError(
            "No article body detected: document contains no paragraph with "
            "20 or more words of prose text. Extraction may have captured "
            "navigation or boilerplate instead of article content."
        )

    return Document(
        title=extract_title(soup, content, original_title=original_title),
        sections=sections,
        source_url=source_url,
    )
623
+
624
+
625
def extract_title(soup: BeautifulSoup, content: Tag, original_title=None) -> str:
    """
    Extract the document title from <title> or the first <h1>.

    Resolution order:
    - ``original_title`` if it is truthy, else the first <title> in ``soup``;
      when either exists its text is used (even if that text is empty).
    - Else the text of the first <h1> in ``content``.
    - Else the empty string.

    Whitespace runs in the chosen text are collapsed to single spaces and
    the ends are stripped.

    Args:
        soup: Full DOM tree (for <title> access).
        content: Selected content subtree.
        original_title: Optional title captured before extraction removed
                        <head>. May be a BeautifulSoup Tag or a plain
                        string; a string is used directly as the title text.

    Returns:
        Title string (may be empty).
    """
    title_source = original_title or soup.find("title")
    if title_source:
        # Plain strings have no get_text(); accept them as already-extracted
        # title text so callers holding a str title do not crash here.
        if isinstance(title_source, str):
            raw_title = title_source
        else:
            raw_title = title_source.get_text()
        return re.sub(r'\s+', ' ', raw_title).strip()

    # Fall back to the first <h1> in the selected content.
    h1 = content.find("h1")
    if h1:
        return re.sub(r'\s+', ' ', h1.get_text()).strip()

    return ""
654
+
655
+
656
def validate_structure(content: Tag) -> None:
    """
    Check that the content has the minimum structure the parser needs.

    Requirements:
    - at least one h1, h2 or h3 element, and
    - at least one p, ul or ol element.

    Args:
        content: Selected content subtree.

    Raises:
        ValidationError: If either requirement is not met.
    """
    first_heading = content.find(["h1", "h2", "h3"])
    first_body_element = content.find(["p", "ul", "ol"])

    if first_heading and first_body_element:
        return

    raise ValidationError(
        "Input HTML lacks semantic structure "
        "(requires at least one h1-h3 and body content in p/ul/ol)."
    )
678
+
679
+
680
def preflight_scope_check(content: Tag) -> None:
    """
    Reject pages dominated by forms, tables or links before the model is built.

    Word counts are whitespace-split counts of element text: ``p`` words
    are summed over all <p> elements, link words over all <a> elements.
    The checks run in this order and the first hit raises:

    1. Form/tool page: at least one <form> and fewer than 250 <p> words.
    2. Table/reference page: at least two <table>s and fewer than 250
       <p> words.
    3. Navigation/reference page: more than 0 and fewer than 500 <p> words,
       with link words exceeding half of the <p> words.

    Per the original calibration note, thresholds were tuned on the eval20
    corpus (lowest accepted <p> word count 241; highest accepted link/p
    ratio 0.18).

    Raises:
        ValidationError: With an "Out of scope: ..." message naming the
            signature that matched.
    """

    def _word_total(tag_name: str) -> int:
        return sum(
            len(node.get_text().split())
            for node in content.find_all(tag_name)
        )

    tables = len(content.find_all("table"))
    forms = len(content.find_all("form"))
    prose_words = _word_total("p")
    anchor_words = _word_total("a")

    sparse_prose = prose_words < 250

    if forms >= 1 and sparse_prose:
        raise ValidationError("Out of scope: tool/form page.")

    if tables >= 2 and sparse_prose:
        raise ValidationError("Out of scope: table/reference page.")

    if 0 < prose_words < 500 and (anchor_words / prose_words) > 0.5:
        raise ValidationError("Out of scope: navigation/reference page.")
719
+
720
+
721
def build_sections(content: Tag, caption_map: dict[str, str] | None = None) -> list[Section]:
    """
    Build sections from the headings in content.

    Rules:
    - Content before the first heading is dropped.
    - Each heading (h1-h6, any level) starts a new section.
    - A section runs until the next heading.
    - A heading whose text is empty/whitespace produces no section; the
      blocks collected under it are discarded with it.

    Uses a flatten-then-assign approach: headings and parseable block
    elements are collected in document order, then each block is attached
    to the most recent heading. Elements nested inside an already collected
    heading or block are skipped, which avoids duplicated content when
    headings sit at different DOM depths (e.g. h2 -> div -> h3).

    Each block element is parsed exactly once, during the flatten pass, and
    the parsed result is carried into the assign pass.

    Args:
        content: Selected content subtree.
        caption_map: Optional img src -> caption map, forwarded to
                     parse_block().

    Returns:
        List of Section objects; empty when content has no headings.
    """
    heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
    headings = content.find_all(heading_tags)

    if not headings:
        # Nothing to anchor sections on.
        return []

    # Identity, not equality: distinct tags with equal markup must stay distinct.
    heading_ids = {id(h) for h in headings}

    # Flatten: ("heading", Tag) or ("block", parsed Block) in document order.
    ordered: list[tuple[str, object]] = []
    collected_ids: set[int] = set()

    for elem in content.descendants:
        if not isinstance(elem, Tag):
            continue

        # Skip anything inside an element that was already collected.
        if any(id(ancestor) in collected_ids for ancestor in elem.parents):
            continue

        if id(elem) in heading_ids:
            ordered.append(("heading", elem))
            collected_ids.add(id(elem))
        elif elem.name not in heading_tags:
            block = parse_block(elem, caption_map=caption_map)
            if block:
                ordered.append(("block", block))
                collected_ids.add(id(elem))

    # Assign: each block goes to the most recent heading.
    sections: list[Section] = []
    current_heading: Heading | None = None
    current_blocks: list[Block] = []

    for kind, item in ordered:
        if kind == "heading":
            # Flush the section that just ended, if it has a usable heading.
            if current_heading is not None and _heading_has_text(current_heading):
                sections.append(Section(heading=current_heading, blocks=current_blocks))
            current_heading = parse_heading(item)
            current_blocks = []
        elif current_heading is not None:
            current_blocks.append(item)

    # Flush the last section.
    if current_heading is not None and _heading_has_text(current_heading):
        sections.append(Section(heading=current_heading, blocks=current_blocks))

    return sections
797
+
798
+
799
+ def _heading_has_text(heading: "Heading") -> bool:
800
+ """Return True if the heading contains at least one non-whitespace character."""
801
+ return bool("".join(_inline_to_text(il) for il in heading.inlines).strip())
802
+
803
+
804
def parse_heading(element: Tag) -> Heading:
    """
    Convert an h1-h6 element into a Heading model.

    The level is read from the digit in the tag name (second character);
    the inline content comes from parse_inlines().

    Args:
        element: BeautifulSoup Tag for h1-h6.

    Returns:
        Heading with level and inline content.
    """
    depth = int(element.name[1])
    return Heading(level=depth, inlines=parse_inlines(element))
817
+
818
+
819
+ def _listblock_has_content(block: "ListBlock") -> bool:
820
+ """Return True if the list has at least one item with non-empty content.
821
+
822
+ Filters two structural artifact cases:
823
+ - Zero items: <ol></ol> or <ul></ul> (no <li> elements at all)
824
+ - All-empty items: every <li> has no inline text and no nested lists
825
+ (e.g. <ul><li></li><li></li></ul>)
826
+ Both are unambiguously extraction artifacts; legitimate lists always have
827
+ at least one item with visible content.
828
+ """
829
+ return bool(block.items) and any(
830
+ item.inlines or item.children for item in block.items
831
+ )
832
+
833
+
834
def parse_figure(element: Tag, caption_map: dict[str, str] | None = None) -> Block | None:
    """
    Parse a <figure> element into a Block.

    - No <img>/<graphic> inside: returns a Paragraph holding the figcaption
      text, or None when there is no caption text either.
    - Otherwise the first image is passed to degrade_image(). If that
      yields an Image, its caption is set from the figcaption text, or,
      when the figure has no figcaption text, from ``caption_map`` keyed by
      the image's src. Any other result (placeholder text) is wrapped in a
      Paragraph.

    Args:
        element: BeautifulSoup Tag for <figure>.
        caption_map: Optional img src -> caption text map, used only as a
                     fallback when the figure itself has no caption text.

    Returns:
        Image, Paragraph, or None.
    """
    imgs = element.find_all(["img", "graphic"])
    figcaption = element.find("figcaption")
    caption_text = ""
    if figcaption:
        caption_text = figcaption.get_text(separator=" ", strip=True)

    if not imgs:
        if caption_text:
            return Paragraph(inlines=[Text(text=caption_text)])
        return None

    result = degrade_image(imgs[0])
    if not isinstance(result, Image):
        # Placeholder Text — wrap in Paragraph
        return Paragraph(inlines=[result])

    # Prefer the figure's own caption; fall back to the pre-harvested map so
    # the caption_map argument is honoured here as it is for bare images.
    caption = caption_text or (caption_map or {}).get(result.src, "")
    if caption:
        result.caption = caption
    return result
859
+
860
+
861
def parse_block(element: Tag, caption_map: dict[str, str] | None = None) -> Block | None:
    """
    Convert one block-level element into its Block model.

    Dispatches on the tag name. Paragraphs without inline content and lists
    without content are dropped (None). Elements with an unrecognized tag
    name also yield None (silently skipped in v1).
    v2: Add warning logging for skipped elements.

    Args:
        element: BeautifulSoup Tag for block element
        caption_map: Optional map of img src -> caption text (extract mode)

    Returns:
        Block model object or None if unsupported or empty
    """
    kind = element.name

    if kind == "p":
        paragraph = parse_paragraph(element)
        return paragraph if paragraph.inlines else None

    if kind in ("ul", "ol"):
        listing = parse_list(element)
        return listing if _listblock_has_content(listing) else None

    if kind == "blockquote":
        return parse_quote(element)

    if kind == "pre":
        return parse_preformatted(element)

    if kind == "table":
        return degrade_table(element)

    if kind == "figure":
        return parse_figure(element, caption_map=caption_map)

    if kind in ("img", "graphic"):
        degraded = degrade_image(element)
        if not isinstance(degraded, Image):
            # Placeholder Text — wrap in Paragraph
            return Paragraph(inlines=[degraded])
        if caption_map and degraded.src in caption_map:
            degraded.caption = caption_map[degraded.src]
        return degraded

    if kind in ("form", "input", "textarea", "select", "button"):
        return degrade_form(element)

    if kind == "hr":
        return degrade_hr()

    # Unknown block - skip silently in v1
    return None
906
+
907
+
908
def parse_paragraph(element: Tag) -> Paragraph:
    """
    Build a Paragraph model from a <p> element.

    Args:
        element: BeautifulSoup Tag for <p>

    Returns:
        Paragraph holding the element's parsed inline content
    """
    return Paragraph(inlines=parse_inlines(element))
920
+
921
+
922
def parse_list(element: Tag) -> ListBlock:
    """
    Build a ListBlock model from a <ul> or <ol> element.

    Only direct <li> children become items. Lists nested directly inside an
    item are parsed recursively and stored in ListItem.children.

    Args:
        element: BeautifulSoup Tag for <ul> or <ol>

    Returns:
        ListBlock with items
    """
    list_tags = ["ul", "ol"]

    def _to_item(li: Tag) -> ListItem:
        # Work on a detached copy so that removing the nested lists before
        # inline parsing leaves the original tree intact.
        stripped = li.__copy__()
        for sublist in stripped.find_all(list_tags, recursive=False):
            sublist.decompose()
        text_inlines = parse_inlines(stripped)

        # Nested lists are read from the untouched original item.
        sublists = [
            parse_list(sublist)
            for sublist in li.find_all(list_tags, recursive=False)
        ]
        return ListItem(inlines=text_inlines, children=sublists)

    is_ordered = element.name == "ol"
    entries = [_to_item(li) for li in element.find_all("li", recursive=False)]
    return ListBlock(ordered=is_ordered, items=entries)
952
+
953
+
954
def parse_quote(element: Tag) -> Quote:
    """
    Parse blockquote to Quote model (recursive).

    Each direct Tag child is parsed with parse_block and kept when it yields
    a block. If none of the children produce a block (for example the quote
    holds bare text or only inline markup), the element's inline content is
    parsed instead and wrapped in a single Paragraph so the quoted text is
    not lost.

    Note: loose text that sits next to recognized block children is still
    not captured; the fallback only applies when no block was produced.

    Args:
        element: BeautifulSoup Tag for <blockquote>

    Returns:
        Quote containing blocks (possibly empty when there is no content)
    """
    blocks = []
    for child in element.children:
        if isinstance(child, Tag):
            block = parse_block(child)
            if block:
                blocks.append(block)

    if not blocks:
        # No block-level children were recognized: treat the content as one
        # paragraph of inline content rather than returning an empty quote.
        inlines = parse_inlines(element)
        if inlines:
            blocks.append(Paragraph(inlines=inlines))

    return Quote(blocks=blocks)
972
+
973
+
974
def parse_preformatted(element: Tag) -> Preformatted:
    """
    Build a Preformatted model from a <pre> element.

    The element's text is taken as-is: no stripping or whitespace
    collapsing is applied here.

    Args:
        element: BeautifulSoup Tag for <pre>

    Returns:
        Preformatted with verbatim text
    """
    verbatim = element.get_text()
    return Preformatted(text=verbatim)
987
+
988
+
989
def collapse_whitespace(text: str) -> str:
    """
    Reduce every run of whitespace in *text* to a single space.

    If the input starts (or ends) with whitespace, the result starts (or
    ends) with exactly one space, so word boundaries next to neighbouring
    inline nodes survive (decisions.md section 8). Empty and
    whitespace-only input both yield an empty string.
    """
    if not text:
        return ""

    core = re.sub(r"\s+", " ", text.strip())
    if not core:
        # Whitespace-only input: nothing to keep.
        return ""

    prefix = " " if text[0].isspace() else ""
    suffix = " " if text[-1].isspace() else ""
    return f"{prefix}{core}{suffix}"
1005
+
1006
+
1007
def parse_inlines(element: Tag) -> list[Inline]:
    """
    Turn the direct children of *element* into a flat list of inlines.

    Text nodes are whitespace-normalized with collapse_whitespace and
    dropped when nothing remains. Tag children are delegated to
    parse_inline_element, which recurses for nested inline markup; a list
    result is spliced in, a falsy result is dropped.

    Args:
        element: BeautifulSoup Tag containing inline content

    Returns:
        List of Inline model objects
    """
    collected = []

    for node in element.children:
        if isinstance(node, NavigableString):
            normalized = collapse_whitespace(str(node))
            if normalized:
                collected.append(Text(text=normalized))
            continue

        if not isinstance(node, Tag):
            continue

        parsed = parse_inline_element(node)
        if isinstance(parsed, list):
            collected.extend(parsed)
        elif parsed:
            collected.append(parsed)

    return collected
1035
+
1036
+
1037
def parse_inline_element(element: Tag) -> Inline | list[Inline] | None:
    """
    Parse single inline element.

    Recognized tags map to their model: br -> LineBreak, em/i -> Emphasis,
    strong/b -> Strong, code -> Code, a -> Link. Images are block-level, so
    in inline context an image is reduced to its alt text (or dropped when
    it has none); a non-Image result from degrade_image is returned as-is.

    Any other element is flattened to its text. That text is normalized with
    collapse_whitespace, the same way text nodes are in parse_inlines, so a
    leading or trailing space inside the element is kept as a single space.
    Stripping it instead would glue the element's text to its neighbours
    (e.g. "Hello <span>big </span>world" -> "Hello bigworld").

    Args:
        element: BeautifulSoup Tag for inline element

    Returns:
        Inline model object(s) or None when the element yields no content
    """
    tag_name = element.name

    if tag_name == "br":
        return LineBreak()
    elif tag_name in ("em", "i"):
        return Emphasis(children=parse_inlines(element))
    elif tag_name in ("strong", "b"):
        return Strong(children=parse_inlines(element))
    elif tag_name == "code":
        return Code(text=element.get_text())
    elif tag_name == "a":
        href = element.get("href", "")
        return Link(href=href, children=parse_inlines(element))
    elif tag_name in ("img", "graphic"):
        result = degrade_image(element)
        if isinstance(result, Image):
            # Image is block-level; in inline context, use alt text
            return Text(text=result.alt) if result.alt else None
        return result
    else:
        # Unknown inline element - extract text, collapsing whitespace runs
        # while preserving a single boundary space on either side.
        text = collapse_whitespace(element.get_text())
        if text:
            return Text(text=text)
        return None