web2textpy 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
web2text.py ADDED
@@ -0,0 +1,931 @@
1
+ """web2text.py — Python port of Web2Text alignment-based labeling pipeline.
2
+
3
+ Given paired (raw_html, clean_text), aligns clean text back onto DOM nodes
4
+ and labels each as content or boilerplate.
5
+
6
+ Based on: Vogels et al., "Web2Text: Deep Structured Boilerplate Removal" (ECIR 2018)
7
+ Original Scala: https://github.com/dalab/web2text
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ import unicodedata
14
+ from collections import Counter, defaultdict
15
+ from typing import NamedTuple
16
+
17
+ from lxml import etree
18
+ from lxml.html import document_fromstring
19
+
20
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Placeholder written into alignment output at source positions that have no
# counterpart in the clean text (i.e. boilerplate characters).
GAPCHAR = "\u25a1"  # □

# Elements whose subtrees never carry visible page content; removed outright
# during CDOM construction (tails are preserved).
SKIP_TAGS = frozenset({
    "script", "style", "head", "noscript", "iframe", "img", "input",
    "br", "hr", "meta", "title", "video", "select", "textarea",
    "link", "object", "embed", "applet", "param", "svg",
})

# Block-level HTML elements.  NOTE(review): not referenced elsewhere in this
# module — presumably kept for downstream feature extraction; confirm.
BLOCK_TAGS = frozenset({
    "address", "article", "aside", "blockquote", "body", "center",
    "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure",
    "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header",
    "li", "main", "nav", "ol", "p", "pre", "section", "table",
    "tbody", "td", "tfoot", "th", "thead", "tr", "ul",
})

# Characters treated as equivalent in alignment (apostrophes, quotes, ?)
_QUOTE_CHARS = frozenset("?'\u2018\u2019\u201a\u201b`\u0027")

# ---------------------------------------------------------------------------
# Text normalization
# ---------------------------------------------------------------------------

# One-or-more whitespace characters (collapsed to a single space).
_WS_RE = re.compile(r"\s+")
# XML-invalid C0 control characters (plus DEL); stripped from text/attributes.
_CTRL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")
# Word-internal ? acting as apostrophe: letter?letter (e.g., it?s, don?t)
_QUESTION_APOS_RE = re.compile(r"(?<=\w)\?(?=\w)")
51
+
52
+
53
+ def _fix_mojibake(s: str) -> str:
54
+ """Fix CP1252 mojibake: text that was UTF-8 but decoded as CP1252.
55
+
56
+ Tries to re-encode as CP1252 and decode as UTF-8. If the result is
57
+ shorter (mojibake expands chars), it's a genuine fix.
58
+ """
59
+ try:
60
+ fixed = s.encode("cp1252").decode("utf-8")
61
+ if len(fixed) < len(s): # mojibake always inflates length
62
+ return fixed
63
+ except (UnicodeDecodeError, UnicodeEncodeError):
64
+ pass
65
+ return s
66
+
67
+
68
def normalize_text(s: str) -> str:
    """NFC-normalize, fix mojibake, collapse whitespace, replace NBSP, strip."""
    # Repair encoding damage first, then canonicalize composed characters.
    canonical = unicodedata.normalize("NFC", _fix_mojibake(s))
    # NBSP becomes a plain space so the whitespace collapse treats it uniformly.
    spaced = canonical.replace("\u00a0", " ")
    return _WS_RE.sub(" ", spaced).strip()
74
+
75
+
76
def _normalize_for_eval(s: str) -> str:
    """Extra normalization applied only during evaluation.

    Maps smart quotes to their ASCII equivalents and rewrites word-internal
    ``?`` used as an apostrophe, so that ground-truth quirks don't penalize
    correct extraction.
    """
    # Smart single/double quotes -> ASCII.
    replacements = (
        ("\u2018", "'"),
        ("\u2019", "'"),
        ("\u201c", '"'),
        ("\u201d", '"'),
    )
    for fancy, plain in replacements:
        s = s.replace(fancy, plain)
    return _QUESTION_APOS_RE.sub("'", s)
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # CDOM construction
90
+ # ---------------------------------------------------------------------------
91
+
92
def _remove_preserving_tail(el: etree._Element) -> None:
    """Remove *el* from its parent without losing its tail text.

    lxml attaches the text that follows an element to that element's
    ``.tail``, so plain removal would drop it.  The tail is re-attached to
    the preceding sibling's tail, or to the parent's leading text when
    *el* is the first child.  Root elements (no parent) are left alone.
    """
    parent = el.getparent()
    if parent is None:
        return
    if el.tail:
        sibling = el.getprevious()
        if sibling is None:
            # First child: the tail belongs right after parent's own text.
            parent.text = (parent.text or "") + el.tail
        else:
            sibling.tail = (sibling.tail or "") + el.tail
    parent.remove(el)
105
+
106
+
107
# Leading <?xml ...?> declaration; lxml.html rejects unicode input that keeps it.
_XML_DECL_RE = re.compile(r"<\?xml[^\n]*(?:\?>|\n|$)", re.IGNORECASE)
108
+
109
+
110
def build_cdom(html_str: str) -> etree._Element:
    """Parse HTML and build a Collapsed DOM tree.

    Removes non-content elements, empty nodes, and collapses single-child chains.

    Steps, in order:
      1. strip a leading XML declaration (lxml.html rejects it),
      2. strip XML-invalid control characters from text/tails/attributes,
      3. drop comments/PIs and ``SKIP_TAGS`` subtrees (tails preserved),
      4. blank out whitespace-only ``.text`` / ``.tail``,
      5. repeatedly delete empty leaf elements until a fixed point,
      6. collapse single-child chains bottom-up.

    Returns the <body> element, or the document root if no body exists.
    """
    # Strip XML declaration that lxml.html rejects
    html_str = _XML_DECL_RE.sub("", html_str, count=1)

    doc = document_fromstring(html_str)
    try:
        body = doc.body  # lxml raises IndexError when the document has no <body>
    except IndexError:
        body = None
    if body is None:
        body = doc

    # --- Strip XML-invalid control characters from all text/tail/attributes ---
    for el in doc.iter():
        if el.text:
            el.text = _CTRL_RE.sub("", el.text)
        if el.tail:
            el.tail = _CTRL_RE.sub("", el.tail)
        if isinstance(el.tag, str):  # skip comments/PIs, which have no attributes
            for attr, val in el.attrib.items():
                cleaned = _CTRL_RE.sub("", val)
                if cleaned != val:
                    el.attrib[attr] = cleaned

    # --- Remove comments ---
    for c in body.iter():
        if callable(c.tag):  # Comments, PIs
            _remove_preserving_tail(c)

    # --- Remove skip-tag elements ---
    # Materialize first: removing while iterating body.iter() would skip nodes.
    to_remove = [el for el in body.iter() if isinstance(el.tag, str) and el.tag in SKIP_TAGS]
    for el in to_remove:
        _remove_preserving_tail(el)

    # --- Remove empty text (set .text / .tail to None if whitespace-only) ---
    for el in body.iter():
        if not isinstance(el.tag, str):
            continue
        if el.text and not el.text.strip():
            el.text = None
        if el.tail and not el.tail.strip():
            el.tail = None

    # --- Remove empty leaf elements (bottom-up) ---
    # Fixed-point loop: deleting a leaf can make its parent empty, so keep
    # sweeping until one full pass removes nothing.
    changed = True
    while changed:
        changed = False
        for el in list(body.iter()):
            if not isinstance(el.tag, str):
                continue
            if el is body:
                continue
            if len(el) == 0 and not (el.text and el.text.strip()):
                _remove_preserving_tail(el)
                changed = True

    # --- Collapse single-child chains (bottom-up via post-order) ---
    def _collapse(el: etree._Element) -> None:
        # Post-order recursion so chains collapse from the leaves upward.
        for child in list(el):
            if isinstance(child.tag, str):
                _collapse(child)
        if len(el) == 1 and not (el.text and el.text.strip()):
            child = el[0]
            if not isinstance(child.tag, str):
                return
            # Merge child into parent: adopt child's children and text
            el.text = child.text
            # Move grandchildren up
            grandchildren = list(child)
            for gc in grandchildren:
                child.remove(gc)
                el.append(gc)
            # Preserve child.tail: append it to the last grandchild's tail,
            # or to el.text if there are no grandchildren
            if child.tail and child.tail.strip():
                if grandchildren:
                    last_gc = grandchildren[-1]
                    last_gc.tail = (last_gc.tail or "") + child.tail
                else:
                    el.text = (el.text or "") + child.tail
            el.remove(child)

    _collapse(body)

    return body
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Leaf extraction
203
+ # ---------------------------------------------------------------------------
204
+
205
def extract_leaves(tree: etree._Element) -> list[tuple[etree._Element, str]]:
    """Extract ordered text blocks from the CDOM tree.

    In lxml, text lives in two places: el.text (before first child) and
    child.tail (after a child element). Both must be captured, mirroring
    how Jsoup represents text as separate TextNode children.

    For leaf elements (no children), we capture el.text_content() as usual.
    For internal elements, we capture el.text as a separate text block,
    and each child's .tail as a separate text block.

    Side effects: mutates the tree — each captured block gets a
    ``data-leaf-id`` attribute, and text held in ``.text``/``.tail`` of
    internal elements is moved into synthetic ``<span data-synthetic="1">``
    wrappers so every text block owns exactly one element.

    Returns ``[(element, normalized_text), ...]`` in document order; the
    list index equals the element's ``data-leaf-id``.
    """
    leaves: list[tuple[etree._Element, str]] = []

    def _walk(el: etree._Element) -> None:
        if not isinstance(el.tag, str):
            return
        if el.get("data-synthetic"):
            return  # already processed
        if len(el) == 0:
            # Leaf element — capture all its text
            text = normalize_text(el.text_content())
            if text:
                el.set("data-leaf-id", str(len(leaves)))
                leaves.append((el, text))
        else:
            # Snapshot original children BEFORE any tree mutations
            original_children = list(el)

            # Internal element — capture .text (text before first child)
            if el.text and el.text.strip():
                text = normalize_text(el.text)
                if text:
                    # SubElement appends the span at the end; insert() below
                    # relocates it to the leading position.
                    span = etree.SubElement(el, "span")
                    span.set("data-synthetic", "1")
                    span.text = el.text
                    el.text = None
                    el.insert(0, span)
                    span.set("data-leaf-id", str(len(leaves)))
                    leaves.append((span, text))

            # Recurse into original children only
            for child in original_children:
                _walk(child)
                # Capture child.tail (text after this child, before next sibling)
                if child.tail and child.tail.strip():
                    tail_text = normalize_text(child.tail)
                    if tail_text:
                        span = etree.SubElement(el, "span")
                        span.set("data-synthetic", "1")
                        span.text = child.tail
                        child.tail = None
                        # Re-look-up the child's position: earlier synthetic
                        # spans may have shifted indices.
                        idx = list(el).index(child)
                        el.insert(idx + 1, span)
                        span.set("data-leaf-id", str(len(leaves)))
                        leaves.append((span, tail_text))

    _walk(tree)
    return leaves
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # Anchor-based alignment (port of Scala find1to1matches)
267
+ # ---------------------------------------------------------------------------
268
+
269
class Segment(NamedTuple):
    """A paired slice of the source string and the clean string.

    Ranges are half-open ``[start, end)``.  ``matched`` segments are
    definite anchor alignments; unmatched ("open") segments still need
    DP alignment.
    """

    src_start: int
    src_end: int
    cln_start: int
    cln_end: int
    matched: bool
275
+
276
+
277
def _find_anchors(source: str, cleaned: str, k: int = 10) -> list[Segment]:
    """Find unique k-char anchors shared between *source* and *cleaned*.

    Returns a list of Segments covering the full source/cleaned range.
    Matched segments are definite alignments; open segments need DP.

    An anchor is a k-character window of *cleaned* that occurs exactly once
    in *source* (and, as a safety check, exactly once in the
    whitespace/GAPCHAR-stripped source).  Each anchor is greedily extended
    left and right with loosely-equal characters, then recorded as a
    matched Segment; the gaps between anchors become open Segments.
    """
    n, m = len(source), len(cleaned)

    if n < k or m < k:
        return [Segment(0, n, 0, m, False)]

    # Build substring → [positions] map for source (skip substrings with GAPCHAR)
    source_map: dict[str, list[int]] = defaultdict(list)
    for i in range(n + 1 - k):
        sub = source[i : i + k]
        if GAPCHAR not in sub:
            source_map[sub].append(i)

    # Trimmed source maps for safety check (strip whitespace + GAPCHAR)
    def _trim_filter(c: str) -> bool:
        return not c.isspace() and c != GAPCHAR

    trimmed_source = "".join(c for c in source if _trim_filter(c))
    # Pre-compute occurrence counts for trimmed substrings of lengths 1..k
    # (trimmed_maps[L-1] holds counts for length-L substrings).
    trimmed_maps: list[dict[str, int]] = []
    for kk in range(1, k + 1):
        counts: dict[str, int] = defaultdict(int)
        for i in range(len(trimmed_source) + 1 - kk):
            counts[trimmed_source[i : i + kk]] += 1
        trimmed_maps.append(counts)

    def _equal_enough(c1: str, c2: str) -> bool:
        # Loose equality used when extending an anchor: any whitespace
        # matches any whitespace, comparison is case-insensitive, and
        # apostrophe-like characters are interchangeable.
        if c1.isspace() and c2.isspace():
            return True
        if c1.upper() == c2.upper():
            return True
        # Treat ? as equivalent to apostrophe variants (l3s-gn1 ground truth quirk)
        if c1 in _QUOTE_CHARS and c2 in _QUOTE_CHARS:
            return True
        return False

    segments: list[Segment] = []
    # Track last matched/open segment end positions
    last_src_start, last_src_end = 0, 0
    last_cln_start, last_cln_end = 0, 0
    last_is_init = True  # first sentinel

    i = 0
    while i < m + 1 - k:
        subs = cleaned[i : i + k]

        # Trimmed substring for safety check
        trimmed_subs = "".join(c for c in subs if _trim_filter(c))

        match_locs = source_map.get(subs, [])
        trimmed_count = (
            trimmed_maps[len(trimmed_subs) - 1].get(trimmed_subs, 0)
            if trimmed_subs
            else 0
        )

        # Accept only windows unique in BOTH the raw and trimmed source.
        if len(match_locs) == 1 and trimmed_count == 1:
            src_pos = match_locs[0]

            # Extend right
            extra_right = 0
            while (
                i + k + extra_right < m
                and src_pos + k + extra_right < n
                and _equal_enough(cleaned[i + k + extra_right], source[src_pos + k + extra_right])
            ):
                extra_right += 1

            # Extend left (never past the previous segment's end)
            extra_left = 0
            while (
                i - extra_left > 0
                and src_pos - extra_left > 0
                and src_pos - extra_left >= last_src_end + 1
                and _equal_enough(cleaned[i - 1 - extra_left], source[src_pos - 1 - extra_left])
            ):
                extra_left += 1

            if src_pos <= last_src_start and not last_is_init:
                # Collision: new match is before previous — discard last 2 segments
                # (the previous matched segment and the open segment before it)
                # and retry from the segment before those.
                if len(segments) >= 2:
                    segments.pop()
                    segments.pop()
                if segments:
                    prev = segments[-1]
                    last_src_start, last_src_end = prev.src_start, prev.src_end
                    last_cln_start, last_cln_end = prev.cln_start, prev.cln_end
                else:
                    last_src_start = last_src_end = 0
                    last_cln_start = last_cln_end = 0
                last_is_init = True
                i += 1

            elif src_pos < last_src_end:
                # Overlap — skip this anchor
                i += 1

            else:
                last_is_init = False
                # Shorten left extension if it overlaps with previous segment
                while src_pos - extra_left < last_src_end:
                    extra_left -= 1

                match_src_start = src_pos - extra_left
                match_src_end = src_pos + k + extra_right
                match_cln_start = i - extra_left
                match_cln_end = i + k + extra_right

                # Insert open segment before this match (if there's a gap)
                if match_src_start > last_src_end or match_cln_start > last_cln_end:
                    segments.append(
                        Segment(last_src_end, match_src_start, last_cln_end, match_cln_start, False)
                    )

                # Insert matched segment
                seg = Segment(match_src_start, match_src_end, match_cln_start, match_cln_end, True)
                segments.append(seg)
                last_src_start, last_src_end = seg.src_start, seg.src_end
                last_cln_start, last_cln_end = seg.cln_start, seg.cln_end

                i += k + extra_right
        else:
            i += 1

    # Append final open segment if source/cleaned has remaining chars
    if segments:
        last = segments[-1]
        if last.src_end < n or last.cln_end < m:
            segments.append(Segment(last.src_end, n, last.cln_end, m, False))
    else:
        segments.append(Segment(0, n, 0, m, False))

    return segments
415
+
416
+
417
# ---------------------------------------------------------------------------
# Dynamic-programming alignment (port of Scala dpalignment)
# ---------------------------------------------------------------------------

# Decision enum recorded in the DP backtracking table: emit a matched source
# character, skip a source character (writes a GAPCHAR), or skip a clean
# character (consumes clean text without output).
_MATCH, _SKIP_SRC, _SKIP_CLN = 0, 1, 2
423
+
424
+
425
def _dp_align(source: str, cleaned: str) -> str:
    """Align *cleaned* against *source* using DP with affine gap penalties.

    Returns a string of len(source) with GAPCHAR at non-content positions.

    Scoring: +3 for an exact alphanumeric match, +1 for a loose match
    (case-folded / whitespace / apostrophe variant), -6 to skip a
    non-whitespace clean char, -2 to open a source gap (continuing an
    existing gap is free — the affine part, tracked in G).
    """
    n, m = len(source), len(cleaned)

    if n == 0:
        return ""
    if m == 0:
        return GAPCHAR * n

    # Size guard — avoid quadratic blowup on huge unmatched segments.
    # For large segments, align in chunks rather than giving up entirely.
    if n * m > 10_000_000:
        # Chunk the source into manageable pieces and align each
        chunk_size = max(1, 10_000_000 // m)
        result_parts: list[str] = []
        cln_pos = 0
        for start in range(0, n, chunk_size):
            end = min(start + chunk_size, n)
            src_chunk = source[start:end]
            # Estimate how much clean text corresponds to this chunk
            # (1.5x overshoot gives the DP slack; unconsumed clean text
            # remains available to the next chunk).
            src_fraction = (end - start) / n
            overshoot_factor = 1.5
            cln_chunk_size = int(m * src_fraction * overshoot_factor)
            cln_end = min(cln_pos + cln_chunk_size, m)
            cln_chunk = cleaned[cln_pos:cln_end]
            chunk_result = _dp_align(src_chunk, cln_chunk)
            # Advance clean pointer by how many chars were consumed
            consumed = sum(1 for c in chunk_result if c != GAPCHAR)
            cln_pos += consumed
            result_parts.append(chunk_result)
        return "".join(result_parts)

    # Score matrices (2-row rolling for space efficiency)
    # S: scores; G: "currently inside a source gap" flag (affine gaps);
    # D: full (n+1)x(m+1) decision table kept for backtracking.
    S = [[0] * (m + 1) for _ in range(2)]
    G = [[True] * (m + 1) for _ in range(2)]
    D = [[_SKIP_SRC] * (m + 1) for _ in range(n + 1)]

    # Initialize first row
    for j in range(m + 1):
        S[0][j] = -j
        G[0][j] = True
        D[0][j] = _SKIP_CLN
    D[0][0] = -1  # sentinel

    for i in range(1, n + 1):
        ci = i % 2
        pi = (i - 1) % 2
        S[ci][0] = 0  # skipping leading source chars is free
        G[ci][0] = True

        for j in range(1, m + 1):
            sc = source[i - 1]
            cc = cleaned[j - 1]

            # SkipClean score (whitespace in clean text is free to skip)
            skip_cln_score = S[ci][j - 1] + (0 if cc.isspace() else -6)

            # SkipSource score (free if already inside a gap)
            skip_src_score = S[pi][j] + (0 if G[pi][j] else -2)

            if skip_cln_score > skip_src_score:
                best_score = skip_cln_score
                best_dec = _SKIP_CLN
            else:
                best_score = skip_src_score
                best_dec = _SKIP_SRC

            # Match (only if chars are compatible)
            chars_compatible = (
                sc.upper() == cc.upper()
                or (sc.isspace() and cc.isspace())
                or (sc in _QUOTE_CHARS and cc in _QUOTE_CHARS)
            )
            if chars_compatible:
                if sc.isalnum() and sc == cc:
                    match_score = S[pi][j - 1] + 3
                else:
                    match_score = S[pi][j - 1] + 1
                if match_score > best_score:
                    best_score = match_score
                    best_dec = _MATCH

            D[i][j] = best_dec
            S[ci][j] = best_score

            if best_dec == _MATCH:
                G[ci][j] = False
            elif best_dec == _SKIP_SRC:
                G[ci][j] = True
            else:  # _SKIP_CLN
                G[ci][j] = G[ci][j - 1]

    # Backtrack
    result: list[str] = []
    i, j = n, m
    while i > 0 or j > 0:
        d = D[i][j]
        if d == _MATCH:
            result.append(source[i - 1])
            i -= 1
            j -= 1
        elif d == _SKIP_CLN:
            j -= 1
        else:  # _SKIP_SRC
            result.append(GAPCHAR)
            i -= 1

    result.reverse()
    assert len(result) == n, f"DP output length {len(result)} != source length {n}"
    return "".join(result)
538
+
539
+
540
+ # ---------------------------------------------------------------------------
541
+ # Alignment orchestrator
542
+ # ---------------------------------------------------------------------------
543
+
544
def align(
    leaves: list[tuple[etree._Element, str]], clean_text: str
) -> dict[int, float]:
    """Align leaf texts against *clean_text* and return per-leaf content scores.

    Returns {leaf_index: fraction_of_chars_matched} (0.0–1.0).

    The leaf texts are concatenated (space-separated) into one "source"
    string, anchor-aligned against the normalized clean text, then DP is
    run on the unanchored gaps.  Each leaf's score is the fraction of its
    characters that survived alignment (non-GAPCHAR).
    """
    if not leaves or not clean_text.strip():
        return {i: 0.0 for i in range(len(leaves))}

    # Build source by concatenating leaf texts with space separators;
    # offsets[i] records leaf i's [start, end) slice of the source string.
    parts: list[str] = []
    offsets: list[tuple[int, int]] = []
    pos = 0
    for idx, (el, text) in enumerate(leaves):
        offsets.append((pos, pos + len(text)))
        parts.append(text)
        pos += len(text)
        if idx < len(leaves) - 1:
            parts.append(" ")
            pos += 1
    source = "".join(parts)

    cleaned = normalize_text(clean_text)
    if not cleaned:
        return {i: 0.0 for i in range(len(leaves))}

    # Phase 1: anchor matching
    segments = _find_anchors(source, cleaned, k=10)

    # Phase 2: DP on open segments, pass-through on matched segments
    aligned_parts: list[str] = []
    for seg in segments:
        src_slice = source[seg.src_start : seg.src_end]
        cln_slice = cleaned[seg.cln_start : seg.cln_end]
        if seg.matched:
            aligned_parts.append(cln_slice)
        else:
            aligned_parts.append(_dp_align(src_slice, cln_slice))
    aligned = "".join(aligned_parts)

    # Extract per-leaf scores
    scores: dict[int, float] = {}
    for i, (start, end) in enumerate(offsets):
        if end <= len(aligned):
            substr = aligned[start:end]
            n_matched = sum(1 for c in substr if c != GAPCHAR)
        else:
            # Alignment output shorter than expected — treat as unmatched.
            n_matched = 0
        n_total = end - start
        scores[i] = n_matched / n_total if n_total > 0 else 0.0

    # Fallback pass: leaves with score 0 that have substantial text might
    # have been missed due to ordering differences between DOM and clean text.
    # Try direct substring matching against the full clean text.
    cleaned_lower = cleaned.lower()
    for i, (el, text) in enumerate(leaves):
        if scores[i] > 0.0 or len(text) < 20:
            continue
        leaf_lower = text.lower()
        if leaf_lower in cleaned_lower:
            scores[i] = 1.0
        elif len(text) >= 50:
            # Try matching 50-char chunks to estimate content fraction
            chunk_size = 50
            matched_chars = 0
            for c in range(0, len(text) - chunk_size + 1, chunk_size):
                if text[c : c + chunk_size].lower() in cleaned_lower:
                    matched_chars += chunk_size
            scores[i] = matched_chars / len(text) if len(text) > 0 else 0.0

    return scores
616
+
617
+
618
+ # ---------------------------------------------------------------------------
619
+ # Labeling
620
+ # ---------------------------------------------------------------------------
621
+
622
def label_nodes(
    tree: etree._Element,
    scores: dict[int, float],
    threshold: float = 0.667,
) -> etree._Element:
    """Label CDOM nodes as content or boilerplate based on alignment scores.

    Leaves (elements carrying ``data-leaf-id``) are labeled by comparing
    their alignment score against *threshold*; 0.667 matches the original
    Scala's 2/3 rule.  Labels then propagate upward: an internal node is
    "content" iff at least one of its children is.

    Args:
        tree: CDOM root produced by build_cdom()/extract_leaves().
        scores: mapping of leaf index (data-leaf-id) to matched fraction.
        threshold: minimum score (exclusive) for a leaf to be content.

    Returns the same *tree*, mutated in place (data-label set on every
    element with a string tag).
    """
    # Label leaves from their alignment scores; missing ids default to 0.0.
    for el in tree.iter():
        if not isinstance(el.tag, str):
            continue
        leaf_id = el.get("data-leaf-id")
        if leaf_id is not None:
            score = scores.get(int(leaf_id), 0.0)
            el.set("data-label", "content" if score > threshold else "boilerplate")

    # Propagate upward: internal node is content if any child is content.
    def _propagate(el: etree._Element) -> bool:
        if not isinstance(el.tag, str):
            return False
        existing = el.get("data-label")
        if existing:
            return existing == "content"
        # BUG FIX: evaluate every child eagerly.  A short-circuiting
        # any(generator) would stop at the first content child and leave
        # the remaining siblings' subtrees without a data-label.
        child_results = [_propagate(child) for child in el]
        has_content = any(child_results)
        el.set("data-label", "content" if has_content else "boilerplate")
        return has_content

    _propagate(tree)
    return tree
652
+
653
+
654
+ # ---------------------------------------------------------------------------
655
+ # Text extraction
656
+ # ---------------------------------------------------------------------------
657
+
658
def extract_text(tree: etree._Element) -> str:
    """Reconstruct clean text from content-labeled leaf nodes.

    Walks the tree in document order and joins the normalized text of
    every childless element labeled ``data-label="content"`` with
    newlines; empty blocks are dropped.
    """
    blocks = (
        normalize_text(el.text_content())
        for el in tree.iter()
        if isinstance(el.tag, str)
        and el.get("data-label") == "content"
        and len(el) == 0
    )
    return "\n".join(block for block in blocks if block)
669
+
670
+
671
def extract_text_from_labeled_html(labeled_html: str) -> str:
    """Extract content text from an already-labeled HTML string.

    Accepts HTML where elements carry ``data-label="content"`` or
    ``data-label="boilerplate"`` attributes (e.g. the ``labeled_html``
    column produced by :func:`label_original_html`) and returns only
    the text of content-labeled leaf nodes.
    """
    # No pipeline state needed: extract_text() reads only the data-label
    # attributes already present in the markup.
    doc = document_fromstring(labeled_html)
    return extract_text(doc)
681
+
682
+
683
+ # ---------------------------------------------------------------------------
684
+ # Evaluation
685
+ # ---------------------------------------------------------------------------
686
+
687
def evaluate(extracted: str, ground_truth: str) -> dict:
    """Compute quality metrics: token F1, ROUGE-1, BLEU, CHRF.

    Both inputs are normalized for fair comparison (smart quotes, ?-apostrophes).
    ROUGE-L is skipped as it's O(n*m) and very slow on long texts.

    Raises ImportError when the optional eval dependencies (rouge-score,
    sacrebleu) are not installed.
    """
    try:
        import sacrebleu
        from rouge_score import rouge_scorer
    except ImportError:
        raise ImportError(
            "evaluate() requires optional dependencies. "
            "Install them with: uv add 'web2textpy[eval]'"
        )

    # Normalize both sides so ground-truth quirks don't skew the metrics.
    extracted = _normalize_for_eval(extracted)
    ground_truth = _normalize_for_eval(ground_truth)

    # Token-level F1 over lowercase token multisets.
    ext_counts = Counter(extracted.lower().split())
    gt_counts = Counter(ground_truth.lower().split())
    n_overlap = sum((ext_counts & gt_counts).values())
    n_ext = sum(ext_counts.values())
    n_gt = sum(gt_counts.values())
    precision = n_overlap / n_ext if n_ext else 0.0
    recall = n_overlap / n_gt if n_gt else 0.0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom else 0.0

    # ROUGE-1 only (ROUGE-L is O(n*m) and very slow on long texts)
    rouge = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False).score(
        ground_truth, extracted
    )

    # BLEU & CHRF
    bleu = sacrebleu.corpus_bleu([extracted], [[ground_truth]])
    chrf = sacrebleu.corpus_chrf([extracted], [[ground_truth]])

    return {
        "token_f1": round(f1, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "rouge1_f": round(rouge["rouge1"].fmeasure, 4),
        "bleu": round(bleu.score, 2),
        "chrf": round(chrf.score, 2),
    }
732
+
733
+
734
+ # ---------------------------------------------------------------------------
735
+ # Pipeline orchestrator
736
+ # ---------------------------------------------------------------------------
737
+
738
def run_pipeline(
    html_str: str, clean_text: str
) -> tuple[etree._Element, str, dict]:
    """Run the full Web2Text alignment pipeline.

    Steps: build CDOM → extract leaf text blocks → align them against
    *clean_text* → label nodes → re-extract the content text → score it
    against *clean_text*.

    Returns (labeled_tree, extracted_text, metrics).

    Note: requires the optional evaluation dependencies (see evaluate()).
    """
    tree = build_cdom(html_str)
    leaves = extract_leaves(tree)
    scores = align(leaves, clean_text)
    tree = label_nodes(tree, scores)
    extracted = extract_text(tree)
    metrics = evaluate(extracted, clean_text)
    return tree, extracted, metrics
752
+
753
+
754
+ # ---------------------------------------------------------------------------
755
+ # Label original HTML
756
+ # ---------------------------------------------------------------------------
757
+
758
def label_original_html(
    html_str: str, clean_text: str, threshold: float = 0.667,
) -> tuple[str, str, dict]:
    """Run pipeline and return the *original* HTML with data-label attributes.

    Unlike run_pipeline (which returns the collapsed CDOM), this preserves
    the full original document structure and annotates every element with
    data-label="content" or data-label="boilerplate".

    Strategy: tag every element of the parsed document with a stable
    ``data-orig-id``, deep-copy the document, run the CDOM/alignment
    pipeline on one copy while tracking which ids get merged during
    collapse, then map the resulting labels back onto the untouched copy.

    Returns (labeled_html_string, extracted_text, metrics).
    """
    import copy

    # --- Parse & clean (same steps as build_cdom) ---
    cleaned_str = _XML_DECL_RE.sub("", html_str, count=1)
    doc = document_fromstring(cleaned_str)

    # Strip XML-invalid control characters (mirrors build_cdom).
    for el in doc.iter():
        if el.text:
            el.text = _CTRL_RE.sub("", el.text)
        if el.tail:
            el.tail = _CTRL_RE.sub("", el.tail)
        if isinstance(el.tag, str):
            for attr, val in el.attrib.items():
                c = _CTRL_RE.sub("", val)
                if c != val:
                    el.attrib[attr] = c

    try:
        body = doc.body
    except IndexError:
        body = None
    if body is None:
        body = doc

    # --- Assign stable IDs before any tree modifications ---
    _id = 0
    for el in body.iter():
        if isinstance(el.tag, str):
            el.set("data-orig-id", str(_id))
            _id += 1

    # --- Deep-copy: this is the "original" we will label at the end ---
    orig_doc = copy.deepcopy(doc)

    # --- CDOM construction (mirrors build_cdom logic) ---
    # Remove comments
    for c in body.iter():
        if callable(c.tag):
            _remove_preserving_tail(c)

    # Remove skip-tag elements
    for el in [e for e in body.iter() if isinstance(e.tag, str) and e.tag in SKIP_TAGS]:
        _remove_preserving_tail(el)

    # Remove empty text
    for el in body.iter():
        if not isinstance(el.tag, str):
            continue
        if el.text and not el.text.strip():
            el.text = None
        if el.tail and not el.tail.strip():
            el.tail = None

    # Remove empty leaf elements (fixed-point sweep, as in build_cdom)
    changed = True
    while changed:
        changed = False
        for el in list(body.iter()):
            if not isinstance(el.tag, str) or el is body:
                continue
            if len(el) == 0 and not (el.text and el.text.strip()):
                _remove_preserving_tail(el)
                changed = True

    # Collapse single-child chains (tracking merged orig-ids)
    def _collapse_tracking(el: etree._Element) -> None:
        # Post-order, like build_cdom._collapse, but records which
        # orig-ids the surviving element absorbed in data-merged-ids.
        for child in list(el):
            if isinstance(child.tag, str):
                _collapse_tracking(child)
        if len(el) == 1 and not (el.text and el.text.strip()):
            child = el[0]
            if not isinstance(child.tag, str):
                return
            # Track which orig-ids get absorbed
            child_oid = child.get("data-orig-id", "")
            child_merged = child.get("data-merged-ids", "")
            existing = el.get("data-merged-ids", "")
            all_ids = [existing, child_oid, child_merged]
            non_empty_ids = filter(None, all_ids)
            merged = ",".join(non_empty_ids)
            if merged:
                el.set("data-merged-ids", merged)
            el.text = child.text
            grandchildren = list(child)
            for gc in grandchildren:
                child.remove(gc)
                el.append(gc)
            if child.tail and child.tail.strip():
                if grandchildren:
                    grandchildren[-1].tail = (grandchildren[-1].tail or "") + child.tail
                else:
                    el.text = (el.text or "") + child.tail
            el.remove(child)

    _collapse_tracking(body)

    # --- Run alignment pipeline on the CDOM ---
    leaves = extract_leaves(body)
    scores = align(leaves, clean_text)
    label_nodes(body, scores, threshold)
    extracted = extract_text(body)
    metrics = evaluate(extracted, clean_text)

    # --- Build orig-id → label mapping ---
    # A collapsed element's label also applies to every id it absorbed.
    label_map: dict[str, str] = {}
    for el in body.iter():
        if not isinstance(el.tag, str):
            continue
        label = el.get("data-label")
        if not label:
            continue
        oid = el.get("data-orig-id")
        if oid:
            label_map[oid] = label
        merged = el.get("data-merged-ids")
        if merged:
            for mid in merged.split(","):
                if mid:
                    label_map[mid] = label

    # --- Apply labels to original doc ---
    try:
        orig_body = orig_doc.body
    except IndexError:
        orig_body = None
    if orig_body is None:
        orig_body = orig_doc

    for el in orig_body.iter():
        if not isinstance(el.tag, str):
            continue
        oid = el.get("data-orig-id")
        if oid and oid in label_map:
            el.set("data-label", label_map[oid])
        elif isinstance(el.tag, str) and el.tag in SKIP_TAGS:
            # Skip-tags were removed before labeling; mark them directly.
            el.set("data-label", "boilerplate")
        # Clean up tracking attributes
        for attr in ("data-orig-id",):
            if attr in el.attrib:
                del el.attrib[attr]

    # Propagate: unlabeled nodes inherit from children
    # NOTE(review): any(...) over a generator short-circuits, so once one
    # child is content the remaining siblings' subtrees are not visited
    # here and may be left without a data-label — confirm whether every
    # element is required to carry a label in the output.
    def _propagate(el: etree._Element) -> bool:
        if not isinstance(el.tag, str):
            return False
        if el.get("data-label"):
            return el.get("data-label") == "content"
        has_content = any(_propagate(child) for child in el)
        el.set("data-label", "content" if has_content else "boilerplate")
        return has_content

    _propagate(orig_body)

    # Label <head> as boilerplate
    try:
        head = orig_doc.head
        if head is not None and not head.get("data-label"):
            head.set("data-label", "boilerplate")
    except Exception:
        pass

    labeled_html = etree.tostring(orig_doc, encoding="unicode", method="html")
    return labeled_html, extracted, metrics
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: web2textpy
3
+ Version: 0.1.0
4
+ Summary: Python reimplementation of the Web2Text pipeline for labeling HTML DOM nodes as content or boilerplate
5
+ Project-URL: Homepage, https://github.com/williambrach/web2textpy
6
+ Project-URL: Repository, https://github.com/williambrach/web2textpy
7
+ Author: William Brach
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Text Processing :: Markup :: HTML
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: lxml>=6.0.2
22
+ Provides-Extra: cli
23
+ Requires-Dist: datasets>=4.8.4; extra == 'cli'
24
+ Requires-Dist: rouge-score>=0.1.2; extra == 'cli'
25
+ Requires-Dist: sacrebleu>=2.6.0; extra == 'cli'
26
+ Provides-Extra: eval
27
+ Requires-Dist: rouge-score>=0.1.2; extra == 'eval'
28
+ Requires-Dist: sacrebleu>=2.6.0; extra == 'eval'
29
+ Description-Content-Type: text/markdown
30
+
31
+ # web2textpy
32
+
33
+ Python reimplementation of the [Web2Text](https://github.com/dalab/web2text) pipeline for labeling HTML DOM nodes as **content** or **boilerplate** using paired `(raw_html, clean_text)` data.
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ uv add web2textpy
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```python
44
+ from datasets import load_dataset
45
+ from web2text import run_pipeline
46
+
47
+ ds = load_dataset("williambrach/html-boilerplate-labeled", split="test")
48
+ row = ds[0]
49
+
50
+ labeled_html, extracted_text, metrics = run_pipeline(row["html"], row["text"])
51
+
52
+ print(extracted_text[:200])
53
+ print(metrics)
54
+ ```
55
+
56
+ ## Step-by-Step API
57
+
58
+ Each stage of the pipeline is exposed as a standalone function:
59
+
60
+ ```python
61
+ from web2text import build_cdom, extract_leaves, align, label_nodes, extract_text, evaluate
62
+
63
+ # 1. Parse HTML into a collapsed DOM tree
64
+ tree = build_cdom(html_string)
65
+
66
+ # 2. Extract ordered text-bearing leaf nodes
67
+ leaves = extract_leaves(tree) # [(element, "normalized text"), ...]
68
+
69
+ # 3. Align leaf texts against ground-truth clean text
70
+ scores = align(leaves, clean_text) # {leaf_id: 0.0-1.0 match score}
71
+
72
+ # 4. Label each node as "content" or "boilerplate"
73
+ tree = label_nodes(tree, scores, threshold=0.667)
74
+
75
+ # 5. Extract text from content-labeled nodes
76
+ result = extract_text(tree)
77
+
78
+ # 6. Evaluate against ground truth
79
+ metrics = evaluate(result, clean_text)
80
+ # => {'token_f1': 0.99, 'precision': 0.99, 'recall': 0.99, 'rouge1_f': 0.99, 'bleu': 98.5, 'chrf': 98.8}
81
+ ```
82
+
83
+ ## How the Matching Algorithm Works
84
+
85
+ Given raw HTML and its known clean text, the algorithm determines which DOM nodes are content versus boilerplate in six steps:
86
+
87
+ 1. **Simplify the DOM** — strip non-content tags (`<script>`, `<style>`, etc.) and collapse single-child chains into a Collapsed DOM (CDOM) representation
88
+ 2. **Collect leaf text** — walk the CDOM, concatenate text from every leaf node into one source string with tracked character offsets
89
+ 3. **Find anchors** — identify 10-character substrings that appear exactly once in both the source and clean text, splitting the problem into independent segments
90
+ 4. **DP alignment** — for each segment between anchors, run character-level dynamic programming with affine gap penalties to map source characters to clean-text characters
91
+ 5. **Score leaves** — map alignment results back to leaf boundaries via stored offsets, giving each leaf a score: `matched_chars / total_chars`
92
+ 6. **Label nodes** — leaves scoring above `0.667` are labeled `"content"`, the rest `"boilerplate"`, with labels propagating upward to parents
93
+
94
+ ![Alignment pipeline: extract leaf texts → anchor matching → DP alignment → per-leaf scores](assets/image1.png)
95
+
96
+
97
+ ## Dataset
98
+
99
+ Dataset: [williambrach/html-boilerplate-labeled](https://huggingface.co/datasets/williambrach/html-boilerplate-labeled) — ~4k pages from CleanEval, Dragnet, CETD, Readability, and others (3,985 pages total).
100
+
101
+ | Source | Train (ROUGE-1 F) | Test (ROUGE-1 F) |
102
+ |--------------------|-------------------|------------------|
103
+ | readability | 0.993 (92) | 0.997 (23) |
104
+ | scrapinghub | 0.991 (145) | 0.996 (36) |
105
+ | cetd | 0.993 (560) | 0.987 (140) |
106
+ | google-trends-2017 | 0.986 (144) | 0.995 (36) |
107
+ | cleanportaleval | 0.985 (57) | 0.971 (14) |
108
+ | cleaneval | 0.985 (590) | 0.991 (148) |
109
+ | dragnet | 0.983 (1,103) | 0.983 (276) |
110
+ | l3s-gn1 | 0.920 (497) | 0.927 (124) |
111
+ | **Overall** | **0.976** (3,188) | **0.978** (797) |
112
+
113
+ > Sample counts in parentheses.
114
+
115
+ ## Original Work
116
+
117
+ - **Paper**: Vogels et al., "Web2Text: Deep Structured Boilerplate Removal" (ECIR 2018) — [arxiv.org/abs/1801.02607](https://arxiv.org/abs/1801.02607)
118
+ - **Original implementation** (Scala): [github.com/dalab/web2text](https://github.com/dalab/web2text)
@@ -0,0 +1,5 @@
1
+ web2text.py,sha256=-VDlyjafOBSPFKkSji0sctD0yfH-rYSK5v1HnTeltgE,32864
2
+ web2textpy-0.1.0.dist-info/METADATA,sha256=uXg3KL09aHrPku17BTziHsVRQu_d_zKTRy8YH1aH23Y,5081
3
+ web2textpy-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
4
+ web2textpy-0.1.0.dist-info/licenses/LICENSE,sha256=89q9i9qb5tf1LUyum9pM1pdA5X1fUowF_t7fpvhNpEE,1070
5
+ web2textpy-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 William Brach
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.