web2textpy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web2text.py +931 -0
- web2textpy-0.1.0.dist-info/METADATA +118 -0
- web2textpy-0.1.0.dist-info/RECORD +5 -0
- web2textpy-0.1.0.dist-info/WHEEL +4 -0
- web2textpy-0.1.0.dist-info/licenses/LICENSE +21 -0
web2text.py
ADDED
|
@@ -0,0 +1,931 @@
|
|
|
1
|
+
"""web2text.py — Python port of Web2Text alignment-based labeling pipeline.
|
|
2
|
+
|
|
3
|
+
Given paired (raw_html, clean_text), aligns clean text back onto DOM nodes
|
|
4
|
+
and labels each as content or boilerplate.
|
|
5
|
+
|
|
6
|
+
Based on: Vogels et al., "Web2Text: Deep Structured Boilerplate Removal" (ECIR 2018)
|
|
7
|
+
Original Scala: https://github.com/dalab/web2text
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
import unicodedata
|
|
14
|
+
from collections import Counter, defaultdict
|
|
15
|
+
from typing import NamedTuple
|
|
16
|
+
|
|
17
|
+
from lxml import etree
|
|
18
|
+
from lxml.html import document_fromstring
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Placeholder character written into alignment output at source positions
# that have no counterpart in the clean text (a "gap").
GAPCHAR = "\u25a1"  # □

# Tags whose subtrees carry no visible content; removed during CDOM build.
SKIP_TAGS = frozenset({
    "script", "style", "head", "noscript", "iframe", "img", "input",
    "br", "hr", "meta", "title", "video", "select", "textarea",
    "link", "object", "embed", "applet", "param", "svg",
})

# Block-level HTML tags.  NOTE(review): not referenced by the code visible in
# this module — presumably kept for feature-extraction code; verify before
# removing.
BLOCK_TAGS = frozenset({
    "address", "article", "aside", "blockquote", "body", "center",
    "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure",
    "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header",
    "li", "main", "nav", "ol", "p", "pre", "section", "table",
    "tbody", "td", "tfoot", "th", "thead", "tr", "ul",
})

# Characters treated as equivalent in alignment (apostrophes, quotes, ?)
_QUOTE_CHARS = frozenset("?'\u2018\u2019\u201a\u201b`\u0027")

# ---------------------------------------------------------------------------
# Text normalization
# ---------------------------------------------------------------------------

# Any run of whitespace (collapsed to a single space by normalize_text).
_WS_RE = re.compile(r"\s+")
# XML-invalid C0/C1 control characters (tab/newline/CR excluded).
_CTRL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")
# Word-internal ? acting as apostrophe: letter?letter (e.g., it?s, don?t)
_QUESTION_APOS_RE = re.compile(r"(?<=\w)\?(?=\w)")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _fix_mojibake(s: str) -> str:
|
|
54
|
+
"""Fix CP1252 mojibake: text that was UTF-8 but decoded as CP1252.
|
|
55
|
+
|
|
56
|
+
Tries to re-encode as CP1252 and decode as UTF-8. If the result is
|
|
57
|
+
shorter (mojibake expands chars), it's a genuine fix.
|
|
58
|
+
"""
|
|
59
|
+
try:
|
|
60
|
+
fixed = s.encode("cp1252").decode("utf-8")
|
|
61
|
+
if len(fixed) < len(s): # mojibake always inflates length
|
|
62
|
+
return fixed
|
|
63
|
+
except (UnicodeDecodeError, UnicodeEncodeError):
|
|
64
|
+
pass
|
|
65
|
+
return s
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def normalize_text(s: str) -> str:
    """NFC-normalize, fix mojibake, collapse whitespace, replace NBSP, strip."""
    repaired = _fix_mojibake(s)
    repaired = unicodedata.normalize("NFC", repaired)
    repaired = repaired.replace("\u00a0", " ")
    collapsed = _WS_RE.sub(" ", repaired)
    return collapsed.strip()
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _normalize_for_eval(s: str) -> str:
    """Extra normalization applied only during evaluation.

    Maps curly single/double quotes to their ASCII forms and rewrites a
    word-internal ``?`` (a ground-truth encoding quirk) to an apostrophe,
    so these differences don't penalize otherwise-correct extraction.
    """
    quote_table = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
    })
    return _QUESTION_APOS_RE.sub("'", s.translate(quote_table))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
# CDOM construction
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
def _remove_preserving_tail(el: etree._Element) -> None:
    """Detach *el* from its parent without losing its tail text.

    In lxml the text that follows an element is stored on that element's
    ``tail``, so a plain ``remove()`` would drop it.  Before removal the
    tail is spliced onto the previous sibling's tail — or onto the parent's
    ``text`` when *el* is the first child.  No-op when *el* has no parent.
    """
    parent = el.getparent()
    if parent is None:
        return
    if el.tail:
        sibling = el.getprevious()
        if sibling is None:
            parent.text = (parent.text or "") + el.tail
        else:
            sibling.tail = (sibling.tail or "") + el.tail
    parent.remove(el)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# Leading ``<?xml ...?>`` declaration, which lxml.html rejects; the pattern
# consumes through the closing ``?>`` or to the end of the first line.
_XML_DECL_RE = re.compile(r"<\?xml[^\n]*(?:\?>|\n|$)", re.IGNORECASE)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def build_cdom(html_str: str) -> etree._Element:
    """Parse HTML and build a Collapsed DOM (CDOM) tree.

    Pipeline, in order: strip the XML declaration, parse, remove control
    characters, drop comments/PIs and SKIP_TAGS subtrees, null out
    whitespace-only text, repeatedly delete empty leaves, then collapse
    single-child chains so each remaining node is structurally meaningful.

    Returns the cleaned ``<body>`` element (or the document root when no
    body exists).
    """
    # Strip XML declaration that lxml.html rejects
    html_str = _XML_DECL_RE.sub("", html_str, count=1)

    doc = document_fromstring(html_str)
    try:
        body = doc.body
    except IndexError:
        # lxml raises IndexError when the document has no <body>
        body = None
    if body is None:
        body = doc

    # --- Strip XML-invalid control characters from all text/tail/attributes ---
    for el in doc.iter():
        if el.text:
            el.text = _CTRL_RE.sub("", el.text)
        if el.tail:
            el.tail = _CTRL_RE.sub("", el.tail)
        if isinstance(el.tag, str):  # skip comments/PIs (tag is callable)
            for attr, val in el.attrib.items():
                cleaned = _CTRL_RE.sub("", val)
                if cleaned != val:
                    el.attrib[attr] = cleaned

    # --- Remove comments ---
    for c in body.iter():
        if callable(c.tag):  # Comments, PIs
            _remove_preserving_tail(c)

    # --- Remove skip-tag elements ---
    # Snapshot first: removing while iterating body.iter() would skip nodes.
    to_remove = [el for el in body.iter() if isinstance(el.tag, str) and el.tag in SKIP_TAGS]
    for el in to_remove:
        _remove_preserving_tail(el)

    # --- Remove empty text (set .text / .tail to None if whitespace-only) ---
    for el in body.iter():
        if not isinstance(el.tag, str):
            continue
        if el.text and not el.text.strip():
            el.text = None
        if el.tail and not el.tail.strip():
            el.tail = None

    # --- Remove empty leaf elements (bottom-up) ---
    # Deleting a leaf can make its parent empty, so loop to a fixed point.
    changed = True
    while changed:
        changed = False
        for el in list(body.iter()):
            if not isinstance(el.tag, str):
                continue
            if el is body:
                continue
            if len(el) == 0 and not (el.text and el.text.strip()):
                _remove_preserving_tail(el)
                changed = True

    # --- Collapse single-child chains (bottom-up via post-order) ---
    def _collapse(el: etree._Element) -> None:
        # Post-order: collapse descendants first so chains fold upward.
        for child in list(el):
            if isinstance(child.tag, str):
                _collapse(child)
        # Only-child with no significant text on the parent → merge upward.
        if len(el) == 1 and not (el.text and el.text.strip()):
            child = el[0]
            if not isinstance(child.tag, str):
                return
            # Merge child into parent: adopt child's children and text
            el.text = child.text
            # Move grandchildren up
            grandchildren = list(child)
            for gc in grandchildren:
                child.remove(gc)
                el.append(gc)
            # Preserve child.tail: append it to the last grandchild's tail,
            # or to el.text if there are no grandchildren
            if child.tail and child.tail.strip():
                if grandchildren:
                    last_gc = grandchildren[-1]
                    last_gc.tail = (last_gc.tail or "") + child.tail
                else:
                    el.text = (el.text or "") + child.tail
            el.remove(child)

    _collapse(body)

    return body
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# Leaf extraction
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
|
|
205
|
+
def extract_leaves(tree: etree._Element) -> list[tuple[etree._Element, str]]:
    """Extract ordered text blocks from the CDOM tree.

    In lxml, text lives in two places: el.text (before first child) and
    child.tail (after a child element). Both must be captured, mirroring
    how Jsoup represents text as separate TextNode children.

    For leaf elements (no children), we capture el.text_content() as usual.
    For internal elements, we capture el.text as a separate text block,
    and each child's .tail as a separate text block.

    Side effects: mutates the tree in place — each captured text block is
    tagged with ``data-leaf-id``; inline text in internal elements is moved
    into synthetic ``<span data-synthetic="1">`` wrappers.

    Returns the leaves in document order as (element, normalized_text) pairs;
    the list index equals the element's ``data-leaf-id``.
    """
    leaves: list[tuple[etree._Element, str]] = []

    def _walk(el: etree._Element) -> None:
        if not isinstance(el.tag, str):
            return
        if el.get("data-synthetic"):
            return  # already processed
        if len(el) == 0:
            # Leaf element — capture all its text
            text = normalize_text(el.text_content())
            if text:
                el.set("data-leaf-id", str(len(leaves)))
                leaves.append((el, text))
        else:
            # Snapshot original children BEFORE any tree mutations
            original_children = list(el)

            # Internal element — capture .text (text before first child)
            if el.text and el.text.strip():
                text = normalize_text(el.text)
                if text:
                    # SubElement appends the span at the end; insert(0, ...)
                    # then MOVES it to the front (lxml relocates an element
                    # that is already in the tree).
                    span = etree.SubElement(el, "span")
                    span.set("data-synthetic", "1")
                    span.text = el.text
                    el.text = None
                    el.insert(0, span)
                    span.set("data-leaf-id", str(len(leaves)))
                    leaves.append((span, text))

            # Recurse into original children only
            for child in original_children:
                _walk(child)
                # Capture child.tail (text after this child, before next sibling)
                if child.tail and child.tail.strip():
                    tail_text = normalize_text(child.tail)
                    if tail_text:
                        span = etree.SubElement(el, "span")
                        span.set("data-synthetic", "1")
                        span.text = child.tail
                        child.tail = None
                        # Recompute the index: earlier synthetic spans may
                        # have shifted child's position in el.
                        idx = list(el).index(child)
                        el.insert(idx + 1, span)
                        span.set("data-leaf-id", str(len(leaves)))
                        leaves.append((span, tail_text))

    _walk(tree)
    return leaves
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ---------------------------------------------------------------------------
|
|
266
|
+
# Anchor-based alignment (port of Scala find1to1mathches)
|
|
267
|
+
# ---------------------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
class Segment(NamedTuple):
    """A paired half-open span [start, end) in source and clean text."""

    # Offsets into the concatenated leaf-text "source" string.
    src_start: int
    src_end: int
    # Offsets into the normalized clean text.
    cln_start: int
    cln_end: int
    # True: a definite anchor alignment; False: an open segment needing DP.
    matched: bool
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _find_anchors(source: str, cleaned: str, k: int = 10) -> list[Segment]:
    """Find unique k-char anchors shared between *source* and *cleaned*.

    Returns a list of Segments covering the full source/cleaned range.
    Matched segments are definite alignments; open segments need DP.

    An anchor is a k-character substring of *cleaned* that occurs exactly
    once in *source* (also exactly once after stripping whitespace and
    GAPCHARs — the "trimmed" safety check).  Each anchor is greedily
    extended left/right while characters remain compatible.
    """
    n, m = len(source), len(cleaned)

    # Too short to host any k-gram: everything is one open segment.
    if n < k or m < k:
        return [Segment(0, n, 0, m, False)]

    # Build substring → [positions] map for source (skip substrings with GAPCHAR)
    source_map: dict[str, list[int]] = defaultdict(list)
    for i in range(n + 1 - k):
        sub = source[i : i + k]
        if GAPCHAR not in sub:
            source_map[sub].append(i)

    # Trimmed source maps for safety check (strip whitespace + GAPCHAR)
    def _trim_filter(c: str) -> bool:
        return not c.isspace() and c != GAPCHAR

    trimmed_source = "".join(c for c in source if _trim_filter(c))
    # Pre-compute occurrence counts for trimmed substrings of lengths 1..k
    trimmed_maps: list[dict[str, int]] = []
    for kk in range(1, k + 1):
        counts: dict[str, int] = defaultdict(int)
        for i in range(len(trimmed_source) + 1 - kk):
            counts[trimmed_source[i : i + kk]] += 1
        trimmed_maps.append(counts)

    def _equal_enough(c1: str, c2: str) -> bool:
        # Loose char equality used only while extending an anchor.
        if c1.isspace() and c2.isspace():
            return True
        if c1.upper() == c2.upper():
            return True
        # Treat ? as equivalent to apostrophe variants (l3s-gn1 ground truth quirk)
        if c1 in _QUOTE_CHARS and c2 in _QUOTE_CHARS:
            return True
        return False

    segments: list[Segment] = []
    # Track last matched/open segment end positions
    last_src_start, last_src_end = 0, 0
    last_cln_start, last_cln_end = 0, 0
    last_is_init = True  # first sentinel

    i = 0
    while i < m + 1 - k:
        subs = cleaned[i : i + k]

        # Trimmed substring for safety check
        trimmed_subs = "".join(c for c in subs if _trim_filter(c))

        match_locs = source_map.get(subs, [])
        trimmed_count = (
            trimmed_maps[len(trimmed_subs) - 1].get(trimmed_subs, 0)
            if trimmed_subs
            else 0
        )

        # Anchor must be unique both verbatim and in trimmed form.
        if len(match_locs) == 1 and trimmed_count == 1:
            src_pos = match_locs[0]

            # Extend right while characters stay compatible.
            extra_right = 0
            while (
                i + k + extra_right < m
                and src_pos + k + extra_right < n
                and _equal_enough(cleaned[i + k + extra_right], source[src_pos + k + extra_right])
            ):
                extra_right += 1

            # Extend left, never crossing into the previous segment.
            extra_left = 0
            while (
                i - extra_left > 0
                and src_pos - extra_left > 0
                and src_pos - extra_left >= last_src_end + 1
                and _equal_enough(cleaned[i - 1 - extra_left], source[src_pos - 1 - extra_left])
            ):
                extra_left += 1

            if src_pos <= last_src_start and not last_is_init:
                # Collision: new match is before previous — discard last 2 segments
                # (the previous matched segment and the open segment before it)
                # and restore tracking state from whatever remains.
                if len(segments) >= 2:
                    segments.pop()
                    segments.pop()
                if segments:
                    prev = segments[-1]
                    last_src_start, last_src_end = prev.src_start, prev.src_end
                    last_cln_start, last_cln_end = prev.cln_start, prev.cln_end
                else:
                    last_src_start = last_src_end = 0
                    last_cln_start = last_cln_end = 0
                # NOTE(review): resetting the sentinel unconditionally after a
                # collision matches the reconstructed control flow — confirm
                # against the original Scala find1to1mathches.
                last_is_init = True
                i += 1

            elif src_pos < last_src_end:
                # Overlap — skip this anchor
                i += 1

            else:
                last_is_init = False
                # Shorten left extension if it overlaps with previous segment
                while src_pos - extra_left < last_src_end:
                    extra_left -= 1

                match_src_start = src_pos - extra_left
                match_src_end = src_pos + k + extra_right
                match_cln_start = i - extra_left
                match_cln_end = i + k + extra_right

                # Insert open segment before this match (if there's a gap)
                if match_src_start > last_src_end or match_cln_start > last_cln_end:
                    segments.append(
                        Segment(last_src_end, match_src_start, last_cln_end, match_cln_start, False)
                    )

                # Insert matched segment
                seg = Segment(match_src_start, match_src_end, match_cln_start, match_cln_end, True)
                segments.append(seg)
                last_src_start, last_src_end = seg.src_start, seg.src_end
                last_cln_start, last_cln_end = seg.cln_start, seg.cln_end

                # Jump past the matched region in the clean text.
                i += k + extra_right
        else:
            i += 1

    # Append final open segment if source/cleaned has remaining chars
    if segments:
        last = segments[-1]
        if last.src_end < n or last.cln_end < m:
            segments.append(Segment(last.src_end, n, last.cln_end, m, False))
    else:
        segments.append(Segment(0, n, 0, m, False))

    return segments
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
# ---------------------------------------------------------------------------
|
|
418
|
+
# Dynamic-programming alignment (port of Scala dpalignment)
|
|
419
|
+
# ---------------------------------------------------------------------------
|
|
420
|
+
|
|
421
|
+
# Decision enum
|
|
422
|
+
_MATCH, _SKIP_SRC, _SKIP_CLN = 0, 1, 2
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def _dp_align(source: str, cleaned: str) -> str:
    """Align *cleaned* against *source* using DP with affine gap penalties.

    Returns a string of len(source) with GAPCHAR at non-content positions
    and the original source character at matched positions.

    Scoring: exact alnum match +3, loose match +1, dropping a clean char -6
    (free if whitespace), dropping a source char -2 (free while already in
    a gap — the affine-gap behavior tracked by G).
    """
    n, m = len(source), len(cleaned)

    if n == 0:
        return ""
    if m == 0:
        return GAPCHAR * n

    # Size guard — avoid quadratic blowup on huge unmatched segments.
    # For large segments, align in chunks rather than giving up entirely.
    if n * m > 10_000_000:
        # Chunk the source into manageable pieces and align each
        chunk_size = max(1, 10_000_000 // m)
        result_parts: list[str] = []
        cln_pos = 0
        for start in range(0, n, chunk_size):
            end = min(start + chunk_size, n)
            src_chunk = source[start:end]
            # Estimate how much clean text corresponds to this chunk
            src_fraction = (end - start) / n
            # Over-allocate clean text so the chunk boundary doesn't starve
            # a chunk of its matching characters (heuristic).
            overshoot_factor = 1.5
            cln_chunk_size = int(m * src_fraction * overshoot_factor)
            cln_end = min(cln_pos + cln_chunk_size, m)
            cln_chunk = cleaned[cln_pos:cln_end]
            chunk_result = _dp_align(src_chunk, cln_chunk)
            # Advance clean pointer by how many chars were consumed
            consumed = sum(1 for c in chunk_result if c != GAPCHAR)
            cln_pos += consumed
            result_parts.append(chunk_result)
        return "".join(result_parts)

    # Score matrices (2-row rolling for space efficiency).
    # S: best score; G: "currently in a source gap" flag; D: full decision
    # table kept for backtracking.
    S = [[0] * (m + 1) for _ in range(2)]
    G = [[True] * (m + 1) for _ in range(2)]
    D = [[_SKIP_SRC] * (m + 1) for _ in range(n + 1)]

    # Initialize first row: consuming j clean chars with no source costs -j.
    for j in range(m + 1):
        S[0][j] = -j
        G[0][j] = True
        D[0][j] = _SKIP_CLN
    D[0][0] = -1  # sentinel

    for i in range(1, n + 1):
        ci = i % 2       # current rolling row
        pi = (i - 1) % 2  # previous rolling row
        # Leading source chars may be skipped for free (local alignment start).
        S[ci][0] = 0
        G[ci][0] = True

        for j in range(1, m + 1):
            sc = source[i - 1]
            cc = cleaned[j - 1]

            # SkipClean score
            skip_cln_score = S[ci][j - 1] + (0 if cc.isspace() else -6)

            # SkipSource score (cheaper when already inside a gap)
            skip_src_score = S[pi][j] + (0 if G[pi][j] else -2)

            if skip_cln_score > skip_src_score:
                best_score = skip_cln_score
                best_dec = _SKIP_CLN
            else:
                best_score = skip_src_score
                best_dec = _SKIP_SRC

            # Match (only if chars are compatible)
            chars_compatible = (
                sc.upper() == cc.upper()
                or (sc.isspace() and cc.isspace())
                or (sc in _QUOTE_CHARS and cc in _QUOTE_CHARS)
            )
            if chars_compatible:
                if sc.isalnum() and sc == cc:
                    match_score = S[pi][j - 1] + 3
                else:
                    match_score = S[pi][j - 1] + 1
                if match_score > best_score:
                    best_score = match_score
                    best_dec = _MATCH

            D[i][j] = best_dec
            S[ci][j] = best_score

            if best_dec == _MATCH:
                G[ci][j] = False
            elif best_dec == _SKIP_SRC:
                G[ci][j] = True
            else:  # _SKIP_CLN: gap state carries over horizontally
                G[ci][j] = G[ci][j - 1]

    # Backtrack through D from (n, m) to (0, 0).
    result: list[str] = []
    i, j = n, m
    while i > 0 or j > 0:
        d = D[i][j]
        if d == _MATCH:
            result.append(source[i - 1])
            i -= 1
            j -= 1
        elif d == _SKIP_CLN:
            j -= 1
        else:  # _SKIP_SRC (also covers untouched D[i][0] cells)
            result.append(GAPCHAR)
            i -= 1

    result.reverse()
    assert len(result) == n, f"DP output length {len(result)} != source length {n}"
    return "".join(result)
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
# ---------------------------------------------------------------------------
|
|
541
|
+
# Alignment orchestrator
|
|
542
|
+
# ---------------------------------------------------------------------------
|
|
543
|
+
|
|
544
|
+
def align(
    leaves: list[tuple[etree._Element, str]], clean_text: str
) -> dict[int, float]:
    """Align leaf texts against *clean_text* and return per-leaf content scores.

    Returns {leaf_index: fraction_of_chars_matched} (0.0–1.0).

    Strategy: concatenate the leaf texts into one "source" string, anchor-
    match it against the normalized clean text, DP-align the unanchored
    segments, then read each leaf's matched-character fraction back out of
    the per-character alignment.  A substring-based fallback rescues leaves
    the positional alignment missed entirely.
    """
    if not leaves or not clean_text.strip():
        return {i: 0.0 for i in range(len(leaves))}

    # Build source by concatenating leaf texts with space separators,
    # remembering each leaf's [start, end) span in the concatenation.
    parts: list[str] = []
    offsets: list[tuple[int, int]] = []
    pos = 0
    for idx, (el, text) in enumerate(leaves):
        offsets.append((pos, pos + len(text)))
        parts.append(text)
        pos += len(text)
        if idx < len(leaves) - 1:
            parts.append(" ")
            pos += 1
    source = "".join(parts)

    cleaned = normalize_text(clean_text)
    if not cleaned:
        return {i: 0.0 for i in range(len(leaves))}

    # Phase 1: anchor matching
    segments = _find_anchors(source, cleaned, k=10)

    # Phase 2: DP on open segments, pass-through on matched segments.
    # Each aligned part has exactly len(src_slice) chars, so positions in
    # `aligned` correspond 1:1 to positions in `source`.
    aligned_parts: list[str] = []
    for seg in segments:
        src_slice = source[seg.src_start : seg.src_end]
        cln_slice = cleaned[seg.cln_start : seg.cln_end]
        if seg.matched:
            aligned_parts.append(cln_slice)
        else:
            aligned_parts.append(_dp_align(src_slice, cln_slice))
    aligned = "".join(aligned_parts)

    # Extract per-leaf scores: fraction of the leaf's span not GAPCHAR.
    scores: dict[int, float] = {}
    for i, (start, end) in enumerate(offsets):
        if end <= len(aligned):
            substr = aligned[start:end]
            n_matched = sum(1 for c in substr if c != GAPCHAR)
        else:
            # Defensive: alignment shorter than expected — treat as unmatched.
            n_matched = 0
        n_total = end - start
        scores[i] = n_matched / n_total if n_total > 0 else 0.0

    # Fallback pass: leaves with score 0 that have substantial text might
    # have been missed due to ordering differences between DOM and clean text.
    # Try direct substring matching against the full clean text.
    cleaned_lower = cleaned.lower()
    for i, (el, text) in enumerate(leaves):
        if scores[i] > 0.0 or len(text) < 20:
            continue
        leaf_lower = text.lower()
        if leaf_lower in cleaned_lower:
            scores[i] = 1.0
        elif len(text) >= 50:
            # Try matching 50-char chunks to estimate content fraction
            chunk_size = 50
            matched_chars = 0
            for c in range(0, len(text) - chunk_size + 1, chunk_size):
                if text[c : c + chunk_size].lower() in cleaned_lower:
                    matched_chars += chunk_size
            scores[i] = matched_chars / len(text) if len(text) > 0 else 0.0

    return scores
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
# ---------------------------------------------------------------------------
|
|
619
|
+
# Labeling
|
|
620
|
+
# ---------------------------------------------------------------------------
|
|
621
|
+
|
|
622
|
+
def label_nodes(
    tree: etree._Element,
    scores: dict[int, float],
    threshold: float = 0.667,
) -> etree._Element:
    """Label CDOM nodes as content or boilerplate based on alignment scores.

    A leaf (element carrying ``data-leaf-id``) is labeled "content" when its
    alignment score exceeds *threshold* (0.667 matches the original Scala's
    2/3 rule).  Internal nodes are labeled "content" iff at least one
    descendant is content.  Mutates *tree* in place and returns it.
    """
    # Label leaves
    for el in tree.iter():
        if not isinstance(el.tag, str):
            continue
        leaf_id = el.get("data-leaf-id")
        if leaf_id is not None:
            score = scores.get(int(leaf_id), 0.0)
            el.set("data-label", "content" if score > threshold else "boilerplate")

    # Propagate upward: internal node is content if any child is content.
    def _propagate(el: etree._Element) -> bool:
        if not isinstance(el.tag, str):
            return False
        if el.get("data-label"):
            return el.get("data-label") == "content"
        # BUG FIX: materialize the list before any() — a bare generator
        # short-circuits on the first content child, leaving later sibling
        # subtrees unvisited and their internal nodes unlabeled.
        child_results = [_propagate(child) for child in el]
        has_content = any(child_results)
        el.set("data-label", "content" if has_content else "boilerplate")
        return has_content

    _propagate(tree)
    return tree
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
# ---------------------------------------------------------------------------
|
|
655
|
+
# Text extraction
|
|
656
|
+
# ---------------------------------------------------------------------------
|
|
657
|
+
|
|
658
|
+
def extract_text(tree: etree._Element) -> str:
    """Reconstruct clean text from content-labeled leaf nodes.

    Walks the tree in document order and joins the normalized text of every
    childless element labeled ``data-label="content"`` with newlines.
    """
    lines: list[str] = []
    for node in tree.iter():
        if not isinstance(node.tag, str):
            continue
        if len(node) != 0 or node.get("data-label") != "content":
            continue
        leaf_text = normalize_text(node.text_content())
        if leaf_text:
            lines.append(leaf_text)
    return "\n".join(lines)
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def extract_text_from_labeled_html(labeled_html: str) -> str:
    """Extract content text from an already-labeled HTML string.

    Accepts HTML where elements carry ``data-label="content"`` or
    ``data-label="boilerplate"`` attributes (e.g. the ``labeled_html``
    column produced by :func:`label_original_html`) and returns only
    the text of content-labeled leaf nodes, newline-joined.
    """
    # Re-parse and delegate to extract_text, which reads the data-label attrs.
    doc = document_fromstring(labeled_html)
    return extract_text(doc)
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
# ---------------------------------------------------------------------------
|
|
684
|
+
# Evaluation
|
|
685
|
+
# ---------------------------------------------------------------------------
|
|
686
|
+
|
|
687
|
+
def evaluate(extracted: str, ground_truth: str) -> dict:
    """Compute quality metrics: token F1, ROUGE-1, BLEU, CHRF.

    Both inputs are normalized for fair comparison (smart quotes, ?-apostrophes).
    ROUGE-L is skipped as it's O(n*m) and very slow on long texts.

    Returns a dict with keys: token_f1, precision, recall, rouge1_f, bleu, chrf.

    Raises:
        ImportError: when the optional eval dependencies are not installed.
    """
    try:
        import sacrebleu
        from rouge_score import rouge_scorer
    except ImportError as err:
        # BUG FIX: chain the original exception so the user can see *which*
        # optional package failed to import (was previously swallowed).
        raise ImportError(
            "evaluate() requires optional dependencies. "
            "Install them with: uv add 'web2textpy[eval]'"
        ) from err

    # Normalize both sides for fair comparison
    extracted = _normalize_for_eval(extracted)
    ground_truth = _normalize_for_eval(ground_truth)

    # Token-level F1 (multiset): Counter intersection gives per-token
    # min(count_extracted, count_truth).
    ext_tokens = Counter(extracted.lower().split())
    gt_tokens = Counter(ground_truth.lower().split())
    overlap = sum((ext_tokens & gt_tokens).values())
    ext_total = sum(ext_tokens.values())
    gt_total = sum(gt_tokens.values())
    precision = overlap / ext_total if ext_total else 0.0
    recall = overlap / gt_total if gt_total else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    # ROUGE-1 only (ROUGE-L is O(n*m) and very slow on long texts)
    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
    rouge = scorer.score(ground_truth, extracted)

    # BLEU & CHRF (single-document corpus with one reference)
    bleu = sacrebleu.corpus_bleu([extracted], [[ground_truth]])
    chrf = sacrebleu.corpus_chrf([extracted], [[ground_truth]])

    return {
        "token_f1": round(f1, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "rouge1_f": round(rouge["rouge1"].fmeasure, 4),
        "bleu": round(bleu.score, 2),
        "chrf": round(chrf.score, 2),
    }
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
# ---------------------------------------------------------------------------
|
|
735
|
+
# Pipeline orchestrator
|
|
736
|
+
# ---------------------------------------------------------------------------
|
|
737
|
+
|
|
738
|
+
def run_pipeline(
    html_str: str, clean_text: str
) -> tuple[etree._Element, str, dict]:
    """Run the full Web2Text alignment pipeline.

    Steps: build the collapsed DOM, extract text leaves, align them against
    *clean_text*, label nodes content/boilerplate, extract the content text,
    and score it against *clean_text*.

    Returns (labeled_tree, extracted_text, metrics).

    Raises:
        ImportError: from evaluate() when optional eval deps are missing.
    """
    tree = build_cdom(html_str)
    leaves = extract_leaves(tree)
    scores = align(leaves, clean_text)
    tree = label_nodes(tree, scores)
    extracted = extract_text(tree)
    # NOTE: metrics compare the extraction against the same clean_text used
    # for alignment — a self-consistency score, not held-out evaluation.
    metrics = evaluate(extracted, clean_text)
    return tree, extracted, metrics
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
# ---------------------------------------------------------------------------
|
|
755
|
+
# Label original HTML
|
|
756
|
+
# ---------------------------------------------------------------------------
|
|
757
|
+
|
|
758
|
+
def label_original_html(
    html_str: str, clean_text: str, threshold: float = 0.667,
) -> tuple[str, str, dict]:
    """Run pipeline and return the *original* HTML with data-label attributes.

    Unlike run_pipeline (which returns the collapsed CDOM), this preserves
    the full original document structure and annotates every element with
    data-label="content" or data-label="boilerplate".

    Args:
        html_str: Raw HTML document.
        clean_text: Ground-truth clean text paired with the HTML.
        threshold: Minimum alignment score for a leaf to be labeled
            "content" (forwarded to ``label_nodes``).

    Returns (labeled_html_string, extracted_text, metrics).
    """
    import copy

    # --- Parse & clean (same steps as build_cdom) ---
    cleaned_str = _XML_DECL_RE.sub("", html_str, count=1)
    doc = document_fromstring(cleaned_str)

    # Strip control characters from text, tails, and attribute values.
    # NOTE(review): mutating attribute values while iterating el.attrib.items()
    # relies on lxml returning a snapshot list from items() — confirm.
    for el in doc.iter():
        if el.text:
            el.text = _CTRL_RE.sub("", el.text)
        if el.tail:
            el.tail = _CTRL_RE.sub("", el.tail)
        if isinstance(el.tag, str):
            for attr, val in el.attrib.items():
                c = _CTRL_RE.sub("", val)
                if c != val:
                    el.attrib[attr] = c

    # lxml raises IndexError when the document has no <body>; fall back to
    # treating the whole document as the body.
    try:
        body = doc.body
    except IndexError:
        body = None
    if body is None:
        body = doc

    # --- Assign stable IDs before any tree modifications ---
    # These IDs survive the deep copy below, letting us map CDOM labels
    # back onto the untouched original elements at the end.
    _id = 0
    for el in body.iter():
        if isinstance(el.tag, str):
            el.set("data-orig-id", str(_id))
            _id += 1

    # --- Deep-copy: this is the "original" we will label at the end ---
    orig_doc = copy.deepcopy(doc)

    # --- CDOM construction (mirrors build_cdom logic) ---
    # Remove comments (comment nodes have a callable .tag in lxml).
    for c in body.iter():
        if callable(c.tag):
            _remove_preserving_tail(c)

    # Remove skip-tag elements (materialize the list first, since removal
    # mutates the tree we would otherwise be iterating).
    for el in [e for e in body.iter() if isinstance(e.tag, str) and e.tag in SKIP_TAGS]:
        _remove_preserving_tail(el)

    # Remove whitespace-only text so emptiness checks below are accurate.
    for el in body.iter():
        if not isinstance(el.tag, str):
            continue
        if el.text and not el.text.strip():
            el.text = None
        if el.tail and not el.tail.strip():
            el.tail = None

    # Remove empty leaf elements; loop to a fixed point because removing a
    # leaf can turn its parent into a new empty leaf.
    changed = True
    while changed:
        changed = False
        for el in list(body.iter()):
            if not isinstance(el.tag, str) or el is body:
                continue
            if len(el) == 0 and not (el.text and el.text.strip()):
                _remove_preserving_tail(el)
                changed = True

    # Collapse single-child chains (tracking merged orig-ids).
    def _collapse_tracking(el: etree._Element) -> None:
        # Collapse bottom-up so a chain of single children folds fully.
        for child in list(el):
            if isinstance(child.tag, str):
                _collapse_tracking(child)
        if len(el) == 1 and not (el.text and el.text.strip()):
            child = el[0]
            if not isinstance(child.tag, str):
                return
            # Track which orig-ids get absorbed, so labels computed on the
            # collapsed node can be fanned back out to every original element.
            child_oid = child.get("data-orig-id", "")
            child_merged = child.get("data-merged-ids", "")
            existing = el.get("data-merged-ids", "")
            all_ids = [existing, child_oid, child_merged]
            non_empty_ids = filter(None, all_ids)
            merged = ",".join(non_empty_ids)
            if merged:
                el.set("data-merged-ids", merged)
            el.text = child.text
            # Hoist grandchildren up into the parent, preserving order.
            grandchildren = list(child)
            for gc in grandchildren:
                child.remove(gc)
                el.append(gc)
            # Re-attach the collapsed child's tail text so no text is lost.
            if child.tail and child.tail.strip():
                if grandchildren:
                    grandchildren[-1].tail = (grandchildren[-1].tail or "") + child.tail
                else:
                    el.text = (el.text or "") + child.tail
            el.remove(child)

    _collapse_tracking(body)

    # --- Run alignment pipeline on the CDOM ---
    leaves = extract_leaves(body)
    scores = align(leaves, clean_text)
    label_nodes(body, scores, threshold)
    extracted = extract_text(body)
    metrics = evaluate(extracted, clean_text)

    # --- Build orig-id → label mapping ---
    label_map: dict[str, str] = {}
    for el in body.iter():
        if not isinstance(el.tag, str):
            continue
        label = el.get("data-label")
        if not label:
            continue
        oid = el.get("data-orig-id")
        if oid:
            label_map[oid] = label
        # Elements absorbed during chain collapsing inherit this label too.
        merged = el.get("data-merged-ids")
        if merged:
            for mid in merged.split(","):
                if mid:
                    label_map[mid] = label

    # --- Apply labels to original doc ---
    try:
        orig_body = orig_doc.body
    except IndexError:
        orig_body = None
    if orig_body is None:
        orig_body = orig_doc

    for el in orig_body.iter():
        if not isinstance(el.tag, str):
            continue
        oid = el.get("data-orig-id")
        if oid and oid in label_map:
            el.set("data-label", label_map[oid])
        elif el.tag in SKIP_TAGS:
            # Skip tags were removed before alignment, so they never appear
            # in label_map; they are boilerplate by definition.
            el.set("data-label", "boilerplate")
        # Clean up tracking attributes
        for attr in ("data-orig-id",):
            if attr in el.attrib:
                del el.attrib[attr]

    # Propagate: unlabeled nodes inherit from children.
    def _propagate(el: etree._Element) -> bool:
        if not isinstance(el.tag, str):
            return False
        if el.get("data-label"):
            return el.get("data-label") == "content"
        # Materialize the list before any(): a bare generator would
        # short-circuit on the first content child and leave later siblings
        # (and their subtrees) without a data-label attribute.
        child_results = [_propagate(child) for child in el]
        has_content = any(child_results)
        el.set("data-label", "content" if has_content else "boilerplate")
        return has_content

    _propagate(orig_body)

    # Label <head> as boilerplate (best-effort: some fragments have none).
    try:
        head = orig_doc.head
        if head is not None and not head.get("data-label"):
            head.set("data-label", "boilerplate")
    except Exception:
        pass

    labeled_html = etree.tostring(orig_doc, encoding="unicode", method="html")
    return labeled_html, extracted, metrics
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: web2textpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python reimplementation of the Web2Text pipeline for labeling HTML DOM nodes as content or boilerplate
|
|
5
|
+
Project-URL: Homepage, https://github.com/williambrach/web2textpy
|
|
6
|
+
Project-URL: Repository, https://github.com/williambrach/web2textpy
|
|
7
|
+
Author: William Brach
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: lxml>=6.0.2
|
|
22
|
+
Provides-Extra: cli
|
|
23
|
+
Requires-Dist: datasets>=4.8.4; extra == 'cli'
|
|
24
|
+
Requires-Dist: rouge-score>=0.1.2; extra == 'cli'
|
|
25
|
+
Requires-Dist: sacrebleu>=2.6.0; extra == 'cli'
|
|
26
|
+
Provides-Extra: eval
|
|
27
|
+
Requires-Dist: rouge-score>=0.1.2; extra == 'eval'
|
|
28
|
+
Requires-Dist: sacrebleu>=2.6.0; extra == 'eval'
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# web2textpy
|
|
32
|
+
|
|
33
|
+
Python reimplementation of the [Web2Text](https://github.com/dalab/web2text) pipeline for labeling HTML DOM nodes as **content** or **boilerplate** using paired `(raw_html, clean_text)` data.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv add web2textpy
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from datasets import load_dataset
|
|
45
|
+
from web2text import run_pipeline
|
|
46
|
+
|
|
47
|
+
ds = load_dataset("williambrach/html-boilerplate-labeled", split="test")
|
|
48
|
+
row = ds[0]
|
|
49
|
+
|
|
50
|
+
tree, extracted_text, metrics = run_pipeline(row["html"], row["text"])
|
|
51
|
+
|
|
52
|
+
print(extracted_text[:200])
|
|
53
|
+
print(metrics)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Step-by-Step API
|
|
57
|
+
|
|
58
|
+
Each stage of the pipeline is exposed as a standalone function:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from web2text import build_cdom, extract_leaves, align, label_nodes, extract_text, evaluate
|
|
62
|
+
|
|
63
|
+
# 1. Parse HTML into a collapsed DOM tree
|
|
64
|
+
tree = build_cdom(html_string)
|
|
65
|
+
|
|
66
|
+
# 2. Extract ordered text-bearing leaf nodes
|
|
67
|
+
leaves = extract_leaves(tree) # [(element, "normalized text"), ...]
|
|
68
|
+
|
|
69
|
+
# 3. Align leaf texts against ground-truth clean text
|
|
70
|
+
scores = align(leaves, clean_text) # {leaf_id: 0.0-1.0 match score}
|
|
71
|
+
|
|
72
|
+
# 4. Label each node as "content" or "boilerplate"
|
|
73
|
+
tree = label_nodes(tree, scores, threshold=0.667)
|
|
74
|
+
|
|
75
|
+
# 5. Extract text from content-labeled nodes
|
|
76
|
+
result = extract_text(tree)
|
|
77
|
+
|
|
78
|
+
# 6. Evaluate against ground truth
|
|
79
|
+
metrics = evaluate(result, clean_text)
|
|
80
|
+
# => {'token_f1': 0.99, 'precision': 0.99, 'recall': 0.99, 'rouge1_f': 0.99, 'bleu': 98.5, 'chrf': 98.8}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## How the Matching Algorithm Works
|
|
84
|
+
|
|
85
|
+
Given raw HTML and its known clean text, the algorithm determines which DOM nodes are content versus boilerplate in six steps:
|
|
86
|
+
|
|
87
|
+
1. **Simplify the DOM** — strip non-content tags (`<script>`, `<style>`, etc.) and collapse single-child chains into a Collapsed DOM (CDOM) representation
|
|
88
|
+
2. **Collect leaf text** — walk the CDOM, concatenate text from every leaf node into one source string with tracked character offsets
|
|
89
|
+
3. **Find anchors** — identify 10-character substrings that appear exactly once in both the source and clean text, splitting the problem into independent segments
|
|
90
|
+
4. **DP alignment** — for each segment between anchors, run character-level dynamic programming with affine gap penalties to map source characters to clean-text characters
|
|
91
|
+
5. **Score leaves** — map alignment results back to leaf boundaries via stored offsets, giving each leaf a score: `matched_chars / total_chars`
|
|
92
|
+
6. **Label nodes** — leaves scoring above `0.667` are labeled `"content"`, the rest `"boilerplate"`, with labels propagating upward to parents
|
|
93
|
+
|
|
94
|
+

|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
## Dataset
|
|
98
|
+
|
|
99
|
+
Dataset: [williambrach/html-boilerplate-labeled](https://huggingface.co/datasets/williambrach/html-boilerplate-labeled) — ~4k pages from CleanEval, Dragnet, CETD, Readability, and others (3,985 pages total).
|
|
100
|
+
|
|
101
|
+
| Source | Train (ROUGE-1 F) | Test (ROUGE-1 F) |
|
|
102
|
+
|--------------------|-------------------|------------------|
|
|
103
|
+
| readability | 0.993 (92) | 0.997 (23) |
|
|
104
|
+
| scrapinghub | 0.991 (145) | 0.996 (36) |
|
|
105
|
+
| cetd | 0.993 (560) | 0.987 (140) |
|
|
106
|
+
| google-trends-2017 | 0.986 (144) | 0.995 (36) |
|
|
107
|
+
| cleanportaleval | 0.985 (57) | 0.971 (14) |
|
|
108
|
+
| cleaneval | 0.985 (590) | 0.991 (148) |
|
|
109
|
+
| dragnet | 0.983 (1,103) | 0.983 (276) |
|
|
110
|
+
| l3s-gn1 | 0.920 (497) | 0.927 (124) |
|
|
111
|
+
| **Overall** | **0.976** (3,188) | **0.978** (797) |
|
|
112
|
+
|
|
113
|
+
> Sample counts in parentheses.
|
|
114
|
+
|
|
115
|
+
## Original Work
|
|
116
|
+
|
|
117
|
+
- **Paper**: Vogels et al., "Web2Text: Deep Structured Boilerplate Removal" (ECIR 2018) — [arxiv.org/abs/1801.02607](https://arxiv.org/abs/1801.02607)
|
|
118
|
+
- **Original implementation** (Scala): [github.com/dalab/web2text](https://github.com/dalab/web2text)
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
web2text.py,sha256=-VDlyjafOBSPFKkSji0sctD0yfH-rYSK5v1HnTeltgE,32864
|
|
2
|
+
web2textpy-0.1.0.dist-info/METADATA,sha256=uXg3KL09aHrPku17BTziHsVRQu_d_zKTRy8YH1aH23Y,5081
|
|
3
|
+
web2textpy-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
4
|
+
web2textpy-0.1.0.dist-info/licenses/LICENSE,sha256=89q9i9qb5tf1LUyum9pM1pdA5X1fUowF_t7fpvhNpEE,1070
|
|
5
|
+
web2textpy-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 William Brach
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|