iflow-mcp_janspoerer-mcp_browser_use 0.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/METADATA +26 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/RECORD +50 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/WHEEL +5 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/entry_points.txt +2 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/licenses/LICENSE +201 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/top_level.txt +1 -0
- mcp_browser_use/__init__.py +2 -0
- mcp_browser_use/__main__.py +1347 -0
- mcp_browser_use/actions/__init__.py +1 -0
- mcp_browser_use/actions/elements.py +173 -0
- mcp_browser_use/actions/extraction.py +864 -0
- mcp_browser_use/actions/keyboard.py +43 -0
- mcp_browser_use/actions/navigation.py +73 -0
- mcp_browser_use/actions/screenshots.py +85 -0
- mcp_browser_use/browser/__init__.py +1 -0
- mcp_browser_use/browser/chrome.py +150 -0
- mcp_browser_use/browser/chrome_executable.py +204 -0
- mcp_browser_use/browser/chrome_launcher.py +330 -0
- mcp_browser_use/browser/chrome_process.py +104 -0
- mcp_browser_use/browser/devtools.py +230 -0
- mcp_browser_use/browser/driver.py +322 -0
- mcp_browser_use/browser/process.py +133 -0
- mcp_browser_use/cleaners.py +530 -0
- mcp_browser_use/config/__init__.py +30 -0
- mcp_browser_use/config/environment.py +155 -0
- mcp_browser_use/config/paths.py +97 -0
- mcp_browser_use/constants.py +68 -0
- mcp_browser_use/context.py +150 -0
- mcp_browser_use/context_pack.py +85 -0
- mcp_browser_use/decorators/__init__.py +13 -0
- mcp_browser_use/decorators/ensure.py +84 -0
- mcp_browser_use/decorators/envelope.py +83 -0
- mcp_browser_use/decorators/locking.py +172 -0
- mcp_browser_use/helpers.py +173 -0
- mcp_browser_use/helpers_context.py +261 -0
- mcp_browser_use/locking/__init__.py +1 -0
- mcp_browser_use/locking/action_lock.py +190 -0
- mcp_browser_use/locking/file_mutex.py +139 -0
- mcp_browser_use/locking/window_registry.py +178 -0
- mcp_browser_use/tools/__init__.py +59 -0
- mcp_browser_use/tools/browser_management.py +260 -0
- mcp_browser_use/tools/debugging.py +195 -0
- mcp_browser_use/tools/extraction.py +58 -0
- mcp_browser_use/tools/interaction.py +323 -0
- mcp_browser_use/tools/navigation.py +84 -0
- mcp_browser_use/tools/screenshots.py +116 -0
- mcp_browser_use/utils/__init__.py +1 -0
- mcp_browser_use/utils/diagnostics.py +85 -0
- mcp_browser_use/utils/html_utils.py +118 -0
- mcp_browser_use/utils/retry.py +57 -0
mcp_browser_use/cleaners.py
@@ -0,0 +1,530 @@

```python
# mcp_browser_use/cleaners.py

import re
from typing import Tuple, Dict, Optional, Sequence, Pattern, Union

NOISE_ID_CLASS_PAT = re.compile(
    r"(gtm|gtag|analytics|ad[s-]?|adslot|sponsor|cookie[-_ ]?banner|chat[-_ ]?widget)",
    re.I,
)

HIDDEN_CLASS_PAT = re.compile(r"(sr-only|visually-hidden|offscreen)", re.I)


def approx_token_count(text: str) -> int:
    # Fast heuristic: ~4 chars per token
    return max(0, len(text) // 4)
```
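As a quick illustration of the heuristic (hypothetical inputs, not from the package's tests), a 24-character string maps to 6 approximate tokens:

```python
# 24 characters // 4 -> 6 approximate tokens (hypothetical example)
assert approx_token_count("hello world this is text") == 6
assert approx_token_count("") == 0
```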
```python
# CDN detection and cleanup helpers
CDN_HOST_PATS = [
    re.compile(r"(?:^|\.)cdn(?:[\.-]|$)", re.I),  # matches cdn.*, *.cdn-foo.*, *.cdn.foo.*
    re.compile(r"/cdn/", re.I),                   # handles relative /cdn/... paths
]


def _build_cdn_pats(extra: Optional[Sequence[Union[str, Pattern[str]]]]) -> list[Pattern[str]]:
    pats: list[Pattern[str]] = list(CDN_HOST_PATS)
    for p in extra or []:
        if isinstance(p, str):
            pats.append(re.compile(p, re.I))
        elif hasattr(p, "search"):
            pats.append(p)  # already a compiled regex
    return pats


def _is_cdn_url(url: str, extra_pats=None) -> bool:
    if not isinstance(url, str) or not url.strip():
        return False
    s = url.strip().strip('\'"')
    # extract from url(...) if style
    s = re.sub(r"^url\((.+?)\)$", r"\1", s).strip('\'"')
    # srcset often has descriptors like "800w" – take only the URL portion
    s = s.split()[0] if s else s

    # support scheme-less //cdn.host
    host, path = "", ""
    try:
        if s.startswith("//"):
            host = s[2:].split("/", 1)[0].lower()
            path = "/" + s[2:].split("/", 1)[1] if "/" in s[2:] else ""
        else:
            from urllib.parse import urlparse

            p = urlparse(s)
            host = (p.netloc or "").lower()
            path = p.path or ""
    except Exception:
        pass

    # Compile any extra string patterns before matching; callers such as
    # _clean_cdn_links pass raw strings, which have no .search method.
    pats = _build_cdn_pats(extra_pats)
    haystacks = [host, path, s]
    return any(p.search(h) for h in haystacks for p in pats)


def _filter_srcset(srcset_val: str, extra_pats=None) -> str | None:
    # Return a cleaned srcset string without CDN candidates, or None if all are removed
    if not srcset_val:
        return None
    pieces = [i.strip() for i in str(srcset_val).split(",") if i.strip()]
    kept = []
    for piece in pieces:
        url = piece.split()[0].strip('\'"')
        if not _is_cdn_url(url, extra_pats):
            kept.append(piece)
    if not kept:
        return None
    return ", ".join(kept)
```
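A minimal sketch of how these helpers classify URLs (the hosts are made up for illustration; this assumes the `_is_cdn_url` above, which compiles string patterns via `_build_cdn_pats`):

```python
# Host-based match: "cdn." hits the (?:^|\.)cdn(?:[\.-]|$) pattern.
assert _is_cdn_url("https://cdn.example.com/app.js")
# Scheme-less URLs are parsed manually.
assert _is_cdn_url("//cdn.example.com/app.js")
# Relative paths match via the /cdn/ pattern.
assert _is_cdn_url("/cdn/img/logo.png")
assert not _is_cdn_url("https://example.com/app.js")

# Extra patterns may be plain strings; they are compiled case-insensitively.
assert _is_cdn_url("https://assets.fastly.net/x.js", [r"fastly\.net"])

# srcset filtering keeps only the non-CDN candidates.
assert _filter_srcset("/a.png 1x, https://cdn.example.com/a.png 2x") == "/a.png 1x"
# None signals that the whole attribute should be dropped.
assert _filter_srcset("https://cdn.example.com/a.png 1x") is None
```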
```python
def _is_button_like(el) -> bool:
    """
    Check if an element should be treated as button-like for class preservation.

    Args:
        el: BeautifulSoup element to check

    Returns:
        bool: True if element is <button>, <input type="button/submit/reset/image">, or role="button"
    """
    try:
        tag = (el.name or "").lower()
    except Exception:
        tag = ""
    if tag == "button":
        return True
    typ = str(el.get("type", "")).lower()
    if tag == "input" and typ in ("button", "submit", "reset", "image"):
        return True
    role = str(el.get("role", "")).lower()
    if role == "button":
        return True
    return False
```
```python
def _remove_comments(soup, pruned_counts: Dict[str, int]) -> None:
    """
    Remove HTML comments from the document to save tokens.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
    """
    from bs4 import Comment

    for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        c.extract()
        pruned_counts["comments_removed"] += 1
```
```python
def _remove_scripts_and_styles(soup, pruned_counts: Dict[str, int]) -> None:
    """
    Remove non-content tags like scripts, styles, SVG, and non-canonical links.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
    """
    # Remove scripts/styles/noscript/template/svg/canvas/meta/source/track
    for tag_name in ["script", "style", "noscript", "template", "canvas", "svg", "meta", "source", "track"]:
        removed = soup.find_all(tag_name)
        key = tag_name if tag_name in ["script", "style"] else "noise"
        pruned_counts[key] = pruned_counts.get(key, 0) + len(removed)
        for t in removed:
            t.decompose()

    # Remove <link> except canonical (robust to str vs list)
    for link in soup.find_all("link"):
        rel = link.get("rel")
        rels = [s.lower() for s in rel] if isinstance(rel, (list, tuple)) else ([str(rel).lower()] if rel else [])
        if "canonical" in rels:
            continue
        pruned_counts["noise"] += 1
        link.decompose()
```
```python
def _remove_noise_containers(soup, pruned_counts: Dict[str, int], prune_hidden: bool) -> None:
    """
    Remove ads, trackers, hidden elements, and oversized dropdowns.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
        prune_hidden: If True, remove hidden elements and hidden inputs
    """
    removed_noise = 0
    removed_hidden = 0

    for el in soup.find_all(True):
        if el.attrs is None:
            continue

        idv = el.get("id") or ""
        classes = el.get("class") or []
        classv = " ".join(classes) if isinstance(classes, (list, tuple)) else str(classes)

        aria_hidden = str(el.get("aria-hidden", "")).strip().lower() == "true"
        style_val = el.get("style")

        style_hidden = False
        if isinstance(style_val, str):
            sv = style_val.lower()
            if re.search(r"display\s*:\s*none\b", sv) or re.search(r"visibility\s*:\s*hidden\b", sv):
                style_hidden = True

        hidden_attr = el.has_attr("hidden") or aria_hidden or style_hidden

        # Requires NOISE_ID_CLASS_PAT / HIDDEN_CLASS_PAT to be defined at module scope
        remove_for_noise = bool(NOISE_ID_CLASS_PAT.search(idv) or NOISE_ID_CLASS_PAT.search(classv))
        remove_for_hidden = bool(hidden_attr or HIDDEN_CLASS_PAT.search(classv))

        if remove_for_noise or (prune_hidden and remove_for_hidden):
            if remove_for_noise:
                removed_noise += 1
            if prune_hidden and remove_for_hidden:
                removed_hidden += 1
            el.decompose()

    pruned_counts["noise"] += removed_noise
    pruned_counts["hidden_removed"] += removed_hidden

    # Remove hidden inputs explicitly
    if prune_hidden:
        hidden_inputs_removed = 0
        for inp in soup.find_all("input"):
            typ = str(inp.get("type", "")).lower()
            if typ == "hidden":
                inp.decompose()
                hidden_inputs_removed += 1
        pruned_counts["hidden_removed"] += hidden_inputs_removed

    # Remove large select dropdowns that cause token overflow
    select_removed = 0
    for select in soup.find_all("select"):
        options = select.find_all("option")
        if len(options) > 5:
            select.decompose()
            select_removed += 1
    pruned_counts["noise"] += select_removed

    # Also remove JavaScript dropdown menus with many items
    dropdown_removed = 0
    for dropdown_menu in soup.find_all("div", class_=re.compile(r"dropdown-menu")):
        dropdown_items = dropdown_menu.find_all(class_=re.compile(r"dropdown-item"))
        if len(dropdown_items) > 5:
            dropdown_menu.decompose()
            dropdown_removed += 1
    pruned_counts["noise"] += dropdown_removed
```
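For example (illustrative markup; assumes `bs4` is installed):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<div class="cookie-banner">Accept?</div>'
    '<p style="display:none">secret</p>'
    "<p>visible</p>",
    "html.parser",
)
counts = {"noise": 0, "hidden_removed": 0}
_remove_noise_containers(soup, counts, prune_hidden=True)
print(soup)  # <p>visible</p>
```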
```python
def _clean_cdn_links(
    soup,
    pruned_counts: Dict[str, int],
    cdn_host_patterns: list[str] | None,
    drop_cdn_elements: bool,
) -> None:
    """
    Strip CDN URLs from attributes, inline styles, and text nodes.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
        cdn_host_patterns: Additional CDN pattern strings to match
        drop_cdn_elements: If True, remove entire asset tags with CDN URLs
    """
    cdn_removed = 0
    # Common URL-carrying attributes to inspect
    url_attrs = ("src", "href", "poster", "data-src", "data-lazy", "data-original", "data-lazy-src", "data-srcset")
    asset_tags = {"img", "script", "link", "source", "video", "audio", "track"}

    for el in list(soup.find_all(True)):
        # Skip elements already destroyed by an earlier decompose()
        if el.attrs is None:
            continue

        # Clean srcset by filtering out CDN candidates
        if el.has_attr("srcset"):
            cleaned = _filter_srcset(str(el.get("srcset")), cdn_host_patterns)
            original = str(el.get("srcset"))
            if cleaned != original:
                if cleaned:
                    el["srcset"] = cleaned
                else:
                    try:
                        del el.attrs["srcset"]
                    except Exception:
                        pass
                cdn_removed += 1

        # Remove CDN URLs from URL attributes
        removed_el = False
        for attr in url_attrs:
            if not el.has_attr(attr):
                continue
            val = str(el.get(attr, ""))
            if _is_cdn_url(val, cdn_host_patterns):
                if drop_cdn_elements and el.name in asset_tags:
                    el.decompose()
                    cdn_removed += 1
                    removed_el = True
                    break  # element is gone
                else:
                    try:
                        del el.attrs[attr]
                    except Exception:
                        pass
                    cdn_removed += 1
        if removed_el:
            continue  # a decomposed element must not be touched again

        # Strip any inline style url(...) pointing to CDNs
        if el.has_attr("style"):
            style_val = str(el["style"])

            # Remove only url(...) tokens that are CDN; keep the rest of the style intact
            def repl(m):
                raw = m.group(1).strip('\'"')
                return "" if _is_cdn_url(raw, cdn_host_patterns) else m.group(0)

            new_style = re.sub(r"url\((.+?)\)", repl, style_val)
            if new_style != style_val:
                # Clean leftover artifacts like empty declarations
                new_style = re.sub(r"\s*;\s*;\s*", ";", new_style).strip(" ;")
                if new_style:
                    el["style"] = new_style
                else:
                    try:
                        del el.attrs["style"]
                    except Exception:
                        pass
                cdn_removed += 1

    pruned_counts["cdn_links_removed"] += cdn_removed

    # Optional: remove plaintext CDN URLs from text nodes
    for t in soup.find_all(string=True):
        new_t = re.sub(r"https?://[^ \t\n\r,]*cdn[^ \t\n\r,]*", "", str(t), flags=re.I)
        if new_t != str(t):
            t.replace_with(new_t)
            pruned_counts["cdn_links_removed"] += 1
```
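A small end-to-end sketch (assuming `bs4` is installed; the markup is illustrative):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<img src="https://cdn.example.com/x.png" srcset="/x.png 1x, https://cdn.example.com/x.png 2x">',
    "html.parser",
)
counts = {"cdn_links_removed": 0}
_clean_cdn_links(soup, counts, cdn_host_patterns=None, drop_cdn_elements=False)
print(soup)    # <img srcset="/x.png 1x"/>  (CDN src dropped, srcset filtered)
print(counts)  # {'cdn_links_removed': 2}
```

With `drop_cdn_elements=True`, the `<img>` would instead be removed outright, since its `src` is a CDN URL and `img` is in `asset_tags`.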
```python
def _prune_attributes(soup, pruned_counts: Dict[str, int], prune_classes_except_buttons: bool) -> None:
    """
    Prune non-essential attributes and normalize remaining ones.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
        prune_classes_except_buttons: If True, remove class attributes except from button-like elements
    """
    # Keep class for button-like elements only; drop elsewhere if enabled
    keep_attrs = {"id", "class", "href", "src", "alt", "title", "type", "value", "name", "role", "rel"}

    for el in soup.find_all(True):
        if el.attrs is None:
            continue

        for attr in list(el.attrs.keys()):
            # Always keep aria-* attributes
            if attr.startswith("aria-"):
                continue

            # Class pruning toggle
            if attr == "class" and prune_classes_except_buttons:
                if not _is_button_like(el):
                    try:
                        del el.attrs["class"]
                        pruned_counts["attr_trim"] += 1
                        pruned_counts["class_drops"] += 1
                    except Exception:
                        pass
                continue  # move on to next attr

            # Drop attributes not in the allowlist
            if attr not in keep_attrs:
                del el.attrs[attr]
                pruned_counts["attr_trim"] += 1
                continue

            # Normalize values
            val = el.get(attr)

            # Preserve list type for class/rel
            if attr in {"class", "rel"} and isinstance(val, (list, tuple)):
                pass  # keep as list
            else:
                # Normalize other list-like values to strings
                if isinstance(val, (list, tuple)):
                    val = " ".join(map(str, val))
                    el[attr] = val

            # Truncate descriptive text fields
            if attr in {"alt", "title"} and isinstance(val, str) and len(val) > 80:
                el[attr] = val[:80] + "...(trunc)"
                pruned_counts["attr_trim"] += 1

        # Strip data URIs on src
        src = el.get("src")
        if isinstance(src, str) and src.startswith("data:"):
            try:
                del el.attrs["src"]
            except Exception:
                pass
            pruned_counts["attr_trim"] += 1
```
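For instance (illustrative markup), an `onclick` handler and a non-button `class` are dropped, while a button keeps its classes:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<a href="/x" onclick="go()" class="nav-link">Go</a>'
    '<button class="btn btn-primary" type="submit">Send</button>',
    "html.parser",
)
counts = {"attr_trim": 0, "class_drops": 0}
_prune_attributes(soup, counts, prune_classes_except_buttons=True)
print(soup)
# <a href="/x">Go</a><button class="btn btn-primary" type="submit">Send</button>
```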
```python
def _collapse_wrappers(soup, pruned_counts: Dict[str, int]) -> None:
    """
    Collapse empty div/span wrappers with a single child and replace images with minimal attributes.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
    """
    # Replace images with minimal attrs
    for img in soup.find_all("img"):
        for k in list(img.attrs.keys()):
            if k not in {"alt", "title"}:
                del img.attrs[k]
        pruned_counts["media"] += 1

    # Collapse empty div/span wrappers with a single child
    changed = True
    while changed:
        changed = False
        for el in list(soup.find_all(["div", "span"])):
            if not el.parent:
                continue
            children = [c for c in el.children if getattr(c, "name", None)]
            # get_text() covers the whole subtree, so only wrappers with no
            # visible text anywhere inside are collapsed
            if len(children) == 1 and not (el.get_text(strip=True) or "").strip():
                el.replace_with(children[0])
                pruned_counts["wrapper"] += 1
                changed = True
                break
```
```python
def _normalize_whitespace(soup, pruned_counts: Dict[str, int]) -> str:
    """
    Normalize whitespace in text nodes and final HTML output.

    Args:
        soup: BeautifulSoup object to process
        pruned_counts: Dictionary to update with removal counts

    Returns:
        str: HTML string with normalized whitespace
    """
    from bs4 import NavigableString

    WHITESPACE_SENSITIVE = {"pre", "code", "textarea"}
    changed_nodes = 0

    for t in soup.find_all(string=True):
        parent = getattr(t, "parent", None)
        parent_name = (getattr(parent, "name", "") or "").lower()
        if parent_name in WHITESPACE_SENSITIVE:
            continue
        new_text = re.sub(r"\s+", " ", str(t))
        if new_text != str(t):
            t.replace_with(NavigableString(new_text))
            changed_nodes += 1
    pruned_counts["whitespace_trim"] += changed_nodes

    html_out = str(soup)

    # As a final safety net, ensure no literal newlines/tabs remain
    # (note: this string-level pass also touches <pre>/<code> content,
    # which the node-level pass above deliberately skipped)
    before_len = len(html_out)
    html_out = re.sub(r"[\r\n\t]+", " ", html_out)
    html_out = re.sub(r" {2,}", " ", html_out)
    html_out = html_out.strip()
    if len(html_out) < before_len:
        pruned_counts["whitespace_trim"] += 1

    return html_out
```
```python
def basic_prune(
    html: str,
    level: int,
    prune_hidden: bool = True,
    prune_classes_except_buttons: bool = True,
    prune_linebreaks: bool = True,
    remove_cdn_links: bool = True,
    cdn_host_patterns: list[str] | None = None,
    drop_cdn_elements: bool = False,  # if True, remove entire asset tags
) -> Tuple[str, Dict[str, int]]:
    """
    Perform structural pruning on raw HTML to remove non-content noise.

    Args:
        html: Raw HTML string.
        level: Cleaning level. Higher = more aggressive.
        prune_hidden: If True, remove hidden elements and <input type="hidden">.
        prune_classes_except_buttons: If True, drop 'class' attributes for all
            elements except button-like ones (<button>, certain <input>, role="button").
        prune_linebreaks: If True, remove line breaks/tabs and collapse excessive
            whitespace in text nodes (skipping <pre>, <code>, <textarea>).
        remove_cdn_links: If True, strip CDN URLs from attributes and styles.
        cdn_host_patterns: Additional CDN pattern strings to match.
        drop_cdn_elements: If True, remove entire asset tags with CDN URLs.
    """
    pruned_counts = {
        "script": 0,
        "style": 0,
        "noise": 0,
        "attr_trim": 0,
        "wrapper": 0,
        "media": 0,
        "hidden_removed": 0,
        "class_drops": 0,
        "whitespace_trim": 0,
        "comments_removed": 0,
        "cdn_links_removed": 0,
    }

    import bs4

    soup = bs4.BeautifulSoup(html or "", "html.parser")

    # Phase 0: Remove HTML comments
    _remove_comments(soup=soup, pruned_counts=pruned_counts)

    # Phase 1: Remove scripts, styles, and non-content tags
    _remove_scripts_and_styles(soup=soup, pruned_counts=pruned_counts)

    # Phase 2: Remove noise containers and hidden elements (if level >= 1)
    if level >= 1:
        _remove_noise_containers(soup=soup, pruned_counts=pruned_counts, prune_hidden=prune_hidden)

    # Phase 2.5: Strip CDN links (if enabled)
    if remove_cdn_links:
        _clean_cdn_links(soup=soup, pruned_counts=pruned_counts, cdn_host_patterns=cdn_host_patterns, drop_cdn_elements=drop_cdn_elements)

    # Phase 3: Prune attributes (if level >= 2)
    if level >= 2:
        _prune_attributes(soup=soup, pruned_counts=pruned_counts, prune_classes_except_buttons=prune_classes_except_buttons)

    # Phase 4: Collapse wrappers (if level >= 3)
    if level >= 3:
        _collapse_wrappers(soup=soup, pruned_counts=pruned_counts)

    # Phase 5: Normalize whitespace (if enabled)
    if prune_linebreaks:
        html_out = _normalize_whitespace(soup=soup, pruned_counts=pruned_counts)
    else:
        html_out = str(soup)

    return html_out, pruned_counts
```
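A minimal usage sketch (the snippet and expected output are illustrative, not from the package's documentation):

```python
raw = (
    '<div id="page" data-reactid="7">'
    "<script>track()</script>"
    '<input type="hidden" name="csrf" value="abc">'
    '<p class="lead">Hello   world</p>'
    "</div>"
)
cleaned, counts = basic_prune(raw, level=2)
print(cleaned)
# -> <div id="page"><p>Hello world</p></div>
print(counts["script"], counts["hidden_removed"], counts["class_drops"])
# -> 1 1 1
```

At `level=2` the script, hidden input, `data-reactid`, and non-button `class` are all stripped; `level=3` would additionally collapse text-free wrapper divs.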
```python
def extract_outline(html: str, max_items: int = 64):
    import bs4

    soup = bs4.BeautifulSoup(html or "", "html.parser")
    outline = []
    for level, tag in [(1, "h1"), (2, "h2"), (3, "h3"), (4, "h4")]:
        for el in soup.find_all(tag):
            text = el.get_text(" ", strip=True)
            wc = len(text.split())
            # build a rough css_path
            css_path = None
            try:
                # naive css path
                parts = []
                cur = el
                while cur and cur.name and cur.name != "[document]":
                    idp = ("#" + cur.get("id")) if cur.has_attr("id") else ""
                    cls = "." + ".".join(cur.get("class", [])) if cur.has_attr("class") else ""
                    parts.append(f"{cur.name}{idp}{cls}")
                    cur = cur.parent
                css_path = " > ".join(reversed(parts))
            except Exception:
                css_path = None
            outline.append({"level": level, "text": text, "word_count": wc, "css_path": css_path, "subtree_id": None})
            if len(outline) >= max_items:
                return outline
    return outline
```
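A short sketch of the outline extractor on a toy document (illustrative):

```python
outline = extract_outline("<h1 id='top'>Title</h1><h2>Section one</h2>")
for item in outline:
    print(item["level"], item["text"], item["word_count"], item["css_path"])
# 1 Title 1 h1#top
# 2 Section one 2 h2
```

Note that headings are grouped by level rather than by document order, so an `h2` that precedes an `h1` in the source still appears after it in the outline.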
mcp_browser_use/config/__init__.py
@@ -0,0 +1,30 @@

```python
"""Configuration management for browser automation."""

from .environment import (
    get_env_config,
    profile_key,
    is_default_user_data_dir,
)

from .paths import (
    get_lock_dir,
    rendezvous_path,
    start_lock_dir,
    chromedriver_log_path,
    _lock_paths,
    _window_registry_path,
    _same_dir,
)

__all__ = [
    "get_env_config",
    "profile_key",
    "is_default_user_data_dir",
    "get_lock_dir",
    "rendezvous_path",
    "start_lock_dir",
    "chromedriver_log_path",
    "_lock_paths",
    "_window_registry_path",
    "_same_dir",
]
```
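A minimal consumption sketch (the argument-free calls are an assumption; the actual signatures live in `environment.py` and `paths.py`, which are not shown in this hunk):

```python
# Hypothetical usage: the package re-exports these helpers at the top level,
# so callers do not need to know whether a name lives in .environment or .paths.
from mcp_browser_use.config import get_env_config, get_lock_dir

config = get_env_config()   # assumed zero-arg; see environment.py for the real signature
lock_dir = get_lock_dir()   # assumed zero-arg; see paths.py for the real signature
```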