iflow-mcp_janspoerer-mcp_browser_use 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/METADATA +26 -0
  2. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/RECORD +50 -0
  3. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/WHEEL +5 -0
  4. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/entry_points.txt +2 -0
  5. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/licenses/LICENSE +201 -0
  6. iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/top_level.txt +1 -0
  7. mcp_browser_use/__init__.py +2 -0
  8. mcp_browser_use/__main__.py +1347 -0
  9. mcp_browser_use/actions/__init__.py +1 -0
  10. mcp_browser_use/actions/elements.py +173 -0
  11. mcp_browser_use/actions/extraction.py +864 -0
  12. mcp_browser_use/actions/keyboard.py +43 -0
  13. mcp_browser_use/actions/navigation.py +73 -0
  14. mcp_browser_use/actions/screenshots.py +85 -0
  15. mcp_browser_use/browser/__init__.py +1 -0
  16. mcp_browser_use/browser/chrome.py +150 -0
  17. mcp_browser_use/browser/chrome_executable.py +204 -0
  18. mcp_browser_use/browser/chrome_launcher.py +330 -0
  19. mcp_browser_use/browser/chrome_process.py +104 -0
  20. mcp_browser_use/browser/devtools.py +230 -0
  21. mcp_browser_use/browser/driver.py +322 -0
  22. mcp_browser_use/browser/process.py +133 -0
  23. mcp_browser_use/cleaners.py +530 -0
  24. mcp_browser_use/config/__init__.py +30 -0
  25. mcp_browser_use/config/environment.py +155 -0
  26. mcp_browser_use/config/paths.py +97 -0
  27. mcp_browser_use/constants.py +68 -0
  28. mcp_browser_use/context.py +150 -0
  29. mcp_browser_use/context_pack.py +85 -0
  30. mcp_browser_use/decorators/__init__.py +13 -0
  31. mcp_browser_use/decorators/ensure.py +84 -0
  32. mcp_browser_use/decorators/envelope.py +83 -0
  33. mcp_browser_use/decorators/locking.py +172 -0
  34. mcp_browser_use/helpers.py +173 -0
  35. mcp_browser_use/helpers_context.py +261 -0
  36. mcp_browser_use/locking/__init__.py +1 -0
  37. mcp_browser_use/locking/action_lock.py +190 -0
  38. mcp_browser_use/locking/file_mutex.py +139 -0
  39. mcp_browser_use/locking/window_registry.py +178 -0
  40. mcp_browser_use/tools/__init__.py +59 -0
  41. mcp_browser_use/tools/browser_management.py +260 -0
  42. mcp_browser_use/tools/debugging.py +195 -0
  43. mcp_browser_use/tools/extraction.py +58 -0
  44. mcp_browser_use/tools/interaction.py +323 -0
  45. mcp_browser_use/tools/navigation.py +84 -0
  46. mcp_browser_use/tools/screenshots.py +116 -0
  47. mcp_browser_use/utils/__init__.py +1 -0
  48. mcp_browser_use/utils/diagnostics.py +85 -0
  49. mcp_browser_use/utils/html_utils.py +118 -0
  50. mcp_browser_use/utils/retry.py +57 -0
@@ -0,0 +1,530 @@
1
+ # mcp_browser_use/cleaners.py
2
+
3
+ import re
4
+ from typing import Tuple, Dict, Optional, Sequence, Pattern, Union
5
+
6
# Regex for id/class values that mark analytics, ads, cookie banners, chat
# widgets and similar non-content noise containers (case-insensitive).
NOISE_ID_CLASS_PAT = re.compile(
    r"(gtm|gtag|analytics|ad[s-]?|adslot|sponsor|cookie[-_ ]?banner|chat[-_ ]?widget)",
    re.I
)

# Regex for utility classes that visually hide content (screen-reader-only
# patterns); matched against an element's class list.
HIDDEN_CLASS_PAT = re.compile(r"(sr-only|visually-hidden|offscreen)", re.I)
12
+
13
def approx_token_count(text: str) -> int:
    """Cheap token estimate for *text*: roughly one token per four characters."""
    quarter_chars = len(text) // 4
    return quarter_chars if quarter_chars > 0 else 0
16
+
17
+
18
+ # CDN detection and cleanup helpers
19
# Default patterns that identify CDN-hosted resources, either by host name
# or by a "/cdn/" path segment. Extended per-call via _build_cdn_pats().
CDN_HOST_PATS = [
    re.compile(r"(?:^|\.)cdn(?:[\.-]|$)", re.I),  # matches cdn.*, *.cdn-foo.*, *.cdn.foo.*
    re.compile(r"/cdn/", re.I),  # handles relative /cdn/... paths
]
23
+
24
def _build_cdn_pats(extra: Optional[Sequence[Union[str, Pattern[str]]]]) -> list[Pattern[str]]:
    """Combine the default CDN patterns with caller-supplied ones.

    Strings are compiled case-insensitively; objects that already expose a
    ``search`` method are assumed to be compiled patterns and used as-is.
    Anything else is silently ignored.
    """
    combined: list[Pattern[str]] = list(CDN_HOST_PATS)
    if extra:
        for candidate in extra:
            if isinstance(candidate, str):
                combined.append(re.compile(candidate, re.I))
            elif hasattr(candidate, "search"):
                combined.append(candidate)
    return combined
32
+
33
def _is_cdn_url(url: str, extra_pats=None) -> bool:
    """Return True when *url* appears to point at a CDN-hosted resource.

    Handles plain URLs, scheme-less ``//host/path`` URLs, CSS ``url(...)``
    wrappers, and srcset entries carrying width descriptors.

    Args:
        url: Candidate URL (or attribute value containing one).
        extra_pats: Optional extra patterns — strings or compiled regexes —
            matched in addition to CDN_HOST_PATS.
    """
    if not isinstance(url, str) or not url.strip():
        return False
    s = url.strip().strip('\'"')
    # extract from url(...) if style
    s = re.sub(r"^url\((.+?)\)$", r"\1", s).strip('\'"')
    # srcset often has descriptors like "800w" – take only the URL portion
    s = s.split()[0] if s else s

    # support scheme-less //cdn.host
    host, path = "", ""
    try:
        if s.startswith("//"):
            host = s[2:].split("/", 1)[0].lower()
            path = "/" + s[2:].split("/", 1)[1] if "/" in s[2:] else ""
        else:
            from urllib.parse import urlparse
            p = urlparse(s)
            host = (p.netloc or "").lower()
            path = p.path or ""
    except Exception:
        # Unparseable input: fall back to matching against the raw string.
        pass

    # BUG FIX: extra_pats may contain plain strings (callers pass
    # cdn_host_patterns: list[str]); the old code appended them unchanged and
    # then called .search() on them, raising AttributeError. Compile them via
    # _build_cdn_pats so every entry supports .search().
    pats = _build_cdn_pats(extra_pats)
    haystacks = [host, path, s]
    return any(p.search(h) for h in haystacks for p in pats)
59
+
60
def _filter_srcset(srcset_val: str, extra_pats=None) -> str | None:
    """Drop CDN candidates from a srcset attribute value.

    Returns the cleaned srcset string, or None when the input is empty or
    every candidate pointed at a CDN.
    """
    if not srcset_val:
        return None
    survivors = []
    for candidate in str(srcset_val).split(","):
        candidate = candidate.strip()
        if not candidate:
            continue
        # First whitespace-separated token is the URL; the rest is a descriptor.
        url = candidate.split()[0].strip('\'"')
        if not _is_cdn_url(url, extra_pats):
            survivors.append(candidate)
    return ", ".join(survivors) if survivors else None
73
+
74
+ def _is_button_like(el) -> bool:
75
+ """
76
+ Check if an element should be treated as button-like for class preservation.
77
+
78
+ Args:
79
+ el: BeautifulSoup element to check
80
+
81
+ Returns:
82
+ bool: True if element is <button>, <input type="button/submit/reset/image">, or role="button"
83
+ """
84
+ try:
85
+ tag = (el.name or "").lower()
86
+ except Exception:
87
+ tag = ""
88
+ if tag == "button":
89
+ return True
90
+ typ = str(el.get("type", "")).lower()
91
+ if tag == "input" and typ in ("button", "submit", "reset", "image"):
92
+ return True
93
+ role = str(el.get("role", "")).lower()
94
+ if role == "button":
95
+ return True
96
+ return False
97
+
98
+
99
def _remove_comments(soup, pruned_counts: Dict[str, int]) -> None:
    """
    Strip HTML comments from the document to save tokens.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
    """
    from bs4 import Comment

    comments = soup.find_all(string=lambda node: isinstance(node, Comment))
    for comment in comments:
        comment.extract()
        pruned_counts["comments_removed"] += 1
112
+
113
+
114
def _remove_scripts_and_styles(soup, pruned_counts: Dict[str, int]) -> None:
    """
    Remove non-content tags (scripts, styles, svg, media sources, etc.) and
    every <link> element except rel="canonical".

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
    """
    noise_tag_names = ("script", "style", "noscript", "template", "canvas", "svg", "meta", "source", "track")
    for tag_name in noise_tag_names:
        matches = soup.find_all(tag_name)
        # script/style get dedicated counters; everything else counts as "noise".
        counter_key = tag_name if tag_name in ("script", "style") else "noise"
        pruned_counts[counter_key] = pruned_counts.get(counter_key, 0) + len(matches)
        for match in matches:
            match.decompose()

    # Drop <link> tags unless they declare rel="canonical"; rel may be a
    # string or a list depending on the parser, so normalize first.
    for link in soup.find_all("link"):
        rel = link.get("rel")
        if isinstance(rel, (list, tuple)):
            rels = [r.lower() for r in rel]
        else:
            rels = [str(rel).lower()] if rel else []
        if "canonical" in rels:
            continue
        pruned_counts["noise"] += 1
        link.decompose()
138
+
139
+
140
def _remove_noise_containers(soup, pruned_counts: Dict[str, int], prune_hidden: bool) -> None:
    """
    Remove ads, trackers, hidden elements, and oversized dropdowns.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
        prune_hidden: If True, remove hidden elements and hidden inputs
    """
    removed_noise = 0
    removed_hidden = 0

    for el in soup.find_all(True):
        # BUG FIX: decompose() destroys an element AND its descendants, but
        # those descendants are still in the snapshot returned by find_all();
        # touching a decomposed tag raises (its state is cleared), so skip them.
        if getattr(el, "decomposed", False):
            continue
        if el.attrs is None:
            continue

        idv = el.get("id") or ""
        classes = el.get("class") or []
        classv = " ".join(classes) if isinstance(classes, (list, tuple)) else str(classes)

        aria_hidden = str(el.get("aria-hidden", "")).strip().lower() == "true"
        style_val = el.get("style")

        # Inline-style hiding: display:none or visibility:hidden.
        style_hidden = False
        if isinstance(style_val, str):
            sv = style_val.lower()
            if re.search(r"display\s*:\s*none\b", sv) or re.search(r"visibility\s*:\s*hidden\b", sv):
                style_hidden = True

        hidden_attr = el.has_attr("hidden") or aria_hidden or style_hidden

        # Noise = ad/tracker-looking id or class; hidden = any invisibility hint.
        remove_for_noise = bool(NOISE_ID_CLASS_PAT.search(idv) or NOISE_ID_CLASS_PAT.search(classv))
        remove_for_hidden = bool(hidden_attr or HIDDEN_CLASS_PAT.search(classv))

        if remove_for_noise or (prune_hidden and remove_for_hidden):
            if remove_for_noise:
                removed_noise += 1
            if prune_hidden and remove_for_hidden:
                removed_hidden += 1
            el.decompose()

    pruned_counts["noise"] += removed_noise
    pruned_counts["hidden_removed"] += removed_hidden

    # Remove hidden inputs explicitly
    if prune_hidden:
        hidden_inputs_removed = 0
        for inp in soup.find_all("input"):
            typ = str(inp.get("type", "")).lower()
            if typ == "hidden":
                inp.decompose()
                hidden_inputs_removed += 1
        pruned_counts["hidden_removed"] += hidden_inputs_removed

    # Remove large select dropdowns that cause token overflow
    select_removed = 0
    for select in soup.find_all("select"):
        options = select.find_all("option")
        if len(options) > 5:
            select.decompose()
            select_removed += 1
    pruned_counts["noise"] += select_removed

    # Also remove JavaScript dropdown menus with many items
    dropdown_removed = 0
    for dropdown_menu in soup.find_all("div", class_=re.compile(r"dropdown-menu")):
        dropdown_items = dropdown_menu.find_all(class_=re.compile(r"dropdown-item"))
        if len(dropdown_items) > 5:
            dropdown_menu.decompose()
            dropdown_removed += 1
    pruned_counts["noise"] += dropdown_removed
212
+
213
+
214
def _clean_cdn_links(
    soup,
    pruned_counts: Dict[str, int],
    cdn_host_patterns: list[str] | None,
    drop_cdn_elements: bool,
) -> None:
    """
    Strip CDN URLs from attributes, inline styles, and text nodes.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
        cdn_host_patterns: Additional CDN pattern strings to match
        drop_cdn_elements: If True, remove entire asset tags with CDN URLs
    """
    cdn_removed = 0
    # Common URL-carrying attributes to inspect
    url_attrs = ("src", "href", "poster", "data-src", "data-lazy", "data-original", "data-lazy-src", "data-srcset")
    asset_tags = {"img", "script", "link", "source", "video", "audio", "track"}

    # BUG FIX: cdn_host_patterns arrives as raw strings, but matching calls
    # .search() on every entry; compile them once up front (this also avoids
    # recompiling per element).
    compiled_pats = _build_cdn_pats(cdn_host_patterns)

    for el in list(soup.find_all(True)):
        # BUG FIX: when drop_cdn_elements removed a parent tag, its descendants
        # are still in this snapshot; skip decomposed elements instead of
        # touching their cleared state.
        if getattr(el, "decomposed", False):
            continue

        # Clean srcset by filtering out CDN candidates
        if el.has_attr("srcset"):
            cleaned = _filter_srcset(str(el.get("srcset")), compiled_pats)
            original = str(el.get("srcset"))
            if cleaned != original:
                if cleaned:
                    el["srcset"] = cleaned
                else:
                    try:
                        del el.attrs["srcset"]
                    except Exception:
                        pass
                cdn_removed += 1

        # Remove CDN URLs from URL attributes
        for attr in url_attrs:
            if not el.has_attr(attr):
                continue
            val = str(el.get(attr, ""))
            if _is_cdn_url(val, compiled_pats):
                if drop_cdn_elements and el.name in asset_tags:
                    el.decompose()
                    cdn_removed += 1
                    break  # element is gone
                else:
                    try:
                        del el.attrs[attr]
                    except Exception:
                        pass
                    cdn_removed += 1

        # BUG FIX: the original fell through to the style check even after
        # decomposing the element above, which raises on a destroyed tag.
        if getattr(el, "decomposed", False):
            continue

        # Strip any inline style url(...) pointing to CDNs
        if el.has_attr("style"):
            style_val = str(el["style"])
            # Remove only url(...) tokens that are CDN; keep the rest of the style intact
            def repl(m):
                raw = m.group(1).strip('\'"')
                return "" if _is_cdn_url(raw, compiled_pats) else m.group(0)

            new_style = re.sub(r"url\((.+?)\)", repl, style_val)
            if new_style != style_val:
                # Clean leftover artifacts like empty declarations
                new_style = re.sub(r"\s*;\s*;\s*", ";", new_style).strip(" ;")
                if new_style:
                    el["style"] = new_style
                else:
                    try:
                        del el.attrs["style"]
                    except Exception:
                        pass
                cdn_removed += 1

    pruned_counts["cdn_links_removed"] += cdn_removed

    # Optional: remove plaintext CDN URLs from text nodes
    for t in soup.find_all(string=True):
        new_t = re.sub(r"https?://[^ \t\n\r,]*cdn[^ \t\n\r,]*", "", str(t), flags=re.I)
        if new_t != str(t):
            t.replace_with(new_t)
            pruned_counts["cdn_links_removed"] += 1
295
+
296
+
297
def _prune_attributes(soup, pruned_counts: Dict[str, int], prune_classes_except_buttons: bool) -> None:
    """
    Prune non-essential attributes and normalize the survivors.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
        prune_classes_except_buttons: If True, strip class attributes from
            everything except button-like elements
    """
    # Allowlist of attributes worth keeping for downstream automation.
    keep_attrs = {"id", "class", "href", "src", "alt", "title", "type", "value", "name", "role", "rel"}

    for el in soup.find_all(True):
        if el.attrs is None:
            continue

        for attr in list(el.attrs.keys()):
            # Accessibility attributes are always preserved.
            if attr.startswith("aria-"):
                continue

            # Classes survive only on button-like elements when pruning is on.
            if attr == "class" and prune_classes_except_buttons:
                if not _is_button_like(el):
                    try:
                        del el.attrs["class"]
                        pruned_counts["attr_trim"] += 1
                        pruned_counts["class_drops"] += 1
                    except Exception:
                        pass
                continue  # move on to next attr

            # Anything outside the allowlist is dropped.
            if attr not in keep_attrs:
                del el.attrs[attr]
                pruned_counts["attr_trim"] += 1
                continue

            val = el.get(attr)

            # class/rel stay lists; other list-like values become strings.
            if attr in {"class", "rel"} and isinstance(val, (list, tuple)):
                pass  # keep as list
            else:
                if isinstance(val, (list, tuple)):
                    val = " ".join(map(str, val))
                el[attr] = val

            # Truncate long descriptive text fields.
            if attr in {"alt", "title"} and isinstance(val, str) and len(val) > 80:
                el[attr] = val[:80] + "...(trunc)"
                pruned_counts["attr_trim"] += 1

        # Inline data: URIs are large and useless for automation — drop them.
        src = el.get("src")
        if isinstance(src, str) and src.startswith("data:"):
            try:
                del el.attrs["src"]
            except Exception:
                pass
            pruned_counts["attr_trim"] += 1
360
+
361
+
362
def _collapse_wrappers(soup, pruned_counts: Dict[str, int]) -> None:
    """
    Collapse single-child div/span wrappers and strip images down to
    alt/title attributes.

    Args:
        soup: BeautifulSoup object to modify in-place
        pruned_counts: Dictionary to update with removal counts
    """
    # Keep only descriptive attributes on images.
    for img in soup.find_all("img"):
        for attr in list(img.attrs.keys()):
            if attr not in {"alt", "title"}:
                del img.attrs[attr]
        pruned_counts["media"] += 1

    # Repeatedly unwrap div/span elements that hold exactly one tag child and
    # no visible text, restarting the scan after each replacement until a full
    # pass makes no change.
    while True:
        collapsed = False
        for el in list(soup.find_all(["div", "span"])):
            if not el.parent:
                continue
            tag_children = [child for child in el.children if getattr(child, "name", None)]
            if len(tag_children) == 1 and not (el.get_text(strip=True) or "").strip():
                el.replace_with(tag_children[0])
                pruned_counts["wrapper"] += 1
                collapsed = True
                break
        if not collapsed:
            break
390
+
391
+
392
def _normalize_whitespace(soup, pruned_counts: Dict[str, int]) -> str:
    """
    Collapse runs of whitespace in text nodes (outside pre/code/textarea)
    and in the serialized HTML output.

    Args:
        soup: BeautifulSoup object to process
        pruned_counts: Dictionary to update with removal counts

    Returns:
        str: HTML string with normalized whitespace
    """
    from bs4 import NavigableString

    whitespace_sensitive = {"pre", "code", "textarea"}
    touched_nodes = 0

    for node in soup.find_all(string=True):
        parent = getattr(node, "parent", None)
        parent_name = (getattr(parent, "name", "") or "").lower()
        # NOTE(review): only the direct parent is checked, so text nested
        # deeper inside <pre> (e.g. <pre><span>) is still collapsed — and the
        # serialization safety net below strips newlines regardless.
        if parent_name in whitespace_sensitive:
            continue
        collapsed = re.sub(r"\s+", " ", str(node))
        if collapsed != str(node):
            node.replace_with(NavigableString(collapsed))
            touched_nodes += 1
    pruned_counts["whitespace_trim"] += touched_nodes

    html_out = str(soup)

    # Final safety net: guarantee no literal newlines/tabs survive.
    length_before = len(html_out)
    html_out = re.sub(r"[\r\n\t]+", " ", html_out)
    html_out = re.sub(r" {2,}", " ", html_out)
    html_out = html_out.strip()
    if len(html_out) < length_before:
        pruned_counts["whitespace_trim"] += 1

    return html_out
430
+
431
+
432
def basic_prune(
    html: str,
    level: int,
    prune_hidden: bool = True,
    prune_classes_except_buttons: bool = True,
    prune_linebreaks: bool = True,
    remove_cdn_links: bool = True,
    cdn_host_patterns: list[str] | None = None,
    drop_cdn_elements: bool = False,  # if True, remove entire asset tags
) -> Tuple[str, Dict[str, int]]:
    """
    Perform structural pruning on raw HTML to remove non-content noise.

    Args:
        html: Raw HTML string.
        level: Cleaning level. Higher = more aggressive.
        prune_hidden: If True, remove hidden elements and <input type="hidden">.
        prune_classes_except_buttons: If True, drop 'class' attributes for all
            elements except button-like ones (<button>, certain <input>, role="button").
        prune_linebreaks: If True, remove line breaks/tabs and collapse excessive
            whitespace in text nodes (skipping <pre>, <code>, <textarea>).
        remove_cdn_links: If True, strip CDN URLs from attributes and styles.
        cdn_host_patterns: Additional CDN pattern strings to match.
        drop_cdn_elements: If True, remove entire asset tags with CDN URLs.

    Returns:
        Tuple of (pruned HTML string, counter dict describing what was removed).
    """
    counter_names = (
        "script", "style", "noise", "attr_trim", "wrapper", "media",
        "hidden_removed", "class_drops", "whitespace_trim",
        "comments_removed", "cdn_links_removed",
    )
    pruned_counts = {name: 0 for name in counter_names}

    import bs4
    soup = bs4.BeautifulSoup(html or "", "html.parser")

    # Comments and non-content tags are always removed.
    _remove_comments(soup=soup, pruned_counts=pruned_counts)
    _remove_scripts_and_styles(soup=soup, pruned_counts=pruned_counts)

    # Level 1+: ads, trackers, and (optionally) hidden elements.
    if level >= 1:
        _remove_noise_containers(soup=soup, pruned_counts=pruned_counts, prune_hidden=prune_hidden)

    # CDN stripping is orthogonal to the level.
    if remove_cdn_links:
        _clean_cdn_links(
            soup=soup,
            pruned_counts=pruned_counts,
            cdn_host_patterns=cdn_host_patterns,
            drop_cdn_elements=drop_cdn_elements,
        )

    # Level 2+: attribute pruning; level 3+: wrapper collapsing.
    if level >= 2:
        _prune_attributes(soup=soup, pruned_counts=pruned_counts, prune_classes_except_buttons=prune_classes_except_buttons)
    if level >= 3:
        _collapse_wrappers(soup=soup, pruned_counts=pruned_counts)

    # Final serialization, with or without whitespace normalization.
    if prune_linebreaks:
        html_out = _normalize_whitespace(soup=soup, pruned_counts=pruned_counts)
    else:
        html_out = str(soup)

    return html_out, pruned_counts
503
+
504
def extract_outline(html: str, max_items: int = 64):
    """Collect up to *max_items* h1–h4 headings with rough CSS paths.

    Headings are grouped by level (all h1s first, then h2s, ...), not in
    document order. Each entry carries the heading text, its word count, a
    naive css_path built from tag names/ids/classes, and a None subtree_id.
    """
    import bs4
    soup = bs4.BeautifulSoup(html or "", "html.parser")
    outline = []
    for level, tag in ((1, "h1"), (2, "h2"), (3, "h3"), (4, "h4")):
        for heading in soup.find_all(tag):
            text = heading.get_text(" ", strip=True)
            word_count = len(text.split())
            css_path = None
            try:
                # Walk up to the document root, recording tag#id.classes.
                segments = []
                node = heading
                while node and node.name and node.name != "[document]":
                    id_part = ("#" + node.get("id")) if node.has_attr("id") else ""
                    class_part = "." + ".".join(node.get("class", [])) if node.has_attr("class") else ""
                    segments.append(f"{node.name}{id_part}{class_part}")
                    node = node.parent
                css_path = " > ".join(reversed(segments))
            except Exception:
                css_path = None
            outline.append({
                "level": level,
                "text": text,
                "word_count": word_count,
                "css_path": css_path,
                "subtree_id": None,
            })
            if len(outline) >= max_items:
                return outline
    return outline
530
+
@@ -0,0 +1,30 @@
1
+ """Configuration management for browser automation."""
2
+
3
+ from .environment import (
4
+ get_env_config,
5
+ profile_key,
6
+ is_default_user_data_dir,
7
+ )
8
+
9
+ from .paths import (
10
+ get_lock_dir,
11
+ rendezvous_path,
12
+ start_lock_dir,
13
+ chromedriver_log_path,
14
+ _lock_paths,
15
+ _window_registry_path,
16
+ _same_dir,
17
+ )
18
+
19
+ __all__ = [
20
+ "get_env_config",
21
+ "profile_key",
22
+ "is_default_user_data_dir",
23
+ "get_lock_dir",
24
+ "rendezvous_path",
25
+ "start_lock_dir",
26
+ "chromedriver_log_path",
27
+ "_lock_paths",
28
+ "_window_registry_path",
29
+ "_same_dir",
30
+ ]