iflow-mcp_janspoerer-mcp_browser_use 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/METADATA +26 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/RECORD +50 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/WHEEL +5 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/entry_points.txt +2 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/licenses/LICENSE +201 -0
- iflow_mcp_janspoerer_mcp_browser_use-0.1.0.dist-info/top_level.txt +1 -0
- mcp_browser_use/__init__.py +2 -0
- mcp_browser_use/__main__.py +1347 -0
- mcp_browser_use/actions/__init__.py +1 -0
- mcp_browser_use/actions/elements.py +173 -0
- mcp_browser_use/actions/extraction.py +864 -0
- mcp_browser_use/actions/keyboard.py +43 -0
- mcp_browser_use/actions/navigation.py +73 -0
- mcp_browser_use/actions/screenshots.py +85 -0
- mcp_browser_use/browser/__init__.py +1 -0
- mcp_browser_use/browser/chrome.py +150 -0
- mcp_browser_use/browser/chrome_executable.py +204 -0
- mcp_browser_use/browser/chrome_launcher.py +330 -0
- mcp_browser_use/browser/chrome_process.py +104 -0
- mcp_browser_use/browser/devtools.py +230 -0
- mcp_browser_use/browser/driver.py +322 -0
- mcp_browser_use/browser/process.py +133 -0
- mcp_browser_use/cleaners.py +530 -0
- mcp_browser_use/config/__init__.py +30 -0
- mcp_browser_use/config/environment.py +155 -0
- mcp_browser_use/config/paths.py +97 -0
- mcp_browser_use/constants.py +68 -0
- mcp_browser_use/context.py +150 -0
- mcp_browser_use/context_pack.py +85 -0
- mcp_browser_use/decorators/__init__.py +13 -0
- mcp_browser_use/decorators/ensure.py +84 -0
- mcp_browser_use/decorators/envelope.py +83 -0
- mcp_browser_use/decorators/locking.py +172 -0
- mcp_browser_use/helpers.py +173 -0
- mcp_browser_use/helpers_context.py +261 -0
- mcp_browser_use/locking/__init__.py +1 -0
- mcp_browser_use/locking/action_lock.py +190 -0
- mcp_browser_use/locking/file_mutex.py +139 -0
- mcp_browser_use/locking/window_registry.py +178 -0
- mcp_browser_use/tools/__init__.py +59 -0
- mcp_browser_use/tools/browser_management.py +260 -0
- mcp_browser_use/tools/debugging.py +195 -0
- mcp_browser_use/tools/extraction.py +58 -0
- mcp_browser_use/tools/interaction.py +323 -0
- mcp_browser_use/tools/navigation.py +84 -0
- mcp_browser_use/tools/screenshots.py +116 -0
- mcp_browser_use/utils/__init__.py +1 -0
- mcp_browser_use/utils/diagnostics.py +85 -0
- mcp_browser_use/utils/html_utils.py +118 -0
- mcp_browser_use/utils/retry.py +57 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""HTML processing and cleaning utilities.
|
|
2
|
+
|
|
3
|
+
This module consolidates HTML cleaning functions from both cleaners.py and helpers.py.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Tuple, Dict, Optional, Sequence, Pattern, Union
|
|
8
|
+
from bs4 import BeautifulSoup, Comment
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Re-export from cleaners.py
|
|
12
|
+
from ..cleaners import (
|
|
13
|
+
NOISE_ID_CLASS_PAT,
|
|
14
|
+
HIDDEN_CLASS_PAT,
|
|
15
|
+
approx_token_count,
|
|
16
|
+
CDN_HOST_PATS,
|
|
17
|
+
_build_cdn_pats,
|
|
18
|
+
_is_cdn_url,
|
|
19
|
+
_filter_srcset,
|
|
20
|
+
basic_prune,
|
|
21
|
+
extract_outline,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def remove_unwanted_tags(html_content: str, aggressive: bool = False) -> str:
    """
    Strip noise tags and attributes from an HTML document.

    Always removes: script/style/meta/link/noscript tags, hidden inputs,
    header/footer/nav elements, elements matching common navigation/menu
    class patterns, all attributes outside a small whitelist, and empty
    leaf tags.

    Args:
        html_content: Raw HTML string
        aggressive: If True, additionally removes svg, iframe, canvas and
            form tags, plus HTML comments

    Returns:
        Cleaned HTML string
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Tags that never carry useful content for extraction.
    removals = ['script', 'style', 'meta', 'link', 'noscript']
    if aggressive:
        removals.extend(['svg', 'iframe', 'canvas', 'form'])
    for tag in soup.find_all(removals):
        tag.extract()

    # HTML comments are only dropped in aggressive mode.
    if aggressive:
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

    # Hidden inputs carry no visible content (CSRF tokens, form state, ...).
    for hidden_input in soup.find_all('input', {'type': 'hidden'}):
        hidden_input.extract()

    # Remove headers, footers, and navigation (huge space savers for e-commerce sites)
    for tag in soup.find_all(['header', 'footer', 'nav']):
        tag.extract()

    # Remove common navigation/menu class patterns (but be more selective)
    noise_markers = ('-header', '-footer', '-navigation', 'nav-main',
                     '-menu', '-flyout', '-dropdown', 'breadcrumb')
    for tag in soup.find_all(class_=lambda c: c and any(m in str(c).lower() for m in noise_markers)):
        tag.extract()

    # Keep only attributes useful for product data. data-* attributes are
    # not whitelisted, so JS-only data-* hooks are dropped here too — the
    # original second data-* pass was redundant and has been removed.
    critical_attrs = {'href', 'src', 'alt', 'title', 'class', 'id', 'type', 'name', 'value'}
    for tag in soup.find_all(True):
        for attr in [a for a in tag.attrs if a not in critical_attrs]:
            del tag[attr]

    # Remove empty leaf tags after cleaning, but preserve structural tags
    # (html/head/body) and tags wrapping self-contained content carriers.
    for tag in soup.find_all():
        if (tag.name not in ['html', 'head', 'body']
                and not tag.get_text(strip=True)
                and not tag.find_all(['img', 'input', 'br', 'hr', 'a'])):
            tag.extract()

    return str(soup)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def get_cleaned_html(driver, aggressive: bool = False) -> str:
    """
    Fetch the current page's HTML from the driver and return it cleaned.

    Args:
        driver: Selenium WebDriver instance
        aggressive: If True, applies aggressive HTML cleaning

    Returns:
        Cleaned HTML string
    """
    raw_html = driver.page_source
    cleaned = remove_unwanted_tags(raw_html, aggressive=aggressive)
    return cleaned
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# Explicit public API for ``from ... import *``.
__all__ = [
    # Re-exported from cleaners.py
    'NOISE_ID_CLASS_PAT', 'HIDDEN_CLASS_PAT', 'approx_token_count',
    'CDN_HOST_PATS', '_build_cdn_pats', '_is_cdn_url', '_filter_srcset',
    'basic_prune', 'extract_outline',
    # Defined in this module (originally in helpers.py)
    'remove_unwanted_tags', 'get_cleaned_html',
]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Retry logic and error handling utilities."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
import random
|
|
5
|
+
import json
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
from selenium.common.exceptions import (
|
|
8
|
+
NoSuchWindowException,
|
|
9
|
+
StaleElementReferenceException,
|
|
10
|
+
WebDriverException,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def retry_op(fn: Callable, retries: int = 2, base_delay: float = 0.15):
    """
    Retry a function call that may fail due to transient Selenium exceptions.

    ``WebDriverException`` is the base class of every Selenium exception,
    including ``NoSuchWindowException`` and ``StaleElementReferenceException``
    which the original listed individually — catching the base alone is
    behaviorally identical and clearer.

    Args:
        fn: Zero-argument callable to invoke
        retries: Number of retry attempts after the first call (default: 2)
        base_delay: Base delay between retries in seconds (default: 0.15);
            the actual sleep is jittered to base_delay * (1.0 .. 2.0)

    Returns:
        The result of the function call

    Raises:
        WebDriverException: the last exception, if all attempts fail.
            Non-Selenium exceptions propagate immediately without retry.
    """
    for attempt in range(retries + 1):
        try:
            return fn()
        except WebDriverException:
            # Give up only once all retries are exhausted.
            if attempt == retries:
                raise
            # Randomized jitter avoids lock-step retries across clients.
            time.sleep(base_delay * (1.0 + random.random()))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _read_json(path: str) -> Optional[dict]:
|
|
39
|
+
"""
|
|
40
|
+
Read a JSON file and return its contents as a dict.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
path: Path to the JSON file
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
Dictionary from JSON file, or None if file doesn't exist or is invalid
|
|
47
|
+
"""
|
|
48
|
+
try:
|
|
49
|
+
with open(path, "r") as f:
|
|
50
|
+
return json.load(f)
|
|
51
|
+
except Exception:
|
|
52
|
+
return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _now() -> float:
|
|
56
|
+
"""Get current time as a float timestamp."""
|
|
57
|
+
return time.time()
|