stealthfetch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ """StealthFetch — URL in, LLM-ready markdown out."""
2
+
3
+ from importlib.metadata import version
4
+
5
+ from stealthfetch._core import (
6
+ FetchResult,
7
+ afetch_markdown,
8
+ afetch_result,
9
+ fetch_markdown,
10
+ fetch_result,
11
+ )
12
+ from stealthfetch._errors import (
13
+ BrowserNotAvailable,
14
+ ExtractionError,
15
+ FetchError,
16
+ StealthFetchError,
17
+ )
18
+
19
# Imported next to its only use so this block stays self-contained.
from importlib.metadata import PackageNotFoundError

try:
    __version__ = version("stealthfetch")
except PackageNotFoundError:
    # No installed distribution metadata (e.g. a source checkout on sys.path,
    # a vendored copy, or a frozen application). Fall back to a placeholder
    # instead of making the whole package unimportable.
    __version__ = "0.0.0+unknown"

# Public API re-exported by the package.
__all__ = [
    "BrowserNotAvailable",
    "ExtractionError",
    "FetchError",
    "FetchResult",
    "StealthFetchError",
    "__version__",
    "afetch_markdown",
    "afetch_result",
    "fetch_markdown",
    "fetch_result",
]
@@ -0,0 +1,75 @@
1
+ """Browser backend dispatcher."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from stealthfetch._compat import get_default_backend, require_browser
6
+
7
+
8
def fetch_browser(
    url: str,
    *,
    backend: str = "auto",
    timeout: int = 30,
    proxy: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
) -> str:
    """Render a URL in a stealth browser and return its HTML (sync).

    The backend module is imported only once it has been selected, so the
    other backend does not need to be installed.

    Args:
        url: The URL to fetch.
        backend: "auto", "camoufox", or "patchright".
        timeout: Timeout in seconds.
        proxy: Proxy config dict with "server", optional "username"/"password".
        headers: Additional HTTP headers to send with the request.

    Returns:
        Rendered HTML string.
    """
    if _resolve_backend(backend) == "camoufox":
        from stealthfetch._browsers._camoufox import fetch as camoufox_fetch

        return camoufox_fetch(url, timeout=timeout, proxy=proxy, headers=headers)

    # Every resolved name other than "camoufox" is served by Patchright.
    from stealthfetch._browsers._patchright import fetch as patchright_fetch

    return patchright_fetch(url, timeout=timeout, proxy=proxy, headers=headers)
37
+
38
+
39
async def afetch_browser(
    url: str,
    *,
    backend: str = "auto",
    timeout: int = 30,
    proxy: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
) -> str:
    """Render a URL in a stealth browser and return its HTML (async).

    The backend module is imported only once it has been selected, so the
    other backend does not need to be installed.

    Args:
        url: The URL to fetch.
        backend: "auto", "camoufox", or "patchright".
        timeout: Timeout in seconds.
        proxy: Proxy config dict with "server", optional "username"/"password".
        headers: Additional HTTP headers to send with the request.

    Returns:
        Rendered HTML string.
    """
    if _resolve_backend(backend) == "camoufox":
        from stealthfetch._browsers._camoufox import afetch as camoufox_afetch

        return await camoufox_afetch(url, timeout=timeout, proxy=proxy, headers=headers)

    # Every resolved name other than "camoufox" is served by Patchright.
    from stealthfetch._browsers._patchright import afetch as patchright_afetch

    return await patchright_afetch(url, timeout=timeout, proxy=proxy, headers=headers)
68
+
69
+
70
+ def _resolve_backend(backend: str) -> str:
71
+ """Resolve 'auto' to a concrete backend name and validate availability."""
72
+ if backend == "auto":
73
+ return get_default_backend()
74
+ require_browser(backend)
75
+ return backend
@@ -0,0 +1,73 @@
1
+ """Camoufox browser backend — sync and async."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from stealthfetch._browsers._constants import BODY_READY_JS, BODY_READY_TIMEOUT, build_proxy
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def fetch(
    url: str,
    *,
    timeout: int = 30,
    proxy: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
) -> str:
    """Fetch a URL with Camoufox stealth browser (sync).

    Args:
        url: The URL to navigate to.
        timeout: Timeout in seconds; applied (in ms) as the page's default timeout.
        proxy: Proxy dict converted with ``build_proxy``; a truthy value also
            turns on Camoufox's ``geoip`` option.
        headers: Extra HTTP headers set on the page before navigation.

    Returns:
        The page content as a string, read after ``domcontentloaded`` and a
        best-effort wait for the body-readiness check.
    """
    from camoufox.sync_api import Camoufox

    logger.debug("Camoufox sync fetch: %s", url)
    camoufox_proxy = build_proxy(proxy)

    timeout_ms = timeout * 1000
    # Never wait longer for readiness than the caller's own timeout. A
    # non-positive timeout keeps the fixed cap, because 0 means "no timeout"
    # in the Playwright API and would otherwise wait indefinitely.
    ready_timeout = min(BODY_READY_TIMEOUT, timeout_ms) if timeout_ms > 0 else BODY_READY_TIMEOUT

    with Camoufox(  # type: ignore[no-untyped-call]
        headless=True,
        proxy=camoufox_proxy,
        geoip=bool(proxy),
        block_images=True,
        block_webrtc=True,
    ) as browser:
        page = browser.new_page()
        page.set_default_timeout(timeout_ms)
        if headers:
            page.set_extra_http_headers(headers)
        page.goto(url, wait_until="domcontentloaded")
        try:
            page.wait_for_function(BODY_READY_JS, timeout=ready_timeout)
        except Exception:
            # Best effort by design: return whatever has rendered so far, but
            # keep the real cause in the log since this catches more than timeouts.
            logger.debug(
                "Body readiness check timed out, continuing with current content",
                exc_info=True,
            )
        return str(page.content())
42
+
43
+
44
async def afetch(
    url: str,
    *,
    timeout: int = 30,
    proxy: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
) -> str:
    """Fetch a URL with Camoufox stealth browser (async).

    Args:
        url: The URL to navigate to.
        timeout: Timeout in seconds; applied (in ms) as the page's default timeout.
        proxy: Proxy dict converted with ``build_proxy``; a truthy value also
            turns on Camoufox's ``geoip`` option.
        headers: Extra HTTP headers set on the page before navigation.

    Returns:
        The page content as a string, read after ``domcontentloaded`` and a
        best-effort wait for the body-readiness check.
    """
    from camoufox.async_api import AsyncCamoufox

    logger.debug("Camoufox async fetch: %s", url)
    camoufox_proxy = build_proxy(proxy)

    timeout_ms = timeout * 1000
    # Never wait longer for readiness than the caller's own timeout. A
    # non-positive timeout keeps the fixed cap, because 0 means "no timeout"
    # in the Playwright API and would otherwise wait indefinitely.
    ready_timeout = min(BODY_READY_TIMEOUT, timeout_ms) if timeout_ms > 0 else BODY_READY_TIMEOUT

    async with AsyncCamoufox(  # type: ignore[no-untyped-call]
        headless=True,
        proxy=camoufox_proxy,
        geoip=bool(proxy),
        block_images=True,
        block_webrtc=True,
    ) as browser:
        page = await browser.new_page()
        page.set_default_timeout(timeout_ms)
        if headers:
            await page.set_extra_http_headers(headers)
        await page.goto(url, wait_until="domcontentloaded")
        try:
            await page.wait_for_function(BODY_READY_JS, timeout=ready_timeout)
        except Exception:
            # Best effort by design: return whatever has rendered so far, but
            # keep the real cause in the log since this catches more than timeouts.
            logger.debug(
                "Body readiness check timed out, continuing with current content",
                exc_info=True,
            )
        return str(await page.content())
@@ -0,0 +1,16 @@
1
+ """Shared constants and utilities for browser backends."""
2
+
3
+ BODY_READY_JS = "document.body && document.body.innerText.trim().length > 100"
4
+ BODY_READY_TIMEOUT = 10_000 # ms
5
+
6
+
7
def build_proxy(proxy: dict[str, str] | None) -> dict[str, str] | None:
    """Convert a stealthfetch proxy dict to a Playwright-compatible proxy dict.

    Args:
        proxy: Mapping with a required "server" key and optional
            "username" / "password" keys; any other keys are ignored.

    Returns:
        A new dict holding only the recognised keys, or ``None`` when
        ``proxy`` is ``None`` or empty.

    Raises:
        KeyError: If a non-empty ``proxy`` has no "server" entry.
    """
    if not proxy:
        return None
    converted: dict[str, str] = {"server": proxy["server"]}
    # Credentials are optional; copy only the ones the caller supplied.
    converted.update({key: proxy[key] for key in ("username", "password") if key in proxy})
    return converted
@@ -0,0 +1,67 @@
1
+ """Patchright browser backend — sync and async."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from stealthfetch._browsers._constants import BODY_READY_JS, BODY_READY_TIMEOUT, build_proxy
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def fetch(
    url: str,
    *,
    timeout: int = 30,
    proxy: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
) -> str:
    """Fetch a URL with Patchright stealth browser (sync).

    Args:
        url: The URL to navigate to.
        timeout: Timeout in seconds; applied (in ms) as the page's default timeout.
        proxy: Proxy dict converted with ``build_proxy`` and passed to the
            Chromium launch call.
        headers: Extra HTTP headers set on the page before navigation.

    Returns:
        The page content as a string, read after ``domcontentloaded`` and a
        best-effort wait for the body-readiness check.
    """
    from patchright.sync_api import sync_playwright

    logger.debug("Patchright sync fetch: %s", url)

    timeout_ms = timeout * 1000
    # Never wait longer for readiness than the caller's own timeout. A
    # non-positive timeout keeps the fixed cap, because 0 means "no timeout"
    # in the Playwright API and would otherwise wait indefinitely.
    ready_timeout = min(BODY_READY_TIMEOUT, timeout_ms) if timeout_ms > 0 else BODY_READY_TIMEOUT

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, proxy=build_proxy(proxy))  # type: ignore[arg-type]
        try:
            page = browser.new_page()
            page.set_default_timeout(timeout_ms)
            if headers:
                page.set_extra_http_headers(headers)
            page.goto(url, wait_until="domcontentloaded")
            try:
                page.wait_for_function(BODY_READY_JS, timeout=ready_timeout)
            except Exception:
                # Best effort by design: return whatever has rendered so far, but
                # keep the real cause in the log since this catches more than timeouts.
                logger.debug(
                    "Body readiness check timed out, continuing with current content",
                    exc_info=True,
                )
            return str(page.content())
        finally:
            # Close the browser even when navigation or content retrieval fails.
            browser.close()
39
+
40
+
41
async def afetch(
    url: str,
    *,
    timeout: int = 30,
    proxy: dict[str, str] | None = None,
    headers: dict[str, str] | None = None,
) -> str:
    """Fetch a URL with Patchright stealth browser (async).

    Args:
        url: The URL to navigate to.
        timeout: Timeout in seconds; applied (in ms) as the page's default timeout.
        proxy: Proxy dict converted with ``build_proxy`` and passed to the
            Chromium launch call.
        headers: Extra HTTP headers set on the page before navigation.

    Returns:
        The page content as a string, read after ``domcontentloaded`` and a
        best-effort wait for the body-readiness check.
    """
    from patchright.async_api import async_playwright

    logger.debug("Patchright async fetch: %s", url)

    timeout_ms = timeout * 1000
    # Never wait longer for readiness than the caller's own timeout. A
    # non-positive timeout keeps the fixed cap, because 0 means "no timeout"
    # in the Playwright API and would otherwise wait indefinitely.
    ready_timeout = min(BODY_READY_TIMEOUT, timeout_ms) if timeout_ms > 0 else BODY_READY_TIMEOUT

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, proxy=build_proxy(proxy))  # type: ignore[arg-type]
        try:
            page = await browser.new_page()
            page.set_default_timeout(timeout_ms)
            if headers:
                await page.set_extra_http_headers(headers)
            await page.goto(url, wait_until="domcontentloaded")
            try:
                await page.wait_for_function(BODY_READY_JS, timeout=ready_timeout)
            except Exception:
                # Best effort by design: return whatever has rendered so far, but
                # keep the real cause in the log since this catches more than timeouts.
                logger.debug(
                    "Body readiness check timed out, continuing with current content",
                    exc_info=True,
                )
            return str(await page.content())
        finally:
            # Close the browser even when navigation or content retrieval fails.
            await browser.close()
@@ -0,0 +1,50 @@
1
+ """Lazy imports and feature detection for optional browser backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from stealthfetch._errors import BrowserNotAvailable
6
+
7
+ # Not using @functools.cache — these check on every call so that
8
+ # installing a browser backend mid-process (e.g., in a long-running
9
+ # MCP server) is detected without restart.
10
+
11
+
12
def has_camoufox() -> bool:
    """Report whether ``camoufox.sync_api`` can be imported right now.

    The import is attempted on every call; the result is not cached.

    Returns:
        ``True`` if the import succeeds, ``False`` if it raises ``ImportError``.
    """
    try:
        import camoufox.sync_api  # noqa: F401
    except ImportError:
        return False
    return True
20
+
21
+
22
def has_patchright() -> bool:
    """Report whether ``patchright.sync_api`` can be imported right now.

    The import is attempted on every call; the result is not cached.

    Returns:
        ``True`` if the import succeeds, ``False`` if it raises ``ImportError``.
    """
    try:
        import patchright.sync_api  # noqa: F401
    except ImportError:
        return False
    return True
30
+
31
+
32
def require_browser(backend: str) -> None:
    """Raise BrowserNotAvailable if the requested backend is missing.

    Only "camoufox" and "patchright" are checked; any other name returns
    without raising.

    Args:
        backend: Name of the backend to check.

    Raises:
        BrowserNotAvailable: If the named backend cannot be imported.
    """
    if backend == "camoufox":
        installed = has_camoufox()
    elif backend == "patchright":
        installed = has_patchright()
    else:
        # Unrecognised names are not validated here.
        return
    if not installed:
        raise BrowserNotAvailable(backend)
38
+
39
+
40
def get_default_backend() -> str:
    """Return the best available browser backend name.

    Camoufox is preferred; Patchright is probed only when Camoufox is
    not importable.

    Returns:
        "camoufox" or "patchright".

    Raises:
        BrowserNotAvailable: If no browser backend is installed.
    """
    # Ordered by preference; the first importable backend wins.
    candidates = (("camoufox", has_camoufox), ("patchright", has_patchright))
    for name, is_installed in candidates:
        if is_installed():
            return name
    raise BrowserNotAvailable("browser")