switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
1
+ """Opt-in Playwright trace capture for the browser tiers (req 16).
2
+
3
+ Off by default. Set ``SCRAPER_TRACE_SESSION=1`` to record a Playwright trace
4
+ (screenshots + DOM snapshots + network) for every browser-tier attempt; each is
5
+ written as a self-contained zip under ``state/traces/`` and is openable with
6
+ ``playwright show-trace <zip>``. The HTTP server exposes list / fetch / delete
7
+ endpoints so traces can be pulled and cleaned up on demand.
8
+
9
+ Capture is wrapped so a tracing failure never breaks a scrape — it just means no
10
+ trace for that attempt. Traces are heavyweight (MBs each); keep this off in
11
+ steady state and flip it on to debug a specific host.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import os
17
+ import re
18
+ import time
19
+
20
+ from .policy.gates import host_of
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ _DEFAULT_STATE_DIR = os.path.join(
25
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "state")
26
+ _STATE_DIR = os.getenv("SCRAPER_STATE_DIR", _DEFAULT_STATE_DIR)
27
+ TRACE_DIR = os.path.join(_STATE_DIR, "traces")
28
+
29
+ # Trace ids are the zip filename stem; constrain to a safe charset so a request
30
+ # id can never escape TRACE_DIR.
31
+ _ID_RE = re.compile(r"^[A-Za-z0-9._-]+$")
32
+
33
+
34
def enabled() -> bool:
    """Report whether opt-in trace capture is switched on.

    Reads ``SCRAPER_TRACE_SESSION`` on every call. ``1`` and ``true`` turn
    capture on; the comparison ignores case and surrounding whitespace, so
    ``TRUE`` or ``" 1 "`` behave like the documented ``1``. Anything else
    (including unset) leaves capture off.
    """
    flag = os.getenv("SCRAPER_TRACE_SESSION", "")
    return flag.strip().lower() in ("1", "true")
36
+
37
+
38
def start(context, url: str) -> bool:
    """Turn on Playwright tracing for ``context``.

    Returns True only when tracing actually started. Returns False when
    capture is disabled, or when ``context.tracing.start`` raises — that
    failure is logged as a warning and swallowed so the scrape carries on.
    ``url`` is not used by this function.
    """
    if enabled():
        try:
            context.tracing.start(screenshots=True, snapshots=True, sources=True)
        except Exception as e:
            logger.warning(f"session_trace: start failed: {e}")
        else:
            return True
    return False
49
+
50
+
51
def stop(context, url: str) -> str | None:
    """Stop tracing on ``context`` and write the trace zip under ``TRACE_DIR``.

    Returns the zip's path, or None when capture is disabled or anything goes
    wrong. Every failure — including not being able to create ``TRACE_DIR`` or
    derive the host — is logged and swallowed: the browser tiers call this
    from ``finally`` blocks, so raising here would mask the scrape's own
    outcome and skip their remaining cleanup.
    """
    if not enabled():
        return None
    try:
        os.makedirs(TRACE_DIR, exist_ok=True)
        # The filename stem doubles as the trace id, and path_for() only
        # accepts ids made of [A-Za-z0-9._-]. Map any other character (e.g.
        # ':' in a host:port or IPv6 literal) to '_' so a written trace can
        # always be fetched and deleted by id.
        host = re.sub(r"[^A-Za-z0-9._-]", "_", host_of(url) or "") or "unknown"
        name = f"{host}-{int(time.time() * 1000)}.zip"
        path = os.path.join(TRACE_DIR, name)
        context.tracing.stop(path=path)
        logger.info(f"session_trace: wrote {path}")
        return path
    except Exception as e:
        logger.warning(f"session_trace: stop failed: {e}")
        return None
65
+
66
+
67
+ # ── server-side management ───────────────────────────────────────────────────
68
+
69
def list_traces() -> list[dict]:
    """Return every captured trace, newest first.

    Each entry is ``{"id": <zip filename stem>, "bytes": <file size>,
    "modified": <mtime, epoch seconds>}``. Returns an empty list when
    ``TRACE_DIR`` does not exist or is not a directory. An entry that can no
    longer be stat'ed (e.g. removed by a concurrent delete after it was
    listed) is skipped instead of failing the whole listing.
    """
    try:
        names = os.listdir(TRACE_DIR)
    except (FileNotFoundError, NotADirectoryError):
        return []
    traces = []
    for fn in names:
        if not fn.endswith(".zip"):
            continue
        try:
            st = os.stat(os.path.join(TRACE_DIR, fn))
        except FileNotFoundError:
            continue  # vanished between listdir() and stat(); not an error
        traces.append({"id": fn[:-4], "bytes": st.st_size, "modified": st.st_mtime})
    traces.sort(key=lambda t: t["modified"], reverse=True)
    return traces
81
+
82
+
83
def path_for(trace_id: str) -> str | None:
    """Resolve a trace id to its zip path under ``TRACE_DIR``.

    Returns None when the id is empty, contains anything outside the safe
    charset, or no such zip file exists.
    """
    # fullmatch, not match: with match() the pattern's '$' also matches just
    # before a trailing newline, which would let such an id through the
    # charset check.
    if not trace_id or not _ID_RE.fullmatch(trace_id):
        return None
    p = os.path.join(TRACE_DIR, f"{trace_id}.zip")
    # isfile: a directory that happens to be named "<id>.zip" is not a trace.
    return p if os.path.isfile(p) else None
89
+
90
+
91
def delete(trace_id: str) -> bool:
    """Delete the trace with the given id.

    Returns True when a zip was removed. Returns False when the id is invalid
    or there is no such trace — including when the file disappears between
    the lookup and the removal (e.g. two delete requests for the same trace).
    """
    p = path_for(trace_id)
    if not p:
        return False
    try:
        os.remove(p)
    except FileNotFoundError:
        return False
    return True
@@ -0,0 +1,24 @@
1
+ """The cost-ordered cascade. Each tier exposes:
2
+
3
+ NAME : str
4
+ PAID : bool # gated/audited if True
5
+ fetch(url) -> str | None # markdown on success; None if not
6
+ # applicable; raises on failure.
7
+
8
+ Order matters — cheapest/cleanest first, paid last.
9
+ """
10
+ from . import (tier0_apis, tier1_http, tier2_cloudscraper,
11
+ tier3_browser, tier3b_camoufox, tier_residential, tier4_firecrawl)
12
+
13
# The cascade, in the order tiers are tried (cheapest first, paid last).
TIERS = [
    tier0_apis,
    tier1_http,
    tier2_cloudscraper,
    tier3_browser,
    tier3b_camoufox,   # Firefox stealth (on by default; opt out with SCRAPER_DISABLE_CAMOUFOX=1; orthogonal to T3)
    tier_residential,  # residential-IP CDP browser (off unless BU_CDP_URL set)
    tier4_firecrawl,
]

# tier name -> index, for botwall winning-tier routing.
INDEX = {t.NAME: i for i, t in enumerate(TIERS)}
@@ -0,0 +1,50 @@
1
+ """Shared helpers for the stealth-browser tiers (patchright, camoufox).
2
+
3
+ Not a tier itself (leading underscore, not in the TIERS registry) — just the
4
+ challenge-resolution mechanic both browsers need.
5
+
6
+ Akamai Bot Manager / Imperva / Kasada serve a JS-*sensor* interstitial on the
7
+ first load: it sets cookies (e.g. ak_bmsc, _abck) as the sensor script runs, then
8
+ the *real* content is only returned on a re-request. A single settle + reload
9
+ after the sensor runs clears them — provided the egress IP is acceptable. (A hard
10
+ IP block never validates `_abck` and the page stays an interstitial, so the tier
11
+ still falls through; this just stops us snapshotting the interstitial too early on
12
+ IPs that would have passed.)
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from ..normalize import html_to_markdown
17
+ from ..policy.gates import _looks_like_botwall
18
+
19
+ _SETTLE_MS = 5000 # let the sensor JS run and set its cookies
20
+ _POST_RELOAD_MS = 1500 # let the real content paint after the reload
21
+
22
+
23
def looks_blocked(html: str, url: str) -> bool:
    """Report whether the rendered DOM is a bot-wall / sensor interstitial.

    The HTML is converted to markdown first (``url`` is the base URL for that
    conversion) and the markdown is then run through the botwall check.
    """
    markdown = html_to_markdown(html, base_url=url)
    return _looks_like_botwall(markdown)
26
+
27
+
28
def response_bytes(responses) -> int:
    """Sum the Content-Length headers of ``responses`` — the residential-cost basis.

    Only the header is read, which is non-blocking. resp.body() is
    deliberately NOT called: on a stalled response it blocks with no timeout
    and can freeze the whole render, which the cascade deadline cannot
    interrupt. A response with no Content-Length, an unparsable one, or one
    whose headers cannot be read contributes 0, so the total slightly
    undercounts.
    """
    def declared_length(resp) -> int:
        try:
            raw = resp.headers.get("content-length")
            return int(raw) if raw else 0
        except Exception:
            return 0

    return sum(declared_length(resp) for resp in responses)
43
+
44
+
45
def reload_through_challenge(page, url: str, timeout_ms: int) -> str:
    """Wait out the bot-manager sensor, load ``url`` once more, return the new HTML.

    Pauses ``_SETTLE_MS``, navigates to ``url`` again (waiting for network
    idle, bounded by ``timeout_ms``), pauses ``_POST_RELOAD_MS``, then returns
    the page's current content.
    """
    page.wait_for_timeout(_SETTLE_MS)  # let the sensor JS run and set its cookies
    page.goto(url, timeout=timeout_ms, wait_until="networkidle")
    page.wait_for_timeout(_POST_RELOAD_MS)  # let the real content paint
    fresh_html = page.content()
    return fresh_html
@@ -0,0 +1,77 @@
1
+ """Tier 0 — direct APIs / open mirrors.
2
+
3
+ Cheapest, cleanest, most reliable. Pattern-routed: returns None when no mirror
4
+ matches (caller falls through to Tier 1). Bypasses the botwall skip-list because
5
+ these are open, stable endpoints — not the scraped page.
6
+
7
+ Extend here with career-ops-style structured providers (greenhouse/lever/ashby).
8
+ Web *search* (query → URLs) is a different shape and lives in switchback/search.py
9
+ (local SearXNG), not in this fetch cascade.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import re
14
+ from urllib.parse import unquote
15
+ from xml.etree import ElementTree as ET
16
+
17
+ from ..normalize import html_to_markdown, UA
18
+ from ..policy.gates import check
19
+
20
+ NAME = "tier0_apis"
21
+ PAID = False
22
+
23
+ ARXIV_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:v\d+)?(?:\.pdf)?", re.I)
24
+ WIKI_RE = re.compile(r"en\.wikipedia\.org/wiki/([^?#]+)", re.I)
25
+ PMC_RE = re.compile(r"pmc\.ncbi\.nlm\.nih\.gov/articles/(PMC\d+)", re.I)
26
+
27
+
28
def fetch(url: str) -> str | None:
    """Route ``url`` to the first matching open mirror.

    Patterns are tried in order: arXiv, English Wikipedia, PMC. Returns the
    matching handler's result, or None when no pattern matches so the caller
    falls through to the next tier.
    """
    arxiv_hit = ARXIV_RE.search(url)
    if arxiv_hit is not None:
        return _arxiv(arxiv_hit.group(1), url)

    wiki_hit = WIKI_RE.search(url)
    if wiki_hit is not None:
        return _wikipedia(wiki_hit.group(1), url)

    if PMC_RE.search(url) is not None:
        return _europepmc(url)

    return None  # no mirror — fall through
38
+
39
+
40
def _arxiv(arxiv_id: str, url: str) -> str:
    """Fetch one paper's metadata from the arXiv export API and render markdown.

    The markdown holds the title, authors, arXiv id and abstract, and is
    passed through ``check`` before being returned. Raises on an HTTP error
    status, and ``RuntimeError`` when the API response contains no entry.
    """
    # arxiv wants plain requests + an identifying UA (their published guidance);
    # impersonating Chrome triggers aggressive 429s from their Akamai front-end.
    import requests
    r = requests.get(f"https://export.arxiv.org/api/query?id_list={arxiv_id}",
                     timeout=15,
                     headers={"User-Agent": "switchback/1.0 (mailto:akash@theaklabs.com)"})
    r.raise_for_status()
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    # Parse the raw bytes so the XML declaration decides the encoding, rather
    # than whatever charset was guessed when decoding r.text.
    entry = ET.fromstring(r.content).find("atom:entry", ns)
    if entry is None:
        raise RuntimeError("arxiv: no entry in API response")

    def squash(text) -> str:
        # Long values in the feed are hard-wrapped; a newline inside the title
        # would end the markdown heading early, so fold whitespace runs into
        # single spaces.
        return " ".join((text or "").split())

    title = squash(entry.findtext("atom:title", "", ns))
    summary = (entry.findtext("atom:summary", "", ns) or "").strip()
    authors = [squash(a.findtext("atom:name", "", ns))
               for a in entry.findall("atom:author", ns)]
    md = (f"# {title}\n\n**Authors:** {', '.join(a for a in authors if a)}\n\n"
          f"**arXiv:** {arxiv_id}\n\n## Abstract\n\n{summary}")
    return check(url, md)
58
+
59
+
60
def _wikipedia(title: str, url: str) -> str:
    """Fetch an English Wikipedia article's HTML via the REST API -> markdown.

    ``title`` is the raw path segment captured from the article URL and may
    already be percent-encoded. The markdown is passed through ``check``.
    Raises on an HTTP error status.
    """
    from urllib.parse import quote
    from curl_cffi import requests as cffi
    # Normalise to exactly one level of percent-encoding. safe="" matters: a
    # literal "/" in the title (e.g. "AC/DC") would otherwise become an extra
    # path segment, and a decoded "?" or "#" would cut the path short.
    encoded = quote(unquote(title), safe="")
    r = cffi.get(f"https://en.wikipedia.org/api/rest_v1/page/html/{encoded}",
                 timeout=15, impersonate="chrome")
    r.raise_for_status()
    return check(url, html_to_markdown(r.text, base_url=url))
66
+
67
+
68
def _europepmc(url: str) -> str:
    """Fetch a PMC article's full-text XML from EuropePMC and render markdown.

    The PMC id is taken from ``url`` with ``PMC_RE``. Raises on an HTTP error
    status, and ``RuntimeError`` when the body is shorter than 1000
    characters. The markdown is passed through ``check``.
    """
    # PMC full text via EuropePMC mirror (avoids reCAPTCHA on ncbi).
    import requests
    pmcid = PMC_RE.search(url).group(1)
    endpoint = "https://www.ebi.ac.uk/europepmc/webservices/rest/" + pmcid + "/fullTextXML"
    resp = requests.get(endpoint, headers={"User-Agent": UA}, timeout=20)
    resp.raise_for_status()
    body = resp.text
    if len(body) < 1000:
        raise RuntimeError(f"europepmc empty: {len(body)}")
    return check(url, html_to_markdown(body, base_url=url))
@@ -0,0 +1,65 @@
1
+ """Tier 1 — plain HTTP with TLS fingerprint impersonation.
2
+
3
+ curl_cffi impersonates a real Chrome TLS handshake, which clears many naive bot
4
+ walls without a browser. Handles PDFs inline. Fast and cheap.
5
+
6
+ The bare "chrome" alias resolves to an old default; we pin recent targets and
7
+ rotate them deterministically per host, so our traffic isn't one shared JA3 yet
8
+ each host stays reproducible (and pairs cleanly with the session cache, which
9
+ records the target that won).
10
+
11
+ No User-Agent override: the impersonate target already sends a UA that matches
12
+ its TLS fingerprint. Overriding it with a stale string is a detection tell (TLS
13
+ says one Chrome version, the header says another).
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ from urllib.parse import urlsplit
19
+
20
+ from .. import session_cache
21
+ from ..egress import requests_proxies, add_wire_bytes
22
+ from ..normalize import html_to_markdown, pdf_bytes_to_text
23
+ from ..policy.gates import BotWall, check, is_cf_challenge
24
+
25
+ NAME = "tier1_http"
26
+ PAID = False
27
+
28
+ # Recent Chrome JA3 targets available in curl_cffi 0.15.x. A small spread of real
29
+ # versions mirrors how live traffic is distributed across Chrome releases.
30
+ _IMPERSONATE_TARGETS = ("chrome131", "chrome136", "chrome142")
31
+
32
+
33
def _impersonate_for(url: str) -> str:
    """Pick the impersonation target for ``url``'s host, deterministically.

    The SHA-1 of the hostname (empty string when the URL has none) is read as
    one big integer and reduced modulo the number of targets, so the same host
    always maps to the same entry of ``_IMPERSONATE_TARGETS``.
    """
    hostname = urlsplit(url).hostname or ""
    digest = hashlib.sha1(hostname.encode()).digest()
    slot = int.from_bytes(digest, "big") % len(_IMPERSONATE_TARGETS)
    return _IMPERSONATE_TARGETS[slot]
37
+
38
+
39
def fetch(url: str) -> str:
    """Fetch ``url`` over plain HTTP with a pinned Chrome TLS fingerprint.

    A PDF (by Content-Type, or a final URL ending in ``.pdf`` before any query
    string) is converted with ``pdf_bytes_to_text``; anything else goes
    through ``html_to_markdown``. The result is passed through ``check``.
    Raises ``BotWall`` when a >= 400 response is a Cloudflare challenge;
    other >= 400 responses go to ``raise_for_status``.
    """
    from curl_cffi import requests as cffi

    # Auth cookies only: the cached cf_clearance is UA-bound to whichever tier
    # solved it, and CF hosts route straight to Tier 2 on repeat, so replaying it
    # against Tier 1's distinct impersonate UA would be a mismatch tell.
    cookie = session_cache.cookie_header(url, include_cache=False)
    if cookie:
        headers = {"Cookie": cookie}
    else:
        headers = None
    resp = cffi.get(url, timeout=15, allow_redirects=True,
                    impersonate=_impersonate_for(url),
                    proxies=requests_proxies(), headers=headers)
    # Count even on a block — failed fetches burn bandwidth too.
    add_wire_bytes(len(resp.content))

    if resp.status_code >= 400:
        # A Cloudflare JS challenge often returns 403/503 with the interstitial in
        # the body. Surface that as a botwall (Tier 2 can solve it) rather than a
        # hard http_block — which the orchestrator uses to skip Tier 2 entirely.
        if is_cf_challenge(resp.headers, resp.text):
            raise BotWall("cloudflare challenge", vendor="cloudflare")
        resp.raise_for_status()

    content_type = resp.headers.get("Content-Type", "").lower()
    if not ("application/pdf" in content_type
            or resp.url.lower().split("?")[0].endswith(".pdf")):
        return check(url, html_to_markdown(resp.text, base_url=resp.url))

    try:
        text = pdf_bytes_to_text(resp.content)
    finally:
        resp.close()
    return check(url, text)
@@ -0,0 +1,135 @@
1
+ """Tier 2 — Cloudflare / anti-bot solver (cloudscraper 3.x "Enhanced Edition").
2
+
3
+ Targets the specific failure the cheaper tiers can't clear: a Cloudflare JS
4
+ challenge / "checking your browser" interstitial. Solves it in-process (no
5
+ browser), then returns the real page. First hit to a CF host sleeps ~5s.
6
+
7
+ cloudscraper 3.x clears v1/v2/v3 (JS-VM) challenges and Turnstile, with stealth
8
+ on by default (randomized headers, browser quirks, human-like pacing) and
9
+ automatic cf_clearance refresh on 403. Pinned to the GitHub Enhanced Edition;
10
+ PyPI is frozen at 1.2.71 (v1/v2 only, no stealth) — see pyproject.toml.
11
+
12
+ On hard CAPTCHA variants with no solver configured this raises and the cascade
13
+ falls through to the stealth browser (Tier 3).
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import os
19
+ import shutil
20
+ import threading
21
+
22
+ from .. import egress, session_cache
23
+ from ..egress import requests_proxies
24
+ from ..normalize import html_to_markdown
25
+ from ..policy.gates import check
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ NAME = "tier2_cloudscraper"
30
+ PAID = False
31
+
32
+ # Wall-clock cap on the whole solve. cloudscraper 3.x *attempts* interactive
33
+ # Turnstile and can loop for minutes on a challenge it can't clear — far past the
34
+ # per-request socket timeout. Capping it here lets the cascade fall through to the
35
+ # stealth browser (which can handle interactive challenges) instead of burning the
36
+ # per-URL deadline. ~25s comfortably covers a real JS/v3 solve (~5-15s).
37
+ _TIMEOUT_S = float(os.getenv("SCRAPER_CLOUDSCRAPER_TIMEOUT_S", "25"))
38
+
39
+ # Stealth pacing. Kept modest: Tier 2 only fires on CF-suspected hosts, and the
40
+ # real latency win comes from skipping the solve entirely on repeat hits (session
41
+ # cache), not from long inter-request sleeps.
42
+ _STEALTH_OPTIONS = {
43
+ "min_delay": 0.5,
44
+ "max_delay": 1.5,
45
+ "human_like_delays": True,
46
+ "randomize_headers": True,
47
+ "browser_quirks": True,
48
+ }
49
+
50
+
51
+ _captcha_warned = False
52
+
53
+
54
def _captcha_opts() -> dict:
    """Build the opt-in third-party captcha solver options (off by default).

    Returns ``{"captcha": {"provider": ..., "api_key": ...}}`` when both
    ``SCRAPER_CAPTCHA_PROVIDER`` and ``SCRAPER_CAPTCHA_API_KEY`` are set, and
    an empty dict otherwise. With both set, cloudscraper solves Turnstile /
    reCAPTCHA / hCaptcha on CF hosts in-process via the provider (2captcha,
    capsolver, capmonster, anticaptcha, deathbycaptcha, 9kw). PAID: the
    provider bills per solve. cloudscraper resets its solve counter on
    success, so per-solve counts aren't observable here — track spend in the
    provider's own dashboard. A warning is logged the first time the solver
    is found active.
    """
    global _captcha_warned
    provider = os.getenv("SCRAPER_CAPTCHA_PROVIDER")
    api_key = os.getenv("SCRAPER_CAPTCHA_API_KEY")
    if provider and api_key:
        if not _captcha_warned:
            logger.warning(f"tier2: captcha solver active (provider={provider}); "
                           "solves are billed by the provider")
            _captcha_warned = True
        return {"captcha": {"provider": provider, "api_key": api_key}}
    return {}
71
+
72
+
73
def _interpreter_opts() -> dict:
    """Choose the JS interpreter for the v3 JS-VM challenge.

    The 3.x default, js2py, is pure Python — slow and prone to stalling on
    heavy challenges — while Node runs them fast and reliably. Returns
    ``{"interpreter": "nodejs"}`` when a ``node`` executable is on PATH, and
    an empty dict (keep the default) otherwise.
    """
    if shutil.which("node"):
        return {"interpreter": "nodejs"}
    return {}
78
+
79
+
80
def _make_scraper():
    """Create a cloudscraper session with the tier's stealth tuning.

    The interpreter and captcha options are merged in on top of the fixed
    browser / stealth settings.
    """
    import cloudscraper
    # enable_stealth / auto_refresh_on_403 are on by default in 3.x; we pass the
    # stealth tuning explicitly. No UA override: cloudscraper derives a UA (and
    # matching cipher suite) from the browser dict; a stale override contradicts it.
    options = {
        "browser": {"browser": "chrome", "platform": "linux", "mobile": False},
        "enable_stealth": True,
        "stealth_options": _STEALTH_OPTIONS,
    }
    options.update(_interpreter_opts())
    options.update(_captcha_opts())
    return cloudscraper.create_scraper(**options)
92
+
93
+
94
def _fetch(url: str, stats: dict | None = None) -> tuple[str, int]:
    """Run one cloudscraper request for ``url``.

    Returns ``(markdown, body_bytes)``. When ``stats`` is given, the body size
    is also stored in ``stats["bytes"]`` as soon as the response arrives —
    before the status and botwall checks — so the caller can still account
    for the bandwidth of an attempt that ends up raising. Raises on an HTTP
    error status and whatever ``check`` raises.
    """
    scraper = _make_scraper()
    # Replay a cached cf_clearance (skips the ~5s solve) plus any auth cookies.
    cookies = session_cache.cookies_for(url, include_cache=True)
    r = scraper.get(url, timeout=20, proxies=requests_proxies(),
                    cookies=cookies or None)
    nbytes = len(r.content)
    if stats is not None:
        stats["bytes"] = nbytes
    r.raise_for_status()
    md = check(url, html_to_markdown(r.text, base_url=r.url))
    # Cleared: cache whatever cf cookies the session now holds for next time.
    session_cache.remember(url, dict(scraper.cookies),
                           ua=scraper.headers.get("User-Agent", ""))
    return md, nbytes


def fetch(url: str) -> str:
    """Solve and fetch ``url`` with cloudscraper under a hard wall-clock cap.

    Returns the page as markdown. Raises ``TimeoutError`` when the solve is
    still running after ``_TIMEOUT_S`` seconds; otherwise re-raises whatever
    the worker raised. Wire bytes are recorded for failed attempts too, as
    long as a response was received (failed fetches burn bandwidth as well).
    """
    # Run the (blocking, occasionally runaway) solve under a hard wall-clock cap.
    # A daemon worker means an abandoned solve can't block process exit; it dies on
    # its own socket timeout shortly after. Thread-locals don't inherit, so the
    # egress scope is re-applied inside the worker.
    scoped = egress.in_egress_scope()
    box: dict = {}

    def work():
        with egress.egress_scope(scoped):
            try:
                # _fetch writes box["bytes"] itself, before anything can raise.
                box["md"], _ = _fetch(url, box)
            except BaseException as e:  # noqa: BLE001 — propagated to caller below
                box["err"] = e

    t = threading.Thread(target=work, name="tier2-cloudscraper", daemon=True)
    t.start()
    t.join(_TIMEOUT_S)
    if t.is_alive():
        raise TimeoutError(
            f"cloudscraper exceeded {_TIMEOUT_S}s (unsolvable challenge); "
            "falling through to the stealth browser")
    # Re-attribute the worker's wire bytes here, in the scope-owning thread.
    egress.add_wire_bytes(box.get("bytes", 0))
    if "err" in box:
        raise box["err"]
    return box["md"]
@@ -0,0 +1,59 @@
1
+ """Tier 3 — stealth headless browser (patchright).
2
+
3
+ Renders JS-heavy SPAs. patchright is a hardened Playwright fork that evades the
4
+ common automation fingerprints. Tries domcontentloaded first, then networkidle
5
+ if the DOM is suspiciously small (lazy/JS content).
6
+
7
+ Future: (a) batch many URLs through one browser (musings does this — big perf
8
+ win); (b) browser-harness mode to drive the user's logged-in Chrome over CDP for
9
+ auth-walled pages (set BU_CDP_URL).
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from . import _browser
14
+ from .. import session_cache, session_trace
15
+ from ..concurrency import browser_slot
16
+ from ..egress import playwright_proxy, add_wire_bytes
17
+ from ..normalize import html_to_markdown
18
+ from ..policy.gates import check
19
+
20
+ NAME = "tier3_browser"
21
+ PAID = False
22
+
23
+
24
def fetch(url: str, timeout_ms: int = 15000) -> str:
    """Render ``url`` in headless patchright Chromium and return it as markdown.

    Loads to ``domcontentloaded``; when the DOM is under 5000 characters it
    additionally waits (best effort, up to 8s) for network idle and re-reads
    it. If the result looks like a bot-manager interstitial, it settles and
    reloads once. The markdown is passed through ``check``.

    Cleanup always runs: wire bytes are recorded even when the render fails
    (failed fetches burn bandwidth too), and the browser is closed even if
    stopping the trace raises.
    """
    from patchright.sync_api import sync_playwright
    with browser_slot(NAME), sync_playwright() as p:
        browser = p.chromium.launch(headless=True, proxy=playwright_proxy())
        ctx = None
        responses: list = []
        try:
            # No user_agent override: patchright ships a real, internally
            # consistent Chromium fingerprint; overriding the UA desyncs it from
            # the engine version / client hints and defeats the stealth fork.
            ctx = browser.new_context()
            session_trace.start(ctx, url)
            auth = session_cache.browser_cookies(url)
            if auth:
                ctx.add_cookies(auth)
            page = ctx.new_page()
            page.on("response", responses.append)
            page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
            html = page.content()
            if len(html) < 5000:
                try:
                    page.wait_for_load_state("networkidle", timeout=8000)
                except Exception:
                    pass  # best effort: keep whatever has rendered so far
                html = page.content()
            # A JS bot-manager (Akamai/Imperva/…) may serve a sensor interstitial
            # first; settle + reload once to get the real page on an acceptable IP.
            if _browser.looks_blocked(html, page.url or url):
                html = _browser.reload_through_challenge(page, url, timeout_ms)
            md = html_to_markdown(html, base_url=page.url or url)
        finally:
            try:
                add_wire_bytes(_browser.response_bytes(responses))
                if ctx is not None:
                    session_trace.stop(ctx, url)
            finally:
                browser.close()
    return check(url, md)
@@ -0,0 +1,89 @@
1
+ """Tier 3b — Camoufox (hardened Firefox), env-gated stealth.
2
+
3
+ An *orthogonal* fingerprint to the Chromium patchright tier: Camoufox patches
4
+ stealth at the C++ level and is best-in-class for **headless** detection evasion,
5
+ so it can clear hosts where the Chromium browser still gets blocked. It's the
6
+ slowest rung we own (~40s on a hard Cloudflare solve), but it only fires after
7
+ the three cheaper tiers AND patchright all miss, so easy traffic never pays for
8
+ it. ON by default — opt out with SCRAPER_DISABLE_CAMOUFOX=1.
9
+
10
+ Needs its Firefox build (`camoufox fetch`); if absent the launch raises and the
11
+ cascade falls through to Firecrawl. Tried after the Chromium browser misses,
12
+ before the paid Firecrawl tier.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ import os
18
+
19
+ from . import _browser
20
+ from .. import session_cache, session_trace
21
+ from ..concurrency import browser_slot
22
+ from ..egress import playwright_proxy, add_wire_bytes
23
+ from ..normalize import html_to_markdown
24
+ from ..policy.gates import check
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ NAME = "tier3b_camoufox"
29
+ PAID = False
30
+
31
+ _TIMEOUT_MS = int(os.getenv("SCRAPER_CAMOUFOX_TIMEOUT_MS", "45000"))
32
+
33
+
34
def disabled() -> bool:
    """On by default; opt out (heavy + slow) with SCRAPER_DISABLE_CAMOUFOX=1.

    Unset, empty, or an explicit "off" spelling (``0``, ``false``, ``no``,
    ``off``; case-insensitive, surrounding whitespace ignored) leaves the tier
    enabled, so ``SCRAPER_DISABLE_CAMOUFOX=0`` does not disable it. Any other
    value disables the tier.
    """
    flag = os.getenv("SCRAPER_DISABLE_CAMOUFOX", "").strip().lower()
    return flag not in ("", "0", "false", "no", "off")
37
+
38
+
39
def _geoip_available() -> bool:
    """Probe whether the ``geoip2`` package (the camoufox[geoip] extra) imports.

    camoufox's geoip matching needs it; requesting geoip without it raises and
    kills the whole tier, so callers probe here and degrade gracefully. Any
    exception raised by the import counts as "not available".
    """
    try:
        import geoip2  # noqa: F401
    except Exception:
        return False
    else:
        return True
48
+
49
+
50
def _launch_opts() -> dict:
    """Assemble the Camoufox launch options (evasion knobs).

    Always headless, with humanize on and a randomized desktop OS so the
    fingerprint blends in. When a proxy is configured it is passed through,
    and geoip is switched on so timezone/locale/geolocation match the proxy's
    IP (a mismatch there is itself a tell) — but only if the geoip extra is
    installed; otherwise a warning is logged and geoip stays off.
    """
    opts: dict = {"headless": True, "humanize": True,
                  "os": ["windows", "macos", "linux"]}
    proxy = playwright_proxy()
    if not proxy:
        return opts

    opts["proxy"] = proxy
    if not _geoip_available():
        logger.warning("camoufox: proxy set but geoip extra missing "
                       "(pip install camoufox[geoip]); locale/timezone won't "
                       "match the proxy IP — a possible detection tell")
        return opts

    opts["geoip"] = True
    return opts
67
+
68
+
69
def fetch(url: str) -> str:
    """Render ``url`` in Camoufox (hardened Firefox) and return it as markdown.

    Loads to network idle (bounded by ``_TIMEOUT_MS``); if the result looks
    like a bot-manager interstitial, it settles and reloads once. The markdown
    is built against the page's final URL — falling back to ``url`` — so
    relative links resolve correctly after a redirect, and is passed through
    ``check``.

    Cleanup always runs: wire bytes are recorded even when the render fails
    (failed fetches burn bandwidth too), and the page is closed even if
    stopping the trace raises.
    """
    from camoufox.sync_api import Camoufox
    with browser_slot(NAME), Camoufox(**_launch_opts()) as browser:
        page = browser.new_page()
        responses: list = []
        page.on("response", responses.append)
        final_url = url
        try:
            session_trace.start(page.context, url)
            auth = session_cache.browser_cookies(url)
            if auth:
                page.context.add_cookies(auth)
            page.goto(url, wait_until="networkidle", timeout=_TIMEOUT_MS)
            final_url = page.url or url
            html = page.content()
            # JS bot-manager sensor interstitial → settle + reload once.
            if _browser.looks_blocked(html, final_url):
                html = _browser.reload_through_challenge(page, url, _TIMEOUT_MS)
                final_url = page.url or url
        finally:
            try:
                add_wire_bytes(_browser.response_bytes(responses))
                session_trace.stop(page.context, url)
            finally:
                page.close()
    return check(url, html_to_markdown(html, base_url=final_url))
@@ -0,0 +1,48 @@
1
+ """Tier 4 — Firecrawl (paid, last resort).
2
+
3
+ Env-gated: set SCRAPER_DISABLE_FIRECRAWL to skip this tier entirely (URL is then
4
+ dropped). Every invocation is audited and feeds the botwall promotion counter, so
5
+ hosts that keep needing it get auto-skipped. Needs FIRECRAWL_API_KEY.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import threading
11
+
12
+ from ..policy.gates import check
13
+
14
+ NAME = "tier4_firecrawl"
15
+ PAID = True
16
+
17
+
18
def disabled() -> bool:
    """Report whether the paid Firecrawl tier is switched off.

    True when ``SCRAPER_DISABLE_FIRECRAWL`` is set to any non-empty value —
    note that this includes ``0`` and ``false``. False when it is unset or
    empty.
    """
    value = os.getenv("SCRAPER_DISABLE_FIRECRAWL")
    return True if value else False
20
+
21
+
22
def _scrape(url: str) -> str:
    """Scrape ``url`` through Firecrawl and return its markdown.

    Reads the API key from ``FIRECRAWL_API_KEY`` (a missing key raises
    ``KeyError``). The result is read via ``model_dump()`` when the returned
    document has one, used directly when it is a dict, and treated as empty
    otherwise. The stripped markdown is passed through ``check``.
    """
    from firecrawl import Firecrawl
    client = Firecrawl(api_key=os.environ["FIRECRAWL_API_KEY"])
    doc = client.scrape(url, formats=["markdown"])
    if hasattr(doc, "model_dump"):
        data = doc.model_dump()
    elif isinstance(doc, dict):
        data = doc
    else:
        data = {}
    markdown = data.get("markdown") or ""
    return check(url, markdown.strip())
28
+
29
+
30
def fetch(url: str) -> str:
    """Scrape ``url`` with Firecrawl on a dedicated worker thread.

    Blocks until the worker finishes, then returns its markdown or re-raises
    whatever it raised.
    """
    # Run in a dedicated thread: the Firecrawl SDK sets an asyncio event loop on
    # the calling thread, which then makes a later sync-Playwright browser tier in
    # the same batch raise "Sync API inside the asyncio loop". A worker thread
    # confines that loop so the browser tiers stay usable across a multi-URL run.
    outcome: dict = {}

    def work():
        try:
            outcome["md"] = _scrape(url)
        except BaseException as e:  # noqa: BLE001 — re-raised to the caller below
            outcome["err"] = e

    worker = threading.Thread(target=work, name="tier4-firecrawl", daemon=True)
    worker.start()
    worker.join()
    failure = outcome.get("err")
    if failure is not None:
        raise failure
    return outcome["md"]