switchback 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- switchback/__init__.py +12 -0
- switchback/__main__.py +4 -0
- switchback/api.py +81 -0
- switchback/concurrency.py +37 -0
- switchback/content_cache.py +94 -0
- switchback/egress.py +108 -0
- switchback/extract.py +56 -0
- switchback/flags.py +96 -0
- switchback/normalize.py +81 -0
- switchback/orchestrator.py +343 -0
- switchback/policy/__init__.py +0 -0
- switchback/policy/botwall.py +393 -0
- switchback/policy/gates.py +173 -0
- switchback/py.typed +0 -0
- switchback/reporting.py +236 -0
- switchback/search.py +39 -0
- switchback/server.py +114 -0
- switchback/session_cache.py +274 -0
- switchback/session_trace.py +96 -0
- switchback/tiers/__init__.py +24 -0
- switchback/tiers/_browser.py +50 -0
- switchback/tiers/tier0_apis.py +77 -0
- switchback/tiers/tier1_http.py +65 -0
- switchback/tiers/tier2_cloudscraper.py +135 -0
- switchback/tiers/tier3_browser.py +59 -0
- switchback/tiers/tier3b_camoufox.py +89 -0
- switchback/tiers/tier4_firecrawl.py +48 -0
- switchback/tiers/tier_residential.py +57 -0
- switchback/tracing.py +152 -0
- switchback-0.1.0.dist-info/METADATA +325 -0
- switchback-0.1.0.dist-info/RECORD +36 -0
- switchback-0.1.0.dist-info/WHEEL +5 -0
- switchback-0.1.0.dist-info/entry_points.txt +3 -0
- switchback-0.1.0.dist-info/licenses/LICENSE +21 -0
- switchback-0.1.0.dist-info/licenses/NOTICE +34 -0
- switchback-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Opt-in Playwright trace capture for the browser tiers (req 16).
|
|
2
|
+
|
|
3
|
+
Off by default. Set ``SCRAPER_TRACE_SESSION=1`` to record a Playwright trace
|
|
4
|
+
(screenshots + DOM snapshots + network) for every browser-tier attempt; each is
|
|
5
|
+
written as a self-contained zip under ``state/traces/`` and is openable with
|
|
6
|
+
``playwright show-trace <zip>``. The HTTP server exposes list / fetch / delete
|
|
7
|
+
endpoints so traces can be pulled and cleaned up on demand.
|
|
8
|
+
|
|
9
|
+
Capture is wrapped so a tracing failure never breaks a scrape — it just means no
|
|
10
|
+
trace for that attempt. Traces are heavyweight (MBs each); keep this off in
|
|
11
|
+
steady state and flip it on to debug a specific host.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
import time
|
|
19
|
+
|
|
20
|
+
from .policy.gates import host_of
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
_DEFAULT_STATE_DIR = os.path.join(
|
|
25
|
+
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "state")
|
|
26
|
+
_STATE_DIR = os.getenv("SCRAPER_STATE_DIR", _DEFAULT_STATE_DIR)
|
|
27
|
+
TRACE_DIR = os.path.join(_STATE_DIR, "traces")
|
|
28
|
+
|
|
29
|
+
# Trace ids are the zip filename stem; constrain to a safe charset so a request
|
|
30
|
+
# id can never escape TRACE_DIR.
|
|
31
|
+
_ID_RE = re.compile(r"^[A-Za-z0-9._-]+$")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def enabled() -> bool:
    """Return True when session tracing is switched on via the environment.

    Reads ``SCRAPER_TRACE_SESSION`` on every call, so the flag can be flipped
    without re-importing the module. The value is matched case-insensitively
    and with surrounding whitespace ignored; ``1``, ``true``, ``yes`` and
    ``on`` enable tracing, anything else (including unset) leaves it off.
    """
    flag = os.getenv("SCRAPER_TRACE_SESSION", "")
    return flag.strip().lower() in ("1", "true", "yes", "on")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def start(context, url: str) -> bool:
    """Start a Playwright trace on ``context``.

    Returns True only when tracing was actually started. Returns False when
    tracing is disabled, or when ``context.tracing.start`` raises for any
    reason (the failure is logged as a warning and swallowed so the scrape
    itself is unaffected). ``url`` is accepted for symmetry with ``stop`` and
    is not used here.
    """
    if enabled():
        try:
            context.tracing.start(screenshots=True, snapshots=True, sources=True)
        except Exception as exc:
            logger.warning(f"session_trace: start failed: {exc}")
        else:
            return True
    return False
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def stop(context, url: str) -> str | None:
    """Stop tracing on ``context`` and write the trace zip under ``TRACE_DIR``.

    Returns the path of the written zip, or None when tracing is disabled or
    anything goes wrong. Every fallible step (creating the directory, deriving
    the host, stopping the trace) sits inside the ``try`` so that a tracing
    problem is logged and swallowed rather than raised: this function is
    called from ``finally`` blocks in the browser tiers, where an exception
    would break the scrape and skip the remaining cleanup.
    """
    if not enabled():
        return None
    try:
        os.makedirs(TRACE_DIR, exist_ok=True)
        # The filename stem doubles as the trace id, and ids are only accepted
        # by path_for() when they match _ID_RE's charset. Replace anything else
        # (e.g. ':' from a port, '[' ']' from an IPv6 literal) so every trace
        # we write can later be fetched and deleted by id.
        host = re.sub(r"[^A-Za-z0-9._-]", "_", host_of(url) or "") or "unknown"
        name = f"{host}-{int(time.time() * 1000)}.zip"
        path = os.path.join(TRACE_DIR, name)
        context.tracing.stop(path=path)
    except Exception as e:
        logger.warning(f"session_trace: stop failed: {e}")
        return None
    logger.info(f"session_trace: wrote {path}")
    return path
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ── server-side management ───────────────────────────────────────────────────
|
|
68
|
+
|
|
69
|
+
def list_traces() -> list[dict]:
    """List captured traces, newest first.

    Each entry is ``{"id": <zip filename without ".zip">, "bytes": <size>,
    "modified": <mtime as epoch seconds>}``. Returns an empty list when
    ``TRACE_DIR`` does not exist or is not a directory. Files that disappear
    between the directory listing and the ``stat`` call (for example because
    a concurrent delete removed them) are skipped instead of raising.
    """
    try:
        names = os.listdir(TRACE_DIR)
    except (FileNotFoundError, NotADirectoryError):
        return []
    out = []
    for fn in names:
        if not fn.endswith(".zip"):
            continue
        try:
            st = os.stat(os.path.join(TRACE_DIR, fn))
        except FileNotFoundError:
            # Removed after listdir() saw it; nothing to report.
            continue
        out.append({"id": fn[:-4], "bytes": st.st_size, "modified": st.st_mtime})
    out.sort(key=lambda t: t["modified"], reverse=True)
    return out
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def path_for(trace_id: str) -> str | None:
    """Resolve a trace id to the path of its zip inside ``TRACE_DIR``.

    Returns None when the id is not a string, is empty, contains any
    character outside ``_ID_RE``'s safe charset, or when no regular file with
    that name exists. ``fullmatch`` is used rather than ``match`` because a
    ``$`` anchor also matches just before a trailing newline, which would let
    an id such as ``"abc\\n"`` through the check.
    """
    if not isinstance(trace_id, str) or not _ID_RE.fullmatch(trace_id):
        return None
    p = os.path.join(TRACE_DIR, f"{trace_id}.zip")
    # isfile, not exists: a directory that happens to end in ".zip" is not a trace.
    return p if os.path.isfile(p) else None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def delete(trace_id: str) -> bool:
    """Delete the trace zip for ``trace_id``.

    Returns True when a file was removed and False when the id is invalid or
    no such trace exists, including the case where the file vanishes between
    the lookup and the removal (e.g. two concurrent delete requests).
    """
    p = path_for(trace_id)
    if not p:
        return False
    try:
        os.remove(p)
    except FileNotFoundError:
        # Lost the race with another deleter; report "nothing deleted".
        return False
    return True
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""The cost-ordered cascade. Each tier exposes:
|
|
2
|
+
|
|
3
|
+
NAME : str
|
|
4
|
+
PAID : bool # gated/audited if True
|
|
5
|
+
fetch(url) -> str | None # markdown on success; None if not
|
|
6
|
+
# applicable; raises on failure.
|
|
7
|
+
|
|
8
|
+
Order matters — cheapest/cleanest first, paid last.
|
|
9
|
+
"""
|
|
10
|
+
from . import (tier0_apis, tier1_http, tier2_cloudscraper,
|
|
11
|
+
tier3_browser, tier3b_camoufox, tier_residential, tier4_firecrawl)
|
|
12
|
+
|
|
13
|
+
# Registry of tier modules in cascade order (cheapest/cleanest first, paid last).
TIERS = [
    tier0_apis,
    tier1_http,
    tier2_cloudscraper,
    tier3_browser,
    tier3b_camoufox,    # Firefox stealth (on by default; opt out with SCRAPER_DISABLE_CAMOUFOX; orthogonal to T3)
    tier_residential,   # residential-IP CDP browser (off unless BU_CDP_URL set)
    tier4_firecrawl,
]

# tier name -> index, for botwall winning-tier routing.
INDEX = {t.NAME: i for i, t in enumerate(TIERS)}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Shared helpers for the stealth-browser tiers (patchright, camoufox).
|
|
2
|
+
|
|
3
|
+
Not a tier itself (leading underscore, not in the TIERS registry) — just the
|
|
4
|
+
challenge-resolution mechanic both browsers need.
|
|
5
|
+
|
|
6
|
+
Akamai Bot Manager / Imperva / Kasada serve a JS-*sensor* interstitial on the
|
|
7
|
+
first load: it sets cookies (e.g. ak_bmsc, _abck) as the sensor script runs, then
|
|
8
|
+
the *real* content is only returned on a re-request. A single settle + reload
|
|
9
|
+
after the sensor runs clears them — provided the egress IP is acceptable. (A hard
|
|
10
|
+
IP block never validates `_abck` and the page stays an interstitial, so the tier
|
|
11
|
+
still falls through; this just stops us snapshotting the interstitial too early on
|
|
12
|
+
IPs that would have passed.)
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from ..normalize import html_to_markdown
|
|
17
|
+
from ..policy.gates import _looks_like_botwall
|
|
18
|
+
|
|
19
|
+
_SETTLE_MS = 5000 # let the sensor JS run and set its cookies
|
|
20
|
+
_POST_RELOAD_MS = 1500 # let the real content paint after the reload
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def looks_blocked(html: str, url: str) -> bool:
    """Report whether rendered ``html`` looks like a bot-wall / sensor interstitial.

    The DOM is first normalized to markdown (using ``url`` as the base URL)
    and the bot-wall heuristic is applied to that markdown.
    """
    rendered = html_to_markdown(html, base_url=url)
    return _looks_like_botwall(rendered)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def response_bytes(responses) -> int:
    """Sum the declared Content-Length of every response in ``responses``.

    This is the wire-byte basis for a render. Only the ``content-length``
    header is consulted; the body is deliberately never read, because reading
    it can block indefinitely on a stalled response. A response with no
    (or an unparsable) Content-Length contributes nothing, so the total is a
    slight undercount.
    """
    def _declared_length(resp) -> int:
        # Best-effort per response: any failure simply counts as zero bytes.
        try:
            raw = resp.headers.get("content-length")
            return int(raw) if raw else 0
        except Exception:
            return 0

    return sum(_declared_length(resp) for resp in responses)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def reload_through_challenge(page, url: str, timeout_ms: int) -> str:
    """Wait for the bot-manager sensor to run, re-request ``url``, return the new HTML.

    Sequence: pause ``_SETTLE_MS``, navigate to ``url`` again waiting for
    network idle (bounded by ``timeout_ms``), pause ``_POST_RELOAD_MS`` so the
    page can paint, then snapshot the DOM.
    """
    pause = page.wait_for_timeout
    pause(_SETTLE_MS)
    page.goto(url, wait_until="networkidle", timeout=timeout_ms)
    pause(_POST_RELOAD_MS)
    return page.content()
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Tier 0 — direct APIs / open mirrors.
|
|
2
|
+
|
|
3
|
+
Cheapest, cleanest, most reliable. Pattern-routed: returns None when no mirror
|
|
4
|
+
matches (caller falls through to Tier 1). Bypasses the botwall skip-list because
|
|
5
|
+
these are open, stable endpoints — not the scraped page.
|
|
6
|
+
|
|
7
|
+
Extend here with career-ops-style structured providers (greenhouse/lever/ashby).
|
|
8
|
+
Web *search* (query → URLs) is a different shape and lives in switchback/search.py
|
|
9
|
+
(local SearXNG), not in this fetch cascade.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from urllib.parse import unquote
|
|
15
|
+
from xml.etree import ElementTree as ET
|
|
16
|
+
|
|
17
|
+
from ..normalize import html_to_markdown, UA
|
|
18
|
+
from ..policy.gates import check
|
|
19
|
+
|
|
20
|
+
NAME = "tier0_apis"
|
|
21
|
+
PAID = False
|
|
22
|
+
|
|
23
|
+
ARXIV_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:v\d+)?(?:\.pdf)?", re.I)
|
|
24
|
+
WIKI_RE = re.compile(r"en\.wikipedia\.org/wiki/([^?#]+)", re.I)
|
|
25
|
+
PMC_RE = re.compile(r"pmc\.ncbi\.nlm\.nih\.gov/articles/(PMC\d+)", re.I)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def fetch(url: str) -> str | None:
    """Route ``url`` to a matching open mirror, or return None if none applies.

    Patterns are tried in a fixed order: arXiv, English Wikipedia, then PMC.
    A None return tells the caller to fall through to the next tier; errors
    from the chosen mirror propagate.
    """
    arxiv = ARXIV_RE.search(url)
    if arxiv is not None:
        return _arxiv(arxiv.group(1), url)

    wiki = WIKI_RE.search(url)
    if wiki is not None:
        return _wikipedia(wiki.group(1), url)

    # No mirror matched when the PMC pattern misses too -- fall through.
    return _europepmc(url) if PMC_RE.search(url) else None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _arxiv(arxiv_id: str, url: str) -> str:
    """Fetch title, authors and abstract for ``arxiv_id`` from the arXiv export API.

    Builds a small markdown document from the first Atom entry and passes it
    through ``check``. Raises on an HTTP error (``raise_for_status``) and
    raises ``RuntimeError`` when the feed contains no entry.
    """
    # arxiv wants plain requests + an identifying UA (their published guidance);
    # impersonating Chrome triggers aggressive 429s from their Akamai front-end.
    import requests
    r = requests.get(f"https://export.arxiv.org/api/query?id_list={arxiv_id}",
                     timeout=15,
                     headers={"User-Agent": "switchback/1.0 (mailto:akash@theaklabs.com)"})
    r.raise_for_status()
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    # Parse the raw bytes so the XML declaration's encoding is honoured instead
    # of whatever charset the HTTP layer guessed for ``r.text``.
    entry = ET.fromstring(r.content).find("atom:entry", ns)
    if entry is None:
        raise RuntimeError("arxiv: no entry in API response")
    # Feed text can be hard-wrapped across lines; collapse runs of whitespace so
    # the title stays on the single line a markdown "# " heading requires.
    title = " ".join((entry.findtext("atom:title", "", ns) or "").split())
    summary = (entry.findtext("atom:summary", "", ns) or "").strip()
    authors = [" ".join((a.findtext("atom:name", "", ns) or "").split())
               for a in entry.findall("atom:author", ns)]
    md = (f"# {title}\n\n**Authors:** {', '.join(a for a in authors if a)}\n\n"
          f"**arXiv:** {arxiv_id}\n\n## Abstract\n\n{summary}")
    return check(url, md)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _wikipedia(title: str, url: str) -> str:
    """Fetch an English Wikipedia article as HTML via the REST API and convert it.

    ``title`` is the path segment captured from the article URL (it may be
    percent-encoded). It is decoded and then re-encoded as ONE path segment
    before being placed in the REST URL: a decoded title can contain ``/``,
    ``?``, ``#`` or ``%``, and inserting those raw would change the request
    path or truncate it. Raises on an HTTP error; the markdown goes through
    ``check`` before being returned.
    """
    from urllib.parse import quote

    from curl_cffi import requests as cffi
    # safe="" so that "/" inside a title is encoded as %2F rather than being
    # treated as a path separator.
    slug = quote(unquote(title), safe="")
    r = cffi.get(f"https://en.wikipedia.org/api/rest_v1/page/html/{slug}",
                 timeout=15, impersonate="chrome")
    r.raise_for_status()
    return check(url, html_to_markdown(r.text, base_url=url))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _europepmc(url: str) -> str:
    """Fetch PMC full text for the article in ``url`` from the Europe PMC mirror.

    Raises ``RuntimeError`` when ``url`` carries no PMC id or when the response
    body is under 1000 characters (treated as "no full text"), and raises on
    an HTTP error. The converted markdown goes through ``check``.
    """
    # PMC full text via EuropePMC mirror (avoids reCAPTCHA on ncbi).
    import requests
    m = PMC_RE.search(url)
    if m is None:
        raise RuntimeError(f"europepmc: no PMC id in {url!r}")
    # PMC_RE is case-insensitive, so the captured id may be lower-case; send
    # the canonical upper-case "PMC…" form to the API.
    pmcid = m.group(1).upper()
    api = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
    r = requests.get(api, timeout=20, headers={"User-Agent": UA})
    r.raise_for_status()
    if len(r.text) < 1000:
        raise RuntimeError(f"europepmc empty: {len(r.text)}")
    return check(url, html_to_markdown(r.text, base_url=url))
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Tier 1 — plain HTTP with TLS fingerprint impersonation.
|
|
2
|
+
|
|
3
|
+
curl_cffi impersonates a real Chrome TLS handshake, which clears many naive bot
|
|
4
|
+
walls without a browser. Handles PDFs inline. Fast and cheap.
|
|
5
|
+
|
|
6
|
+
The bare "chrome" alias resolves to an old default; we pin recent targets and
|
|
7
|
+
rotate them deterministically per host, so our traffic isn't one shared JA3 yet
|
|
8
|
+
each host stays reproducible (and pairs cleanly with the session cache, which
|
|
9
|
+
records the target that won).
|
|
10
|
+
|
|
11
|
+
No User-Agent override: the impersonate target already sends a UA that matches
|
|
12
|
+
its TLS fingerprint. Overriding it with a stale string is a detection tell (TLS
|
|
13
|
+
says one Chrome version, the header says another).
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
from urllib.parse import urlsplit
|
|
19
|
+
|
|
20
|
+
from .. import session_cache
|
|
21
|
+
from ..egress import requests_proxies, add_wire_bytes
|
|
22
|
+
from ..normalize import html_to_markdown, pdf_bytes_to_text
|
|
23
|
+
from ..policy.gates import BotWall, check, is_cf_challenge
|
|
24
|
+
|
|
25
|
+
NAME = "tier1_http"
|
|
26
|
+
PAID = False
|
|
27
|
+
|
|
28
|
+
# Recent Chrome JA3 targets available in curl_cffi 0.15.x. A small spread of real
|
|
29
|
+
# versions mirrors how live traffic is distributed across Chrome releases.
|
|
30
|
+
_IMPERSONATE_TARGETS = ("chrome131", "chrome136", "chrome142")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _impersonate_for(url: str) -> str:
|
|
34
|
+
host = urlsplit(url).hostname or ""
|
|
35
|
+
h = int(hashlib.sha1(host.encode()).hexdigest(), 16)
|
|
36
|
+
return _IMPERSONATE_TARGETS[h % len(_IMPERSONATE_TARGETS)]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def fetch(url: str) -> str:
    """Fetch ``url`` over plain HTTP with a Chrome TLS fingerprint and return markdown.

    PDFs (by Content-Type or a ``.pdf`` path on the final URL) are converted
    from their bytes; everything else is treated as HTML. The result goes
    through ``check``. Raises ``BotWall`` when an error status carries a
    Cloudflare challenge, otherwise raises on any HTTP error status.
    """
    from curl_cffi import requests as cffi

    # Auth cookies only: the cached cf_clearance is UA-bound to whichever tier
    # solved it, and CF hosts route straight to Tier 2 on repeat, so replaying it
    # against Tier 1's distinct impersonate UA would be a mismatch tell.
    cookie = session_cache.cookie_header(url, include_cache=False)
    request_headers = None
    if cookie:
        request_headers = {"Cookie": cookie}

    resp = cffi.get(url, timeout=15, allow_redirects=True,
                    impersonate=_impersonate_for(url),
                    proxies=requests_proxies(), headers=request_headers)
    # Count even on a block -- failed fetches burn bandwidth too.
    add_wire_bytes(len(resp.content))

    # A Cloudflare JS challenge often returns 403/503 with the interstitial in
    # the body. Surface that as a botwall (Tier 2 can solve it) rather than a
    # hard http_block -- which the orchestrator uses to skip Tier 2 entirely.
    if resp.status_code >= 400 and is_cf_challenge(resp.headers, resp.text):
        raise BotWall("cloudflare challenge", vendor="cloudflare")
    resp.raise_for_status()

    content_type = resp.headers.get("Content-Type", "").lower()
    if "application/pdf" in content_type or resp.url.lower().split("?")[0].endswith(".pdf"):
        try:
            pdf_text = pdf_bytes_to_text(resp.content)
        finally:
            resp.close()
        return check(url, pdf_text)

    return check(url, html_to_markdown(resp.text, base_url=resp.url))
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Tier 2 — Cloudflare / anti-bot solver (cloudscraper 3.x "Enhanced Edition").
|
|
2
|
+
|
|
3
|
+
Targets the specific failure the cheaper tiers can't clear: a Cloudflare JS
|
|
4
|
+
challenge / "checking your browser" interstitial. Solves it in-process (no
|
|
5
|
+
browser), then returns the real page. First hit to a CF host sleeps ~5s.
|
|
6
|
+
|
|
7
|
+
cloudscraper 3.x clears v1/v2/v3 (JS-VM) challenges and Turnstile, with stealth
|
|
8
|
+
on by default (randomized headers, browser quirks, human-like pacing) and
|
|
9
|
+
automatic cf_clearance refresh on 403. Pinned to the GitHub Enhanced Edition;
|
|
10
|
+
PyPI is frozen at 1.2.71 (v1/v2 only, no stealth) — see pyproject.toml.
|
|
11
|
+
|
|
12
|
+
On hard CAPTCHA variants with no solver configured this raises and the cascade
|
|
13
|
+
falls through to the stealth browser (Tier 3).
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
import threading
|
|
21
|
+
|
|
22
|
+
from .. import egress, session_cache
|
|
23
|
+
from ..egress import requests_proxies
|
|
24
|
+
from ..normalize import html_to_markdown
|
|
25
|
+
from ..policy.gates import check
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
NAME = "tier2_cloudscraper"
|
|
30
|
+
PAID = False
|
|
31
|
+
|
|
32
|
+
# Wall-clock cap on the whole solve. cloudscraper 3.x *attempts* interactive
|
|
33
|
+
# Turnstile and can loop for minutes on a challenge it can't clear — far past the
|
|
34
|
+
# per-request socket timeout. Capping it here lets the cascade fall through to the
|
|
35
|
+
# stealth browser (which can handle interactive challenges) instead of burning the
|
|
36
|
+
# per-URL deadline. ~25s comfortably covers a real JS/v3 solve (~5-15s).
|
|
37
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_CLOUDSCRAPER_TIMEOUT_S", "25"))
|
|
38
|
+
|
|
39
|
+
# Stealth pacing. Kept modest: Tier 2 only fires on CF-suspected hosts, and the
|
|
40
|
+
# real latency win comes from skipping the solve entirely on repeat hits (session
|
|
41
|
+
# cache), not from long inter-request sleeps.
|
|
42
|
+
_STEALTH_OPTIONS = {
|
|
43
|
+
"min_delay": 0.5,
|
|
44
|
+
"max_delay": 1.5,
|
|
45
|
+
"human_like_delays": True,
|
|
46
|
+
"randomize_headers": True,
|
|
47
|
+
"browser_quirks": True,
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
_captcha_warned = False
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _captcha_opts() -> dict:
|
|
55
|
+
"""Opt-in third-party captcha solver (off by default). When both env vars are
|
|
56
|
+
set, cloudscraper solves Turnstile / reCAPTCHA / hCaptcha on CF hosts in-process
|
|
57
|
+
via the provider (2captcha, capsolver, capmonster, anticaptcha, deathbycaptcha,
|
|
58
|
+
9kw). PAID: the provider bills per solve. cloudscraper resets its solve counter
|
|
59
|
+
on success, so per-solve counts aren't observable here — track spend in the
|
|
60
|
+
provider's own dashboard."""
|
|
61
|
+
provider = os.getenv("SCRAPER_CAPTCHA_PROVIDER")
|
|
62
|
+
api_key = os.getenv("SCRAPER_CAPTCHA_API_KEY")
|
|
63
|
+
if not (provider and api_key):
|
|
64
|
+
return {}
|
|
65
|
+
global _captcha_warned
|
|
66
|
+
if not _captcha_warned:
|
|
67
|
+
logger.warning(f"tier2: captcha solver active (provider={provider}); "
|
|
68
|
+
"solves are billed by the provider")
|
|
69
|
+
_captcha_warned = True
|
|
70
|
+
return {"captcha": {"provider": provider, "api_key": api_key}}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _interpreter_opts() -> dict:
|
|
74
|
+
"""The v3 JS-VM challenge runs an interpreter. The 3.x default js2py is pure
|
|
75
|
+
Python — slow and prone to stalling on heavy challenges; Node runs them fast
|
|
76
|
+
and reliably. Prefer it when present, else fall back to the default."""
|
|
77
|
+
return {"interpreter": "nodejs"} if shutil.which("node") else {}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _make_scraper():
    """Create a cloudscraper session configured for a desktop Linux Chrome profile.

    Stealth is enabled with this module's pacing options, and the interpreter
    and captcha options are merged in when they apply. No User-Agent override
    is passed: the library derives the UA (and matching cipher suite) from the
    browser dict, and a stale override would contradict it.
    """
    import cloudscraper

    options = {
        "browser": {"browser": "chrome", "platform": "linux", "mobile": False},
        "enable_stealth": True,
        "stealth_options": _STEALTH_OPTIONS,
    }
    options.update(_interpreter_opts())
    options.update(_captcha_opts())
    return cloudscraper.create_scraper(**options)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _fetch(url: str) -> tuple[str, int]:
    """Solve and fetch ``url`` with cloudscraper.

    Returns ``(markdown, nbytes)`` where ``nbytes`` is the length of the
    response body; the caller unpacks both. Raises on an HTTP error status and
    propagates whatever ``check`` raises. On success the session's cookies and
    User-Agent are stored in the session cache for later requests.
    """
    scraper = _make_scraper()
    # Replay a cached cf_clearance (skips the ~5s solve) plus any auth cookies.
    cookies = session_cache.cookies_for(url, include_cache=True)
    r = scraper.get(url, timeout=20, proxies=requests_proxies(),
                    cookies=cookies or None)
    r.raise_for_status()
    nbytes = len(r.content)
    md = check(url, html_to_markdown(r.text, base_url=r.url))
    # Cleared: cache whatever cf cookies the session now holds for next time.
    session_cache.remember(url, dict(scraper.cookies),
                           ua=scraper.headers.get("User-Agent", ""))
    return md, nbytes
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def fetch(url: str) -> str:
    """Run the cloudscraper solve for ``url`` under a hard wall-clock cap.

    The blocking solve runs in a daemon worker thread that is joined for at
    most ``_TIMEOUT_S`` seconds. Raises ``TimeoutError`` if the worker is still
    running after that; otherwise re-raises whatever the worker raised, or
    returns the markdown it produced.
    """
    # Thread-locals don't inherit, so capture the egress scope here and
    # re-apply it inside the worker.
    scoped = egress.in_egress_scope()
    outcome: dict = {}

    def _solve():
        with egress.egress_scope(scoped):
            try:
                markdown, nbytes = _fetch(url)
                outcome["md"] = markdown
                outcome["bytes"] = nbytes
            except BaseException as exc:  # noqa: BLE001 — propagated to caller below
                outcome["err"] = exc

    # Daemon: an abandoned solve can't block process exit; it dies on its own
    # socket timeout shortly after.
    worker = threading.Thread(target=_solve, name="tier2-cloudscraper", daemon=True)
    worker.start()
    worker.join(_TIMEOUT_S)
    if worker.is_alive():
        raise TimeoutError(
            f"cloudscraper exceeded {_TIMEOUT_S}s (unsolvable challenge); "
            "falling through to the stealth browser")

    # Re-attribute the worker's wire bytes here, in the scope-owning thread.
    egress.add_wire_bytes(outcome.get("bytes", 0))
    if "err" in outcome:
        raise outcome["err"]
    return outcome["md"]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Tier 3 — stealth headless browser (patchright).
|
|
2
|
+
|
|
3
|
+
Renders JS-heavy SPAs. patchright is a hardened Playwright fork that evades the
|
|
4
|
+
common automation fingerprints. Tries domcontentloaded first, then networkidle
|
|
5
|
+
if the DOM is suspiciously small (lazy/JS content).
|
|
6
|
+
|
|
7
|
+
Future: (a) batch many URLs through one browser (musings does this — big perf
|
|
8
|
+
win); (b) browser-harness mode to drive the user's logged-in Chrome over CDP for
|
|
9
|
+
auth-walled pages (set BU_CDP_URL).
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from . import _browser
|
|
14
|
+
from .. import session_cache, session_trace
|
|
15
|
+
from ..concurrency import browser_slot
|
|
16
|
+
from ..egress import playwright_proxy, add_wire_bytes
|
|
17
|
+
from ..normalize import html_to_markdown
|
|
18
|
+
from ..policy.gates import check
|
|
19
|
+
|
|
20
|
+
NAME = "tier3_browser"
|
|
21
|
+
PAID = False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def fetch(url: str, timeout_ms: int = 15000) -> str:
    """Render ``url`` in a headless patchright Chromium and return markdown.

    Loads to ``domcontentloaded``; if the DOM is under 5000 characters it
    additionally waits (best-effort, up to 8s) for network idle and re-reads
    the DOM. If the page looks like a bot-wall interstitial it settles and
    reloads once. The markdown goes through ``check`` before being returned.

    Cleanup always runs: wire bytes are recorded and the trace is stopped even
    when the render fails, and the browser is closed even if that bookkeeping
    itself raises.
    """
    from patchright.sync_api import sync_playwright
    with browser_slot(NAME), sync_playwright() as p:
        browser = p.chromium.launch(headless=True, proxy=playwright_proxy())
        ctx = None
        responses: list = []
        try:
            # No user_agent override: patchright ships a real, internally
            # consistent Chromium fingerprint; overriding the UA desyncs it from
            # the engine version / client hints and defeats the stealth fork.
            ctx = browser.new_context()
            session_trace.start(ctx, url)
            auth = session_cache.browser_cookies(url)
            if auth:
                ctx.add_cookies(auth)
            page = ctx.new_page()
            page.on("response", responses.append)
            page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
            html = page.content()
            if len(html) < 5000:
                # Suspiciously small DOM: give lazy/JS content a chance to load.
                # A timeout here is fine -- we just use whatever is rendered.
                try:
                    page.wait_for_load_state("networkidle", timeout=8000)
                except Exception:
                    pass
                html = page.content()
            # A JS bot-manager (Akamai/Imperva/…) may serve a sensor interstitial
            # first; settle + reload once to get the real page on an acceptable IP.
            if _browser.looks_blocked(html, page.url or url):
                html = _browser.reload_through_challenge(page, url, timeout_ms)
            md = html_to_markdown(html, base_url=page.url or url)
        finally:
            try:
                # Account for bandwidth on failures too: a render that times out
                # or is blocked has still pulled bytes over the wire.
                add_wire_bytes(_browser.response_bytes(responses))
                if ctx is not None:
                    session_trace.stop(ctx, url)
            finally:
                # Never leak the browser process, whatever happened above.
                browser.close()
    return check(url, md)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Tier 3b — Camoufox (hardened Firefox), env-gated stealth.
|
|
2
|
+
|
|
3
|
+
An *orthogonal* fingerprint to the Chromium patchright tier: Camoufox patches
|
|
4
|
+
stealth at the C++ level and is best-in-class for **headless** detection evasion,
|
|
5
|
+
so it can clear hosts where the Chromium browser still gets blocked. It's the
|
|
6
|
+
slowest rung we own (~40s on a hard Cloudflare solve), but it only fires after
|
|
7
|
+
the three cheaper tiers AND patchright all miss, so easy traffic never pays for
|
|
8
|
+
it. ON by default — opt out with SCRAPER_DISABLE_CAMOUFOX=1.
|
|
9
|
+
|
|
10
|
+
Needs its Firefox build (`camoufox fetch`); if absent the launch raises and the
|
|
11
|
+
cascade falls through to Firecrawl. Tried after the Chromium browser misses,
|
|
12
|
+
before the paid Firecrawl tier.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
from . import _browser
|
|
20
|
+
from .. import session_cache, session_trace
|
|
21
|
+
from ..concurrency import browser_slot
|
|
22
|
+
from ..egress import playwright_proxy, add_wire_bytes
|
|
23
|
+
from ..normalize import html_to_markdown
|
|
24
|
+
from ..policy.gates import check
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
NAME = "tier3b_camoufox"
|
|
29
|
+
PAID = False
|
|
30
|
+
|
|
31
|
+
_TIMEOUT_MS = int(os.getenv("SCRAPER_CAMOUFOX_TIMEOUT_MS", "45000"))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def disabled() -> bool:
    """Return True when the Camoufox tier has been opted out of.

    The tier is on by default. It is disabled by setting
    ``SCRAPER_DISABLE_CAMOUFOX`` to any value other than an explicit
    "off" spelling: unset, empty, ``0``, ``false``, ``no`` and ``off``
    (case-insensitive, whitespace ignored) all leave the tier enabled, so
    ``SCRAPER_DISABLE_CAMOUFOX=0`` no longer disables it by accident.
    """
    value = os.getenv("SCRAPER_DISABLE_CAMOUFOX", "")
    return value.strip().lower() not in ("", "0", "false", "no", "off")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _geoip_available() -> bool:
|
|
40
|
+
"""camoufox's geoip matching needs the `geoip2` package (the camoufox[geoip]
|
|
41
|
+
extra). Without it, requesting geoip raises and kills the whole tier — so we
|
|
42
|
+
probe and degrade gracefully instead."""
|
|
43
|
+
try:
|
|
44
|
+
import geoip2 # noqa: F401
|
|
45
|
+
return True
|
|
46
|
+
except Exception:
|
|
47
|
+
return False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _launch_opts() -> dict:
    """Assemble the keyword arguments used to launch Camoufox.

    Always headless, humanized, with the OS drawn from windows/macos/linux.
    When a proxy is configured it is passed through, and geoip matching is
    switched on so timezone/locale/geolocation follow the proxy's IP -- but
    only if the geoip extra is importable; otherwise a warning is logged and
    geoip is left off.
    """
    opts: dict = {
        "headless": True,
        "humanize": True,
        "os": ["windows", "macos", "linux"],
    }
    proxy = playwright_proxy()
    if not proxy:
        return opts

    opts["proxy"] = proxy
    if not _geoip_available():
        logger.warning("camoufox: proxy set but geoip extra missing "
                       "(pip install camoufox[geoip]); locale/timezone won't "
                       "match the proxy IP — a possible detection tell")
        return opts

    opts["geoip"] = True
    return opts
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def fetch(url: str) -> str:
    """Render ``url`` in Camoufox (hardened Firefox) and return markdown.

    Navigates waiting for network idle (bounded by ``_TIMEOUT_MS``); if the
    page looks like a bot-wall interstitial it settles and reloads once. The
    markdown goes through ``check`` before being returned.

    The base URL used for bot-wall detection and for markdown conversion is
    the page's final URL (falling back to ``url``), so relative links resolve
    correctly after a redirect. Cleanup always runs: wire bytes are recorded
    and the trace stopped even when the render fails, and the page is closed
    even if that bookkeeping raises.
    """
    from camoufox.sync_api import Camoufox
    with browser_slot(NAME), Camoufox(**_launch_opts()) as browser:
        page = browser.new_page()
        responses: list = []
        page.on("response", responses.append)
        try:
            session_trace.start(page.context, url)
            auth = session_cache.browser_cookies(url)
            if auth:
                page.context.add_cookies(auth)
            page.goto(url, wait_until="networkidle", timeout=_TIMEOUT_MS)
            html = page.content()
            # Capture the post-redirect URL while the page is still open.
            final_url = page.url or url
            # JS bot-manager sensor interstitial → settle + reload once.
            if _browser.looks_blocked(html, final_url):
                html = _browser.reload_through_challenge(page, url, _TIMEOUT_MS)
                final_url = page.url or url
        finally:
            try:
                # Count bandwidth on failures too: a blocked or timed-out render
                # has still pulled bytes over the wire.
                add_wire_bytes(_browser.response_bytes(responses))
                session_trace.stop(page.context, url)
            finally:
                page.close()
    return check(url, html_to_markdown(html, base_url=final_url))
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Tier 4 — Firecrawl (paid, last resort).
|
|
2
|
+
|
|
3
|
+
Env-gated: set SCRAPER_DISABLE_FIRECRAWL to skip this tier entirely (URL is then
|
|
4
|
+
dropped). Every invocation is audited and feeds the botwall promotion counter, so
|
|
5
|
+
hosts that keep needing it get auto-skipped. Needs FIRECRAWL_API_KEY.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import threading
|
|
11
|
+
|
|
12
|
+
from ..policy.gates import check
|
|
13
|
+
|
|
14
|
+
NAME = "tier4_firecrawl"
|
|
15
|
+
PAID = True
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def disabled() -> bool:
    """Return True when the Firecrawl tier has been switched off.

    Controlled by ``SCRAPER_DISABLE_FIRECRAWL``. Unset, empty, ``0``,
    ``false``, ``no`` and ``off`` (case-insensitive, whitespace ignored) leave
    the tier enabled; any other value disables it. Treating ``0``/``false`` as
    "not disabled" avoids switching the tier off with a value that was meant
    to keep it on.
    """
    value = os.getenv("SCRAPER_DISABLE_FIRECRAWL", "")
    return value.strip().lower() not in ("", "0", "false", "no", "off")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _scrape(url: str) -> str:
    """Scrape ``url`` through Firecrawl and return its markdown.

    Requires ``FIRECRAWL_API_KEY`` in the environment (a missing key raises
    ``KeyError``). The SDK result is read via ``model_dump()`` when available,
    used directly when it is already a dict, and treated as empty otherwise.
    The stripped markdown goes through ``check`` before being returned.
    """
    from firecrawl import Firecrawl

    client = Firecrawl(api_key=os.environ["FIRECRAWL_API_KEY"])
    doc = client.scrape(url, formats=["markdown"])

    if hasattr(doc, "model_dump"):
        payload = doc.model_dump()
    elif isinstance(doc, dict):
        payload = doc
    else:
        payload = {}

    markdown = payload.get("markdown") or ""
    return check(url, markdown.strip())
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def fetch(url: str) -> str:
    """Scrape ``url`` with Firecrawl on a dedicated worker thread.

    The Firecrawl SDK sets an asyncio event loop on the thread that calls it,
    which would make a later sync-Playwright browser tier in the same batch
    raise "Sync API inside the asyncio loop". Running the scrape in its own
    thread confines that loop. The call blocks until the worker finishes, then
    re-raises whatever it raised or returns its markdown.
    """
    outcome: dict = {}

    def _run():
        try:
            outcome["md"] = _scrape(url)
        except BaseException as exc:  # noqa: BLE001 — re-raised to the caller below
            outcome["err"] = exc

    worker = threading.Thread(target=_run, name="tier4-firecrawl", daemon=True)
    worker.start()
    worker.join()

    if "err" in outcome:
        raise outcome["err"]
    return outcome["md"]
|