switchback 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- switchback/__init__.py +12 -0
- switchback/__main__.py +4 -0
- switchback/api.py +81 -0
- switchback/concurrency.py +37 -0
- switchback/content_cache.py +94 -0
- switchback/egress.py +108 -0
- switchback/extract.py +56 -0
- switchback/flags.py +96 -0
- switchback/normalize.py +81 -0
- switchback/orchestrator.py +343 -0
- switchback/policy/__init__.py +0 -0
- switchback/policy/botwall.py +393 -0
- switchback/policy/gates.py +173 -0
- switchback/py.typed +0 -0
- switchback/reporting.py +236 -0
- switchback/search.py +39 -0
- switchback/server.py +114 -0
- switchback/session_cache.py +274 -0
- switchback/session_trace.py +96 -0
- switchback/tiers/__init__.py +24 -0
- switchback/tiers/_browser.py +50 -0
- switchback/tiers/tier0_apis.py +77 -0
- switchback/tiers/tier1_http.py +65 -0
- switchback/tiers/tier2_cloudscraper.py +135 -0
- switchback/tiers/tier3_browser.py +59 -0
- switchback/tiers/tier3b_camoufox.py +89 -0
- switchback/tiers/tier4_firecrawl.py +48 -0
- switchback/tiers/tier_residential.py +57 -0
- switchback/tracing.py +152 -0
- switchback-0.1.0.dist-info/METADATA +325 -0
- switchback-0.1.0.dist-info/RECORD +36 -0
- switchback-0.1.0.dist-info/WHEEL +5 -0
- switchback-0.1.0.dist-info/entry_points.txt +3 -0
- switchback-0.1.0.dist-info/licenses/LICENSE +21 -0
- switchback-0.1.0.dist-info/licenses/NOTICE +34 -0
- switchback-0.1.0.dist-info/top_level.txt +1 -0
switchback/reporting.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""Metrics rollups from the engine's own state — no external store needed.
|
|
2
|
+
|
|
3
|
+
Reads the two files the policy already writes (see switchback.policy.botwall):
|
|
4
|
+
|
|
5
|
+
state/botwall_events.jsonl — one row per *tier attempt*:
|
|
6
|
+
{ts, url, host, tier, outcome, md_len, latency_ms, error, status_code, challenge}
|
|
7
|
+
state/botwall_db.json — per-host record incl. challenge_counts, winning_tier
|
|
8
|
+
|
|
9
|
+
and produces the metrics the firecrawl-replacement case is argued on:
|
|
10
|
+
|
|
11
|
+
• cost savings vs Firecrawl (free-tier wins are money not spent; hard pages
|
|
12
|
+
Firecrawl bills more credits for are weighted by HARD_MULT)
|
|
13
|
+
• latency — overall and per tier and per domain (mean/median/min/max/p50/p95)
|
|
14
|
+
• coverage (unique-URL success rate)
|
|
15
|
+
• error codes by domain
|
|
16
|
+
• challenges / bot-walls by domain
|
|
17
|
+
|
|
18
|
+
Pure functions returning JSON-serialisable dicts, so the same rollup backs the
|
|
19
|
+
CLI (scrape_stats.py), the HTTP API (/metrics), and the periodic digest
|
|
20
|
+
(switchback.flags). One tier attempt ≈ sequential wall-clock, so a URL's total
|
|
21
|
+
latency is approximated by summing its attempts' latencies.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
import os
|
|
27
|
+
from collections import defaultdict
|
|
28
|
+
from datetime import datetime, timezone
|
|
29
|
+
from statistics import mean, median
|
|
30
|
+
|
|
31
|
+
from .policy import botwall
|
|
32
|
+
|
|
33
|
+
# Cost model (estimates — override to your real Firecrawl rates).
|
|
34
|
+
# FIRECRAWL_USD $ per basic Firecrawl scrape (matches benchmark.py's default)
|
|
35
|
+
# HARD_MULT credit multiplier Firecrawl charges for stealth/JS-rendered
|
|
36
|
+
# pages — the hard ones our browser/residential tiers resolve
|
|
37
|
+
# for free. Firecrawl's stealth proxy is ~5× basic, hence 5.
|
|
38
|
+
FIRECRAWL_USD = float(os.getenv("BENCH_FIRECRAWL_USD", "0.001"))
|
|
39
|
+
HARD_MULT = float(os.getenv("BENCH_FIRECRAWL_HARD_MULT", "5"))
|
|
40
|
+
|
|
41
|
+
# Tiers whose win means Firecrawl would have billed the hard (stealth) rate.
|
|
42
|
+
_HARD_TIERS = {"tier3_browser", "tier3b_camoufox", "tier_residential", "tier4_firecrawl"}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _parse_ts(ts: str) -> datetime | None:
|
|
46
|
+
try:
|
|
47
|
+
dt = datetime.fromisoformat(ts)
|
|
48
|
+
return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
|
|
49
|
+
except (ValueError, TypeError):
|
|
50
|
+
return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _percentile(values: list[int], p: float) -> int:
|
|
54
|
+
if not values:
|
|
55
|
+
return 0
|
|
56
|
+
s = sorted(values)
|
|
57
|
+
idx = max(0, min(len(s) - 1, int(len(s) * p / 100)))
|
|
58
|
+
return s[idx]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _stats(values: list[int]) -> dict:
|
|
62
|
+
"""min/max/mean/median/p50/p95 over a list of latencies (ms)."""
|
|
63
|
+
if not values:
|
|
64
|
+
return {"count": 0, "min": 0, "max": 0, "mean": 0, "median": 0, "p50": 0, "p95": 0}
|
|
65
|
+
return {
|
|
66
|
+
"count": len(values),
|
|
67
|
+
"min": min(values),
|
|
68
|
+
"max": max(values),
|
|
69
|
+
"mean": round(mean(values)),
|
|
70
|
+
"median": round(median(values)),
|
|
71
|
+
"p50": _percentile(values, 50),
|
|
72
|
+
"p95": _percentile(values, 95),
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def load_events(path: str | None = None, since: datetime | None = None) -> list[dict]:
    """Read botwall_events.jsonl (optionally only rows at/after `since`).

    Args:
        path: JSONL file to read; defaults to ``botwall.EVENTS_PATH``.
        since: When given, rows whose ``ts`` is missing, unparseable or
            earlier than this instant are skipped. A naive value is taken as
            UTC, matching how ``_parse_ts`` treats naive row timestamps.

    Returns:
        The decoded rows in file order. Blank lines, lines that are not valid
        JSON, and JSON values that are not objects are skipped. A missing file
        yields an empty list.
    """
    path = path or botwall.EVENTS_PATH
    if since is not None and since.tzinfo is None:
        # Comparing a naive cutoff with aware row timestamps raises TypeError.
        since = since.replace(tzinfo=timezone.utc)
    events: list[dict] = []
    try:
        f = open(path, encoding="utf-8")
    except FileNotFoundError:
        # No events recorded yet (or the file vanished between runs).
        return events
    with f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(row, dict):
                # Valid JSON but not an event object; every consumer calls .get().
                continue
            if since is not None:
                dt = _parse_ts(row.get("ts", ""))
                if dt is None or dt < since:
                    continue
            events.append(row)
    return events
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _by_url(events: list[dict]) -> dict[str, list[dict]]:
|
|
100
|
+
out: dict[str, list[dict]] = defaultdict(list)
|
|
101
|
+
for e in events:
|
|
102
|
+
out[e.get("url") or "?"].append(e)
|
|
103
|
+
return out
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _url_total_ms(attempts: list[dict]) -> int:
|
|
107
|
+
"""Per-URL wall-clock proxy: tiers run sequentially, so sum the attempts."""
|
|
108
|
+
return sum(a["latency_ms"] for a in attempts if isinstance(a.get("latency_ms"), int))
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _coverage(by_url: dict[str, list[dict]]) -> dict:
|
|
112
|
+
succeeded = {u: a for u, a in by_url.items()
|
|
113
|
+
if any(e.get("outcome") == "ok" for e in a)}
|
|
114
|
+
total = len(by_url)
|
|
115
|
+
return {
|
|
116
|
+
"unique_urls": total,
|
|
117
|
+
"succeeded": len(succeeded),
|
|
118
|
+
"failed": total - len(succeeded),
|
|
119
|
+
"success_pct": round(100 * len(succeeded) / total, 1) if total else 0.0,
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _per_tier(events: list[dict]) -> dict:
|
|
124
|
+
tiers: dict[str, list[dict]] = defaultdict(list)
|
|
125
|
+
for e in events:
|
|
126
|
+
tiers[e.get("tier") or "?"].append(e)
|
|
127
|
+
out = {}
|
|
128
|
+
for tier, evs in tiers.items():
|
|
129
|
+
ok = [e for e in evs if e.get("outcome") == "ok"]
|
|
130
|
+
lats = [e["latency_ms"] for e in ok if isinstance(e.get("latency_ms"), int)]
|
|
131
|
+
out[tier] = {
|
|
132
|
+
"attempts": len(evs),
|
|
133
|
+
"ok": len(ok),
|
|
134
|
+
"miss": len(evs) - len(ok),
|
|
135
|
+
"ok_pct": round(100 * len(ok) / len(evs), 1) if evs else 0.0,
|
|
136
|
+
"latency_ms": _stats(lats),
|
|
137
|
+
}
|
|
138
|
+
return out
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _outcomes(events: list[dict]) -> dict:
|
|
142
|
+
counts: dict[str, int] = defaultdict(int)
|
|
143
|
+
for e in events:
|
|
144
|
+
counts[e.get("outcome") or "?"] += 1
|
|
145
|
+
return dict(sorted(counts.items(), key=lambda kv: -kv[1]))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _cost(by_url: dict[str, list[dict]]) -> dict:
    """Compare engine spend with a scrape-everything-via-Firecrawl baseline.

    Per unique URL:
      engine cost   = FIRECRAWL_USD for each attempt whose outcome is
                      ``firecrawl_used``
      baseline cost = FIRECRAWL_USD, multiplied by HARD_MULT when any attempt
                      ran on a tier in _HARD_TIERS or recorded a challenge
    Savings are baseline minus engine. Every URL, successful or not, adds to
    the baseline.
    """
    engine_usd = 0.0
    baseline_usd = 0.0
    invocations = 0
    for attempts in by_url.values():
        paid_calls = len([a for a in attempts if a.get("outcome") == "firecrawl_used"])
        invocations += paid_calls
        engine_usd += paid_calls * FIRECRAWL_USD

        needs_hard = False
        for a in attempts:
            if (a.get("tier") in _HARD_TIERS) or a.get("challenge"):
                needs_hard = True
                break
        baseline_usd += FIRECRAWL_USD * (HARD_MULT if needs_hard else 1)

    if baseline_usd:
        saved_pct = round(100 * (baseline_usd - engine_usd) / baseline_usd, 1)
    else:
        saved_pct = 0.0
    return {
        "firecrawl_usd_per_scrape": FIRECRAWL_USD,
        "hard_multiplier": HARD_MULT,
        "firecrawl_invocations": invocations,
        "engine_cost_usd": round(engine_usd, 4),
        "firecrawl_baseline_usd": round(baseline_usd, 4),
        "savings_usd": round(baseline_usd - engine_usd, 4),
        "savings_pct": saved_pct,
    }
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _domains(events: list[dict], by_url: dict[str, list[dict]], db: dict) -> dict:
|
|
179
|
+
"""Per-host (== per-subdomain) rollup: attempts, error codes, challenges,
|
|
180
|
+
latency. Challenges come from the durable per-host counts in botwall_db.json
|
|
181
|
+
so they reflect all history, not just the events window."""
|
|
182
|
+
host_attempts: dict[str, list[dict]] = defaultdict(list)
|
|
183
|
+
for e in events:
|
|
184
|
+
host_attempts[e.get("host") or "?"].append(e)
|
|
185
|
+
|
|
186
|
+
# Per-URL total latency grouped by host (host taken from the URL's events).
|
|
187
|
+
host_url_ms: dict[str, list[int]] = defaultdict(list)
|
|
188
|
+
for attempts in by_url.values():
|
|
189
|
+
host = next((a.get("host") for a in attempts if a.get("host")), "?")
|
|
190
|
+
host_url_ms[host].append(_url_total_ms(attempts))
|
|
191
|
+
|
|
192
|
+
hosts_db = db.get("hosts", {})
|
|
193
|
+
out = {}
|
|
194
|
+
for host, evs in host_attempts.items():
|
|
195
|
+
error_codes: dict[str, int] = defaultdict(int)
|
|
196
|
+
for e in evs:
|
|
197
|
+
sc = e.get("status_code")
|
|
198
|
+
if sc:
|
|
199
|
+
error_codes[str(sc)] += 1
|
|
200
|
+
rec = hosts_db.get(host, {})
|
|
201
|
+
out[host] = {
|
|
202
|
+
"attempts": len(evs),
|
|
203
|
+
"ok": sum(1 for e in evs if e.get("outcome") == "ok"),
|
|
204
|
+
"winning_tier": rec.get("winning_tier"),
|
|
205
|
+
"needs_egress": bool(rec.get("needs_egress")),
|
|
206
|
+
"error_codes": dict(error_codes),
|
|
207
|
+
"challenges": dict(rec.get("challenge_counts", {})),
|
|
208
|
+
"latency_ms": _stats(host_url_ms.get(host, [])),
|
|
209
|
+
}
|
|
210
|
+
return dict(sorted(out.items(), key=lambda kv: -kv[1]["attempts"]))
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def build_report(events: list[dict] | None = None, db: dict | None = None,
                 since: datetime | None = None) -> dict:
    """Assemble the full metrics rollup.

    ``events`` and ``db`` are read from the state files when omitted;
    ``since`` is applied only when the events are loaded here.
    """
    if events is None:
        events = load_events(since=since)
    if db is None:
        db = botwall.load_db()

    grouped = _by_url(events)
    per_url_ms = [_url_total_ms(attempts) for attempts in grouped.values()]

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "events": len(events),
    }
    report["coverage"] = _coverage(grouped)
    report["cost"] = _cost(grouped)
    report["latency_overall_ms"] = _stats(per_url_ms)
    report["latency_per_tier"] = _per_tier(events)
    report["outcomes"] = _outcomes(events)
    report["domains"] = _domains(events, grouped, db)
    return report
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def domain_report(since: datetime | None = None) -> dict:
    """Return only the ``domains`` section of the full report."""
    report = build_report(since=since)
    return report["domains"]
|
switchback/search.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Web search via a local SearXNG instance (query → ranked URLs).
|
|
2
|
+
|
|
3
|
+
Distinct from the scrape cascade: scrape fetches one *known* URL; search
|
|
4
|
+
*discovers* URLs from a query. Returns lightweight results a caller can then
|
|
5
|
+
feed into scrape(). Points at the SearXNG container on localhost:8888 (override
|
|
6
|
+
with SEARXNG_URL); requires its JSON format to be enabled.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
from .tracing import span
|
|
14
|
+
|
|
15
|
+
SEARXNG_URL = os.getenv("SEARXNG_URL", "http://localhost:8888")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class SearchResult:
    """One search hit, as produced by ``search()``."""

    # Result title.
    title: str
    # Result URL (the value a caller would feed into scrape()).
    url: str
    # Text excerpt, taken from the upstream result's "content" field.
    snippet: str
    # Upstream result's "engine" field.
    engine: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def search(query: str, n: int = 10) -> list[SearchResult]:
    """Query the SearXNG instance at ``SEARXNG_URL`` and return up to n results.

    Args:
        query: Search query string.
        n: Maximum number of results; values below zero are treated as zero
            (a negative slice bound would otherwise drop results from the
            end instead of limiting the count).

    Returns:
        Results in the order the server returned them. Fields that are
        missing or JSON ``null`` upstream become empty strings, so every
        ``SearchResult`` field is a ``str`` as declared.

    Raises:
        requests.HTTPError: When the server answers with an error status
            (via ``raise_for_status``). Other ``requests`` errors propagate.
    """
    import requests

    limit = max(0, n)
    with span("search", **{"search.query": query, "search.n": n}) as sp:
        r = requests.get(f"{SEARXNG_URL}/search",
                         params={"q": query, "format": "json"}, timeout=15)
        r.raise_for_status()
        # "results" may be absent or null; both mean no hits.
        raw = (r.json().get("results") or [])[:limit]
        results = [SearchResult(title=x.get("title") or "",
                                url=x.get("url") or "",
                                snippet=x.get("content") or "",
                                engine=x.get("engine") or "")
                   for x in raw]
        sp.set("search.n_results", len(results))
        return results
|
switchback/server.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""HTTP service — the language-agnostic way to use the engine.
|
|
2
|
+
|
|
3
|
+
Wraps the same public functions as the library/CLI (`switchback.scrape`,
|
|
4
|
+
`switchback.search`) so any app, in any language, can hit one warm process (which
|
|
5
|
+
keeps the Tier-3 browser pool hot) instead of cold-starting a subprocess.
|
|
6
|
+
|
|
7
|
+
pip install -e .[server] # fastapi + uvicorn
|
|
8
|
+
switchback-server # or: python -m switchback.server
|
|
9
|
+
curl localhost:8799/healthz
|
|
10
|
+
curl -s localhost:8799/scrape -d '{"urls":["https://example.com"]}'
|
|
11
|
+
curl 'localhost:8799/search?q=web+scraping'
|
|
12
|
+
|
|
13
|
+
Local-first by design: no auth, no rate-limiting. Put it behind your own gateway
|
|
14
|
+
if you expose it. Host/port via SCRAPER_HOST / SCRAPER_PORT.
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
|
|
20
|
+
from datetime import datetime, timedelta, timezone
|
|
21
|
+
|
|
22
|
+
from fastapi import FastAPI, HTTPException
|
|
23
|
+
from fastapi.responses import FileResponse
|
|
24
|
+
from pydantic import BaseModel
|
|
25
|
+
|
|
26
|
+
from . import session_trace
|
|
27
|
+
from .api import scrape
|
|
28
|
+
from .reporting import build_report, domain_report
|
|
29
|
+
from .search import search
|
|
30
|
+
from .tracing import setup_logs
|
|
31
|
+
|
|
32
|
+
app = FastAPI(title="switchback", version="0.1.0")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ScrapeRequest(BaseModel):
    """Request body for ``POST /scrape``."""

    # URLs to run through the scrape cascade.
    urls: list[str]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@app.get("/healthz")
def healthz() -> dict:
    """Liveness probe: always answers with an ``ok`` status."""
    payload = {"status": "ok"}
    return payload
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@app.post("/scrape")
def scrape_endpoint(req: ScrapeRequest) -> list[dict]:
    """Run the requested URLs through the cascade.

    Returns one record per result yielded by ``scrape`` (successes only;
    failed URLs are omitted).
    """
    payload = []
    for hit in scrape(req.urls):
        payload.append({
            "url": hit.url,
            "source_method": hit.source_method,
            "markdown": hit.markdown,
        })
    return payload
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@app.get("/search")
def search_endpoint(q: str) -> list[dict]:
    """Query → ranked URLs (SearXNG); results can be fed back into /scrape."""
    payload = []
    for hit in search(q):
        payload.append({
            "title": hit.title,
            "url": hit.url,
            "snippet": hit.snippet,
            "engine": hit.engine,
        })
    return payload
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _since(minutes: int | None) -> datetime | None:
|
|
59
|
+
return datetime.now(timezone.utc) - timedelta(minutes=minutes) if minutes else None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@app.get("/metrics")
def metrics_endpoint(minutes: int | None = None) -> dict:
    """Full metrics rollup built from the engine's own state.

    Covers cost savings vs Firecrawl, coverage, overall and per-tier latency,
    outcomes and per-domain detail. ``?minutes=N`` windows the event-derived
    sections.
    """
    window_start = _since(minutes)
    return build_report(since=window_start)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@app.get("/metrics/domains")
def metrics_domains_endpoint(minutes: int | None = None) -> dict:
    """Per-domain table only: error codes, challenges/bot-walls, latency by host."""
    window_start = _since(minutes)
    return domain_report(since=window_start)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@app.get("/traces")
def list_traces_endpoint() -> list[dict]:
    """List captured Playwright session traces (opt-in via SCRAPER_TRACE_SESSION)."""
    traces = session_trace.list_traces()
    return traces
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@app.get("/traces/{trace_id}")
def get_trace_endpoint(trace_id: str):
    """Serve one trace zip (open with `playwright show-trace <zip>`).

    Responds 404 when ``session_trace.path_for`` returns nothing for the id.
    """
    zip_path = session_trace.path_for(trace_id)
    if zip_path:
        return FileResponse(zip_path, media_type="application/zip",
                            filename=f"{trace_id}.zip")
    raise HTTPException(status_code=404, detail="trace not found")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@app.delete("/traces/{trace_id}")
def delete_trace_endpoint(trace_id: str) -> dict:
    """Delete a trace zip; 404 when ``session_trace.delete`` reports nothing removed."""
    removed = session_trace.delete(trace_id)
    if removed:
        return {"deleted": trace_id}
    raise HTTPException(status_code=404, detail="trace not found")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def main() -> None:
    """Console entry point: configure logging and serve ``app`` with uvicorn.

    Host and port come from SCRAPER_HOST / SCRAPER_PORT (defaults 0.0.0.0 and
    8799). The service has no auth or rate limiting, so a warning is logged
    whenever it binds to anything other than a loopback address. The default
    bind address itself is unchanged.

    Raises:
        ValueError: If SCRAPER_PORT is not an integer.
    """
    import logging
    import uvicorn

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    setup_logs()  # ship logs to the OTLP backend too when configured
    host = os.getenv("SCRAPER_HOST", "0.0.0.0")
    port = int(os.getenv("SCRAPER_PORT", "8799"))
    if host not in ("127.0.0.1", "localhost", "::1"):
        # Every endpoint, including DELETE /traces/{id}, is reachable unauthenticated.
        logging.getLogger(__name__).warning(
            "switchback server listening on %s:%d with no auth or rate limiting; "
            "set SCRAPER_HOST=127.0.0.1 or put it behind a gateway", host, port)
    uvicorn.run(
        app,
        host=host,
        port=port,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
if __name__ == "__main__":
|
|
114
|
+
main()
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Cookie provisioning for the HTTP/browser tiers — two sources, merged per hit.
|
|
2
|
+
|
|
3
|
+
1. Session cache (cf_clearance reuse). After Tier 2 solves a Cloudflare
|
|
4
|
+
challenge its ``cf_*`` cookies are cached per ``(host, egress_scope)`` and
|
|
5
|
+
replayed on later hits so the ~5s solve is skipped. cf_clearance is bound to
|
|
6
|
+
the UA that solved it and the egress IP — hence the scope key and the stored
|
|
7
|
+
UA. Entries expire after ``SCRAPER_SESSION_TTL_S`` (cf_clearance's typical
|
|
8
|
+
lifetime); a re-detected wall calls ``forget()`` so a stale cookie self-heals
|
|
9
|
+
into a fresh solve. Disable with ``SCRAPER_DISABLE_SESSION_CACHE=1``.
|
|
10
|
+
|
|
11
|
+
2. Auth import (``SCRAPER_COOKIES_FILE``). A Netscape ``cookies.txt`` the user
|
|
12
|
+
exports from a logged-in browser; domain-matching cookies are sent so the
|
|
13
|
+
tiers can fetch pages behind a login. Opt-in; unset/absent → no-op.
|
|
14
|
+
|
|
15
|
+
The cf cache layers on top of auth cookies for the same request. Reads are in
|
|
16
|
+
memory; writes are write-through to ``state/session_cache.json`` (atomic).
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import importlib
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
import threading
|
|
25
|
+
import time
|
|
26
|
+
from http.cookiejar import MozillaCookieJar
|
|
27
|
+
from urllib.parse import urlsplit
|
|
28
|
+
|
|
29
|
+
from . import egress
|
|
30
|
+
from .policy.gates import host_of
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
_DEFAULT_STATE_DIR = os.path.join(
|
|
35
|
+
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "state")
|
|
36
|
+
_STATE_DIR = os.getenv("SCRAPER_STATE_DIR", _DEFAULT_STATE_DIR)
|
|
37
|
+
CACHE_PATH = os.path.join(_STATE_DIR, "session_cache.json")
|
|
38
|
+
|
|
39
|
+
_TTL_S = float(os.getenv("SCRAPER_SESSION_TTL_S", "1800"))
|
|
40
|
+
|
|
41
|
+
# cf_clearance is the durable one; __cf_bm / cf_chl_* are the supporting set.
|
|
42
|
+
def _is_cf_cookie(name: str) -> bool:
|
|
43
|
+
return name == "cf_clearance" or name.startswith("__cf") or name.startswith("cf_chl")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _enabled() -> bool:
|
|
47
|
+
return os.getenv("SCRAPER_DISABLE_SESSION_CACHE") not in ("1", "true", "True")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── cf_clearance session cache ────────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
_LOCK = threading.Lock()
|
|
53
|
+
_DB: dict | None = None # {"version": 1, "entries": {"<host>\t<scope>": {...}}}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _load() -> dict:
    """Return the in-memory cache DB, reading ``CACHE_PATH`` on first use.

    The file's content replaces the empty default only after it has been
    validated as a JSON object whose ``entries`` (and ``logins``, if present)
    are objects. Previously a payload such as ``[]`` was assigned before
    ``setdefault`` raised, so the "starting fresh" path kept the bad value
    and every later ``db["entries"]`` access failed.

    Returns:
        The shared DB dict; callers mutate it in place while holding ``_LOCK``.
    """
    global _DB
    if _DB is not None:
        return _DB
    db: dict = {"version": 1, "entries": {}}
    if os.path.exists(CACHE_PATH):
        try:
            with open(CACHE_PATH, encoding="utf-8") as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError(f"expected a JSON object, got {type(loaded).__name__}")
            if not isinstance(loaded.setdefault("entries", {}), dict):
                raise ValueError("'entries' is not a JSON object")
            if not isinstance(loaded.get("logins", {}), dict):
                raise ValueError("'logins' is not a JSON object")
            db = loaded
        except Exception as e:
            # Best-effort cache: an unreadable or corrupt file means an empty one.
            logger.error(f"session_cache: load failed ({e}); starting fresh")
    _DB = db
    return _DB
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _save(db: dict) -> None:
    """Atomically write ``db`` to ``CACHE_PATH`` (temp file + ``os.replace``).

    The temp file is created with mode 0o600 because the payload holds session
    and login cookies, and it is removed if serialisation or the rename fails,
    so failed writes do not leave ``*.tmp.<pid>.<tid>`` files behind.

    Raises:
        OSError: If the state directory or file cannot be written.
        TypeError: If ``db`` contains values JSON cannot encode.
    """
    os.makedirs(_STATE_DIR, exist_ok=True)
    # pid + thread id keep concurrent writers from sharing a temp file.
    tmp = f"{CACHE_PATH}.tmp.{os.getpid()}.{threading.get_ident()}"
    try:
        fd = os.open(tmp, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(db, f, indent=2, sort_keys=True)
        os.replace(tmp, CACHE_PATH)
    except BaseException:
        try:
            os.remove(tmp)
        except OSError:
            pass  # never created, or already renamed
        raise
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _key(url: str) -> str:
    """Cache key for a URL: its host and the current egress scope, tab-separated."""
    host = host_of(url)
    scope = egress.scope_label()
    return f"{host}\t{scope}"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _cache_cookies(url: str) -> dict:
    """Return a copy of the unexpired cached cf cookies for this host+scope.

    Yields ``{}`` when the cache is disabled, nothing is stored, or the entry
    is older than ``_TTL_S`` — in which case the entry is also removed and the
    removal persisted.
    """
    if not _enabled():
        return {}
    with _LOCK:
        store = _load()
        slot = _key(url)
        entry = store["entries"].get(slot)
        if not entry:
            return {}
        age = time.time() - entry.get("ts", 0)
        expired = age > _TTL_S
        if expired:
            store["entries"].pop(slot, None)
            _save(store)
            return {}
        return dict(entry.get("cookies", {}))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def remember(url: str, cookies: dict, ua: str = "") -> None:
    """Store the cf cookies from a successful solve under this host+scope.

    Only cookies accepted by ``_is_cf_cookie`` are kept. Nothing is written
    when the cache is disabled or no such cookie is present. The entry
    records the cookies, the given ``ua`` and the current time.
    """
    if not _enabled():
        return
    cf_only = {}
    for name, value in (cookies or {}).items():
        if _is_cf_cookie(name):
            cf_only[name] = value
    if not cf_only:
        return
    with _LOCK:
        store = _load()
        entry = {"cookies": cf_only, "ua": ua, "ts": time.time()}
        store["entries"][_key(url)] = entry
        _save(store)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def forget(url: str) -> None:
    """Remove the cached entry for this host+scope, if there is one.

    Intended for a re-detected wall, where the cookie is stale or
    IP-mismatched. The file is rewritten only when an entry was actually
    removed; no-op when the cache is disabled.
    """
    if not _enabled():
        return
    with _LOCK:
        store = _load()
        removed = store["entries"].pop(_key(url), None)
        if removed is not None:
            _save(store)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ── auth cookie import (SCRAPER_COOKIES_FILE) ────────────────────────────────
|
|
127
|
+
|
|
128
|
+
_AUTH_LOCK = threading.Lock()
|
|
129
|
+
_AUTH_JAR: MozillaCookieJar | None = None
|
|
130
|
+
_AUTH_LOADED = False
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _auth_jar() -> MozillaCookieJar | None:
    """Load the SCRAPER_COOKIES_FILE jar once and return it (or ``None``).

    The load is attempted a single time per process, guarded by ``_AUTH_LOCK``
    with a second flag check inside the lock. An unset variable, a missing
    file or a failed parse all leave the jar as ``None``.
    """
    global _AUTH_JAR, _AUTH_LOADED
    if not _AUTH_LOADED:
        with _AUTH_LOCK:
            if not _AUTH_LOADED:
                cookie_file = os.getenv("SCRAPER_COOKIES_FILE")
                if cookie_file and os.path.exists(cookie_file):
                    loaded = MozillaCookieJar()
                    try:
                        # Session-only and expired cookies are loaded as well.
                        loaded.load(cookie_file, ignore_discard=True, ignore_expires=True)
                        _AUTH_JAR = loaded
                        logger.info(f"session_cache: loaded auth cookies from {cookie_file}")
                    except Exception as e:
                        logger.error(f"session_cache: cookie import failed ({e})")
                _AUTH_LOADED = True
    return _AUTH_JAR
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _host_matches(cookie_domain: str, host: str) -> bool:
|
|
154
|
+
d = cookie_domain.lstrip(".")
|
|
155
|
+
return host == d or host.endswith("." + d)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _auth_cookies(url: str) -> dict:
    """Imported cookies.txt cookies whose domain matches this URL's host.

    Cookies flagged Secure are withheld when the URL is explicitly
    ``http://``, so exported login cookies are not sent in cleartext. URLs
    with any other scheme, or none, are treated as before.

    Returns:
        ``{name: value}``; empty when no jar is loaded or nothing matches.
    """
    jar = _auth_jar()
    if not jar:
        return {}
    host = host_of(url)
    plain_http = urlsplit(url).scheme.lower() == "http"
    return {
        c.name: c.value
        for c in jar
        if _host_matches(c.domain, host) and not (plain_http and c.secure)
    }
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# ── logged-in session refresh (SCRAPER_LOGIN_HOOK) ───────────────────────────
|
|
167
|
+
#
|
|
168
|
+
# A cookies.txt export goes stale. Configure SCRAPER_LOGIN_HOOK="pkg.module:func"
|
|
169
|
+
# — a callable func(host) -> {cookie_name: value}. When an authed host trips a
|
|
170
|
+
# login / bot wall, the engine calls the hook once, persists the returned cookies
|
|
171
|
+
# per host (in the session cache under "logins"), and overlays them on the
|
|
172
|
+
# cookies.txt jar for every later request and run. The hook owns the site-specific
|
|
173
|
+
# mechanics (drive a browser, hit an auth API, read a secret); the engine stays
|
|
174
|
+
# generic, which is what lets it cover many/varied logged-in sites.
|
|
175
|
+
|
|
176
|
+
_LOGIN_LOCK = threading.Lock()
|
|
177
|
+
_LOGIN_HOOK = None
|
|
178
|
+
_LOGIN_HOOK_LOADED = False
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _login_hook():
    """Resolve SCRAPER_LOGIN_HOOK (``"pkg.module:func"``) once and return it.

    Resolution is attempted a single time per process under ``_LOGIN_LOCK``.
    A spec that is not of the ``module:func`` form, fails to import, or names
    a non-callable attribute is logged and leaves the hook as ``None``. A
    malformed spec was previously ignored without any log line.

    Returns:
        The hook callable, or ``None`` when none is configured or usable.
    """
    global _LOGIN_HOOK, _LOGIN_HOOK_LOADED
    if _LOGIN_HOOK_LOADED:
        return _LOGIN_HOOK
    with _LOGIN_LOCK:
        if not _LOGIN_HOOK_LOADED:
            spec = os.getenv("SCRAPER_LOGIN_HOOK", "")
            mod, sep, fn = spec.partition(":")
            if spec and not (sep and mod and fn):
                logger.error(
                    f"session_cache: login hook {spec!r} ignored; expected 'pkg.module:func'")
            elif spec:
                try:
                    hook = getattr(importlib.import_module(mod), fn)
                    if not callable(hook):
                        raise TypeError(f"{fn!r} is not callable")
                    _LOGIN_HOOK = hook
                    logger.info(f"session_cache: login hook loaded ({spec})")
                except Exception as e:
                    logger.error(f"session_cache: login hook {spec!r} load failed: {e}")
            _LOGIN_HOOK_LOADED = True
    return _LOGIN_HOOK
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def has_login_hook() -> bool:
    """True when a login hook was configured and loaded successfully."""
    hook = _login_hook()
    return hook is not None
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _login_cookies(url: str) -> dict:
    """Copy of the refreshed login cookies stored for this URL's host.

    Looked up by host alone under the DB's ``logins`` key (no egress scope);
    ``{}`` when nothing is stored.
    """
    with _LOCK:
        logins = _load().get("logins", {})
        entry = logins.get(host_of(url))
        if not entry:
            return {}
        return dict(entry.get("cookies", {}))
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def is_authed_host(url: str) -> bool:
    """True when a logged-in credential is held for this host.

    That is either a matching cookie from the imported cookies.txt or
    previously refreshed login cookies. Lets the policy tell a dead session
    (worth re-logging-in for) from a plain bot wall.
    """
    if _auth_cookies(url):
        return True
    return bool(_login_cookies(url))
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def refresh_login(url: str) -> bool:
    """Call the configured login hook for this host and persist its cookies.

    The hook is called as ``hook(host)`` and should return a mapping of
    ``{cookie_name: value}``. Its result is converted to a dict inside the
    guarded section, so a hook that raises or returns something that is not a
    mapping is logged and reported as a failure instead of crashing the caller.

    Returns:
        True if a non-empty cookie set was obtained and stored under the DB's
        ``logins`` key; False without a hook, on hook failure, or when the
        hook returned nothing.
    """
    hook = _login_hook()
    if not hook:
        return False
    host = host_of(url)
    try:
        cookies = dict(hook(host) or {})
    except Exception as e:
        logger.error(f"session_cache: login hook failed for {host}: {e}")
        return False
    if not cookies:
        logger.warning(f"session_cache: login hook returned no cookies for {host}")
        return False
    with _LOCK:
        db = _load()
        db.setdefault("logins", {})[host] = {"cookies": cookies, "ts": time.time()}
        _save(db)
    logger.info(f"session_cache: refreshed login for {host} ({len(cookies)} cookies)")
    return True
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# ── public API used by the tiers ─────────────────────────────────────────────
|
|
242
|
+
|
|
243
|
+
def cookies_for(url: str, *, include_cache: bool) -> dict:
    """Merge the cookie sources for this URL into one ``{name: value}`` dict.

    Layers are applied in order, so a later one wins on a name clash:
    imported auth cookies, then refreshed login cookies, then (only when
    ``include_cache`` is true) the cached cf cookies.
    """
    merged = _auth_cookies(url)
    merged.update(_login_cookies(url))
    if not include_cache:
        return merged
    merged.update(_cache_cookies(url))
    return merged
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def cookie_header(url: str, *, include_cache: bool) -> str | None:
    """Render ``cookies_for`` as a ``Cookie:`` header value.

    Meant for clients such as curl_cffi that take cookies as a header.
    Returns ``None`` when there are no cookies to send.
    """
    jar = cookies_for(url, include_cache=include_cache)
    if not jar:
        return None
    pairs = [f"{name}={value}" for name, value in jar.items()]
    return "; ".join(pairs)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def browser_cookies(url: str | None = None) -> list[dict]:
    """Auth and refreshed-login cookies as Playwright ``add_cookies`` records.

    Every cookie in the imported jar is emitted with its own domain and path
    (``/`` when the path is empty). When ``url`` is given, that host's
    refreshed login cookies follow, scoped to the host and path ``/``. Cached
    cf cookies are not included.
    """
    records: list[dict] = []
    jar = _auth_jar()
    if jar:
        for c in jar:
            records.append({"name": c.name, "value": c.value,
                            "domain": c.domain, "path": c.path or "/"})
    if url:
        host = host_of(url)
        for name, value in _login_cookies(url).items():
            records.append({"name": name, "value": value, "domain": host, "path": "/"})
    return records
|