switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,236 @@
1
+ """Metrics rollups from the engine's own state — no external store needed.
2
+
3
+ Reads the two files the policy already writes (see switchback.policy.botwall):
4
+
5
+ state/botwall_events.jsonl — one row per *tier attempt*:
6
+ {ts, url, host, tier, outcome, md_len, latency_ms, error, status_code, challenge}
7
+ state/botwall_db.json — per-host record incl. challenge_counts, winning_tier
8
+
9
+ and produces the metrics the firecrawl-replacement case is argued on:
10
+
11
+ • cost savings vs Firecrawl (free-tier wins are money not spent; hard pages
12
+ Firecrawl bills more credits for are weighted by HARD_MULT)
13
+ • latency — overall and per tier and per domain (mean/median/min/max/p50/p95)
14
+ • coverage (unique-URL success rate)
15
+ • error codes by domain
16
+ • challenges / bot-walls by domain
17
+
18
+ Pure functions returning JSON-serialisable dicts, so the same rollup backs the
19
+ CLI (scrape_stats.py), the HTTP API (/metrics), and the periodic digest
20
+ (switchback.flags). One tier attempt ≈ sequential wall-clock, so a URL's total
21
+ latency is approximated by summing its attempts' latencies.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import os
27
+ from collections import defaultdict
28
+ from datetime import datetime, timezone
29
+ from statistics import mean, median
30
+
31
+ from .policy import botwall
32
+
33
# Cost model — estimates only; override with your real Firecrawl rates.
#   FIRECRAWL_USD  dollars per basic Firecrawl scrape, read from
#                  BENCH_FIRECRAWL_USD (matches benchmark.py's default).
#   HARD_MULT      credit multiplier Firecrawl charges for stealth/JS-rendered
#                  pages, read from BENCH_FIRECRAWL_HARD_MULT. Those are the
#                  hard pages our browser/residential tiers resolve for free;
#                  Firecrawl's stealth proxy is ~5× basic, hence the default 5.
FIRECRAWL_USD = float(os.environ.get("BENCH_FIRECRAWL_USD", "0.001"))
HARD_MULT = float(os.environ.get("BENCH_FIRECRAWL_HARD_MULT", "5"))

# A win on any of these tiers means Firecrawl would have billed the hard
# (stealth) rate for the same page.
_HARD_TIERS = {
    "tier3_browser",
    "tier3b_camoufox",
    "tier_residential",
    "tier4_firecrawl",
}
43
+
44
+
45
+ def _parse_ts(ts: str) -> datetime | None:
46
+ try:
47
+ dt = datetime.fromisoformat(ts)
48
+ return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
49
+ except (ValueError, TypeError):
50
+ return None
51
+
52
+
53
+ def _percentile(values: list[int], p: float) -> int:
54
+ if not values:
55
+ return 0
56
+ s = sorted(values)
57
+ idx = max(0, min(len(s) - 1, int(len(s) * p / 100)))
58
+ return s[idx]
59
+
60
+
61
+ def _stats(values: list[int]) -> dict:
62
+ """min/max/mean/median/p50/p95 over a list of latencies (ms)."""
63
+ if not values:
64
+ return {"count": 0, "min": 0, "max": 0, "mean": 0, "median": 0, "p50": 0, "p95": 0}
65
+ return {
66
+ "count": len(values),
67
+ "min": min(values),
68
+ "max": max(values),
69
+ "mean": round(mean(values)),
70
+ "median": round(median(values)),
71
+ "p50": _percentile(values, 50),
72
+ "p95": _percentile(values, 95),
73
+ }
74
+
75
+
76
def load_events(path: str | None = None, since: datetime | None = None) -> list[dict]:
    """Read botwall_events.jsonl, optionally keeping only rows at/after ``since``.

    Args:
        path: JSONL file to read; defaults to ``botwall.EVENTS_PATH``.
        since: Lower bound on a row's ``ts``. A naive value is treated as UTC,
            the same convention ``_parse_ts`` applies to naive timestamps.

    Returns:
        The event dicts in file order. A missing file yields ``[]``. Blank
        lines, lines that are not valid JSON, and JSON values that are not
        objects are skipped; with ``since`` set, rows whose ``ts`` is missing
        or unparseable are skipped too.
    """
    path = path or botwall.EVENTS_PATH
    if not os.path.exists(path):
        return []
    if since is not None and since.tzinfo is None:
        # Parsed timestamps are always aware; comparing them with a naive
        # bound would raise TypeError.
        since = since.replace(tzinfo=timezone.utc)
    out: list[dict] = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                e = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(e, dict):
                # Valid JSON but not an event object (e.g. a list or number);
                # every consumer calls .get() on rows, so drop it here.
                continue
            if since is not None:
                dt = _parse_ts(e.get("ts", ""))
                if dt is None or dt < since:
                    continue
            out.append(e)
    return out
97
+
98
+
99
+ def _by_url(events: list[dict]) -> dict[str, list[dict]]:
100
+ out: dict[str, list[dict]] = defaultdict(list)
101
+ for e in events:
102
+ out[e.get("url") or "?"].append(e)
103
+ return out
104
+
105
+
106
+ def _url_total_ms(attempts: list[dict]) -> int:
107
+ """Per-URL wall-clock proxy: tiers run sequentially, so sum the attempts."""
108
+ return sum(a["latency_ms"] for a in attempts if isinstance(a.get("latency_ms"), int))
109
+
110
+
111
+ def _coverage(by_url: dict[str, list[dict]]) -> dict:
112
+ succeeded = {u: a for u, a in by_url.items()
113
+ if any(e.get("outcome") == "ok" for e in a)}
114
+ total = len(by_url)
115
+ return {
116
+ "unique_urls": total,
117
+ "succeeded": len(succeeded),
118
+ "failed": total - len(succeeded),
119
+ "success_pct": round(100 * len(succeeded) / total, 1) if total else 0.0,
120
+ }
121
+
122
+
123
+ def _per_tier(events: list[dict]) -> dict:
124
+ tiers: dict[str, list[dict]] = defaultdict(list)
125
+ for e in events:
126
+ tiers[e.get("tier") or "?"].append(e)
127
+ out = {}
128
+ for tier, evs in tiers.items():
129
+ ok = [e for e in evs if e.get("outcome") == "ok"]
130
+ lats = [e["latency_ms"] for e in ok if isinstance(e.get("latency_ms"), int)]
131
+ out[tier] = {
132
+ "attempts": len(evs),
133
+ "ok": len(ok),
134
+ "miss": len(evs) - len(ok),
135
+ "ok_pct": round(100 * len(ok) / len(evs), 1) if evs else 0.0,
136
+ "latency_ms": _stats(lats),
137
+ }
138
+ return out
139
+
140
+
141
+ def _outcomes(events: list[dict]) -> dict:
142
+ counts: dict[str, int] = defaultdict(int)
143
+ for e in events:
144
+ counts[e.get("outcome") or "?"] += 1
145
+ return dict(sorted(counts.items(), key=lambda kv: -kv[1]))
146
+
147
+
148
def _cost(by_url: dict[str, list[dict]]) -> dict:
    """Engine spend vs the Firecrawl-everything baseline curiouscats pays today.

    Per unique URL:
      * engine cost — FIRECRAWL_USD for each attempt whose outcome is
        "firecrawl_used" (an actual Firecrawl invocation).
      * baseline — what Firecrawl would have charged for that URL: HARD_MULT×
        the basic rate when any attempt hit a hard tier or recorded a
        challenge (Firecrawl bills stealth/JS pages more), otherwise 1×.

    Savings are baseline − engine. Failed URLs still add to the baseline,
    since Firecrawl would have been billed for the attempt too.
    """
    engine_usd = 0.0
    baseline_usd = 0.0
    invocations = 0
    for attempts in by_url.values():
        fc_hits = len([a for a in attempts
                       if a.get("outcome") == "firecrawl_used"])
        invocations += fc_hits
        engine_usd += fc_hits * FIRECRAWL_USD

        needed_hard = False
        for attempt in attempts:
            if attempt.get("tier") in _HARD_TIERS or attempt.get("challenge"):
                needed_hard = True
                break
        baseline_usd += FIRECRAWL_USD * (HARD_MULT if needed_hard else 1)

    saved = baseline_usd - engine_usd
    return {
        "firecrawl_usd_per_scrape": FIRECRAWL_USD,
        "hard_multiplier": HARD_MULT,
        "firecrawl_invocations": invocations,
        "engine_cost_usd": round(engine_usd, 4),
        "firecrawl_baseline_usd": round(baseline_usd, 4),
        "savings_usd": round(saved, 4),
        "savings_pct": round(100 * saved / baseline_usd, 1) if baseline_usd else 0.0,
    }
176
+
177
+
178
+ def _domains(events: list[dict], by_url: dict[str, list[dict]], db: dict) -> dict:
179
+ """Per-host (== per-subdomain) rollup: attempts, error codes, challenges,
180
+ latency. Challenges come from the durable per-host counts in botwall_db.json
181
+ so they reflect all history, not just the events window."""
182
+ host_attempts: dict[str, list[dict]] = defaultdict(list)
183
+ for e in events:
184
+ host_attempts[e.get("host") or "?"].append(e)
185
+
186
+ # Per-URL total latency grouped by host (host taken from the URL's events).
187
+ host_url_ms: dict[str, list[int]] = defaultdict(list)
188
+ for attempts in by_url.values():
189
+ host = next((a.get("host") for a in attempts if a.get("host")), "?")
190
+ host_url_ms[host].append(_url_total_ms(attempts))
191
+
192
+ hosts_db = db.get("hosts", {})
193
+ out = {}
194
+ for host, evs in host_attempts.items():
195
+ error_codes: dict[str, int] = defaultdict(int)
196
+ for e in evs:
197
+ sc = e.get("status_code")
198
+ if sc:
199
+ error_codes[str(sc)] += 1
200
+ rec = hosts_db.get(host, {})
201
+ out[host] = {
202
+ "attempts": len(evs),
203
+ "ok": sum(1 for e in evs if e.get("outcome") == "ok"),
204
+ "winning_tier": rec.get("winning_tier"),
205
+ "needs_egress": bool(rec.get("needs_egress")),
206
+ "error_codes": dict(error_codes),
207
+ "challenges": dict(rec.get("challenge_counts", {})),
208
+ "latency_ms": _stats(host_url_ms.get(host, [])),
209
+ }
210
+ return dict(sorted(out.items(), key=lambda kv: -kv[1]["attempts"]))
211
+
212
+
213
def build_report(events: list[dict] | None = None, db: dict | None = None,
                 since: datetime | None = None) -> dict:
    """Assemble the full metrics rollup as a JSON-serialisable dict.

    Args:
        events: Tier-attempt rows; loaded via ``load_events(since=since)``
            when omitted.
        db: Botwall host database; loaded via ``botwall.load_db()`` when
            omitted.
        since: Only used when ``events`` is omitted, as the load window.
    """
    events = load_events(since=since) if events is None else events
    db = botwall.load_db() if db is None else db

    by_url = _by_url(events)
    url_totals = list(map(_url_total_ms, by_url.values()))
    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "events": len(events),
        "coverage": _coverage(by_url),
        "cost": _cost(by_url),
        "latency_overall_ms": _stats(url_totals),
        "latency_per_tier": _per_tier(events),
        "outcomes": _outcomes(events),
        "domains": _domains(events, by_url, db),
    }
232
+
233
+
234
def domain_report(since: datetime | None = None) -> dict:
    """Return only the per-domain table (error codes, challenges, latency per host).

    Builds the whole report from the state files and keeps its "domains"
    section.
    """
    report = build_report(since=since)
    return report["domains"]
switchback/search.py ADDED
@@ -0,0 +1,39 @@
1
+ """Web search via a local SearXNG instance (query → ranked URLs).
2
+
3
+ Distinct from the scrape cascade: scrape fetches one *known* URL; search
4
+ *discovers* URLs from a query. Returns lightweight results a caller can then
5
+ feed into scrape(). Points at the SearXNG container on localhost:8888 (override
6
+ with SEARXNG_URL); requires its JSON format to be enabled.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from dataclasses import dataclass
12
+
13
+ from .tracing import span
14
+
15
# Base URL of the SearXNG instance to query; override via the SEARXNG_URL env var.
SEARXNG_URL = os.environ.get("SEARXNG_URL", "http://localhost:8888")
16
+
17
+
18
@dataclass
class SearchResult:
    """One search hit, reduced to the fields a caller needs to pick URLs."""

    title: str    # the hit's "title" field
    url: str      # the hit's "url" field
    snippet: str  # filled from the hit's "content" field
    engine: str   # the hit's "engine" field
24
+
25
+
26
def search(query: str, n: int = 10) -> list[SearchResult]:
    """Query the local SearXNG and return up to n results.

    Args:
        query: Search text, sent as the ``q`` parameter.
        n: Maximum number of results to return; zero or a negative value
            yields an empty list.

    Returns:
        Results in the order SearXNG returned them. Fields missing from a hit
        default to "".

    Raises:
        requests.HTTPError: SearXNG answered with an error status.
        requests.RequestException: The request failed or exceeded the 15 s
            timeout.
    """
    import requests  # imported at call time, as in the original

    # A negative n would make the slice below drop results from the end
    # rather than return none, so clamp it.
    limit = max(n, 0)
    with span("search", **{"search.query": query, "search.n": n}) as sp:
        r = requests.get(f"{SEARXNG_URL}/search",
                         params={"q": query, "format": "json"}, timeout=15)
        r.raise_for_status()
        # "results" may be absent or null; treat both as "no hits".
        raw = (r.json().get("results") or [])[:limit]
        results = [SearchResult(title=x.get("title", ""), url=x.get("url", ""),
                                snippet=x.get("content", ""),
                                engine=x.get("engine", ""))
                   for x in raw]
        sp.set("search.n_results", len(results))
        return results
switchback/server.py ADDED
@@ -0,0 +1,114 @@
1
+ """HTTP service — the language-agnostic way to use the engine.
2
+
3
+ Wraps the same public functions as the library/CLI (`switchback.scrape`,
4
+ `switchback.search`) so any app, in any language, can hit one warm process (which
5
+ keeps the Tier-3 browser pool hot) instead of cold-starting a subprocess.
6
+
7
+ pip install -e .[server] # fastapi + uvicorn
8
+ switchback-server # or: python -m switchback.server
9
+ curl localhost:8799/healthz
10
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"]}'
11
+ curl 'localhost:8799/search?q=web+scraping'
12
+
13
+ Local-first by design: no auth, no rate-limiting. Put it behind your own gateway
14
+ if you expose it. Host/port via SCRAPER_HOST / SCRAPER_PORT.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import os
19
+
20
+ from datetime import datetime, timedelta, timezone
21
+
22
+ from fastapi import FastAPI, HTTPException
23
+ from fastapi.responses import FileResponse
24
+ from pydantic import BaseModel
25
+
26
+ from . import session_trace
27
+ from .api import scrape
28
+ from .reporting import build_report, domain_report
29
+ from .search import search
30
+ from .tracing import setup_logs
31
+
32
# The ASGI application; every route below registers on it and main() serves it.
app = FastAPI(version="0.1.0", title="switchback")
33
+
34
+
35
# JSON body accepted by POST /scrape. Kept as a comment rather than a docstring
# so the generated API schema is unchanged.
class ScrapeRequest(BaseModel):
    # URLs handed to scrape() in one call.
    urls: list[str]
37
+
38
+
39
@app.get("/healthz")
def healthz() -> dict:
    # Liveness probe: a constant payload that touches no engine state.
    payload = {"status": "ok"}
    return payload
42
+
43
+
44
@app.post("/scrape")
def scrape_endpoint(req: ScrapeRequest) -> list[dict]:
    """Run URLs through the cascade. Returns successes only (failed URLs omitted)."""
    # Flatten each result object into a plain dict for the JSON response.
    payload = []
    for result in scrape(req.urls):
        payload.append({
            "url": result.url,
            "source_method": result.source_method,
            "markdown": result.markdown,
        })
    return payload
49
+
50
+
51
@app.get("/search")
def search_endpoint(q: str) -> list[dict]:
    """Query → ranked URLs (SearXNG). Feed results back into /scrape."""
    # search() is called with its default result limit.
    payload = []
    for hit in search(q):
        payload.append({
            "title": hit.title,
            "url": hit.url,
            "snippet": hit.snippet,
            "engine": hit.engine,
        })
    return payload
56
+
57
+
58
+ def _since(minutes: int | None) -> datetime | None:
59
+ return datetime.now(timezone.utc) - timedelta(minutes=minutes) if minutes else None
60
+
61
+
62
@app.get("/metrics")
def metrics_endpoint(minutes: int | None = None) -> dict:
    """Metrics rollup from the engine's own state: cost savings vs Firecrawl,
    coverage, overall + per-tier latency, outcomes, and per-domain detail.
    Pass ?minutes=N to window the event-derived sections."""
    # No ?minutes (or 0) means no cutoff: the report covers every stored event.
    window_start = _since(minutes)
    return build_report(since=window_start)
68
+
69
+
70
@app.get("/metrics/domains")
def metrics_domains_endpoint(minutes: int | None = None) -> dict:
    """Per-domain table: error codes, challenges/bot-walls, and latency by host."""
    # Same ?minutes windowing as /metrics.
    window_start = _since(minutes)
    return domain_report(since=window_start)
74
+
75
+
76
@app.get("/traces")
def list_traces_endpoint() -> list[dict]:
    """Captured Playwright session traces (opt-in via SCRAPER_TRACE_SESSION)."""
    # Pure pass-through: the listing comes straight from session_trace.
    traces = session_trace.list_traces()
    return traces
80
+
81
+
82
@app.get("/traces/{trace_id}")
def get_trace_endpoint(trace_id: str):
    """Download one trace zip (open with `playwright show-trace <zip>`)."""
    path = session_trace.path_for(trace_id)
    if path:
        # Serve the zip as a download named after the trace id.
        return FileResponse(path, media_type="application/zip",
                            filename=f"{trace_id}.zip")
    # A falsy path from session_trace means the id is unknown.
    raise HTTPException(status_code=404, detail="trace not found")
90
+
91
+
92
@app.delete("/traces/{trace_id}")
def delete_trace_endpoint(trace_id: str) -> dict:
    """Delete a trace zip."""
    removed = session_trace.delete(trace_id)
    if removed:
        return {"deleted": trace_id}
    # A falsy result from session_trace.delete is reported as "no such trace".
    raise HTTPException(status_code=404, detail="trace not found")
98
+
99
+
100
def main() -> None:
    """Console entry point: configure logging, then serve ``app`` with uvicorn.

    The bind address comes from SCRAPER_HOST (default 0.0.0.0) and the port
    from SCRAPER_PORT (default 8799).
    """
    import logging
    import uvicorn

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    setup_logs()  # ship logs to the OTLP backend too when configured

    # NOTE(review): the default host listens on every interface, and the module
    # docstring says the service has no auth. Set SCRAPER_HOST to narrow it.
    bind_host = os.getenv("SCRAPER_HOST", "0.0.0.0")
    bind_port = int(os.getenv("SCRAPER_PORT", "8799"))
    uvicorn.run(app, host=bind_host, port=bind_port)


if __name__ == "__main__":
    main()
@@ -0,0 +1,274 @@
1
+ """Cookie provisioning for the HTTP/browser tiers — two sources, merged per hit.
2
+
3
+ 1. Session cache (cf_clearance reuse). After Tier 2 solves a Cloudflare
4
+ challenge its ``cf_*`` cookies are cached per ``(host, egress_scope)`` and
5
+ replayed on later hits so the ~5s solve is skipped. cf_clearance is bound to
6
+ the UA that solved it and the egress IP — hence the scope key and the stored
7
+ UA. Entries expire after ``SCRAPER_SESSION_TTL_S`` (cf_clearance's typical
8
+ lifetime); a re-detected wall calls ``forget()`` so a stale cookie self-heals
9
+ into a fresh solve. Disable with ``SCRAPER_DISABLE_SESSION_CACHE=1``.
10
+
11
+ 2. Auth import (``SCRAPER_COOKIES_FILE``). A Netscape ``cookies.txt`` the user
12
+ exports from a logged-in browser; domain-matching cookies are sent so the
13
+ tiers can fetch pages behind a login. Opt-in; unset/absent → no-op.
14
+
15
+ The cf cache layers on top of auth cookies for the same request. Reads are in
16
+ memory; writes are write-through to ``state/session_cache.json`` (atomic).
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import importlib
21
+ import json
22
+ import logging
23
+ import os
24
+ import threading
25
+ import time
26
+ from http.cookiejar import MozillaCookieJar
27
+ from urllib.parse import urlsplit
28
+
29
+ from . import egress
30
+ from .policy.gates import host_of
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
# Default state directory: "state" next to the package directory, i.e. two
# levels up from this file.
_DEFAULT_STATE_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "state",
)
# SCRAPER_STATE_DIR overrides where state files live.
_STATE_DIR = os.environ.get("SCRAPER_STATE_DIR", _DEFAULT_STATE_DIR)
CACHE_PATH = os.path.join(_STATE_DIR, "session_cache.json")

# Seconds a cached cf cookie set stays valid (SCRAPER_SESSION_TTL_S).
_TTL_S = float(os.environ.get("SCRAPER_SESSION_TTL_S", "1800"))
40
+
41
+ # cf_clearance is the durable one; __cf_bm / cf_chl_* are the supporting set.
42
+ def _is_cf_cookie(name: str) -> bool:
43
+ return name == "cf_clearance" or name.startswith("__cf") or name.startswith("cf_chl")
44
+
45
+
46
+ def _enabled() -> bool:
47
+ return os.getenv("SCRAPER_DISABLE_SESSION_CACHE") not in ("1", "true", "True")
48
+
49
+
50
+ # ── cf_clearance session cache ────────────────────────────────────────────────
51
+
52
# Held around every _load()/_save() pair in this module, so reads and
# read-modify-writes of the cache are serialised.
_LOCK = threading.Lock()
# In-memory copy of the cache; None until _load() populates it.
_DB: dict | None = None  # {"version": 1, "entries": {"<host>\t<scope>": {...}}}
54
+
55
+
56
def _load() -> dict:
    """Return the in-memory cache, reading CACHE_PATH on first use.

    The file is read at most once per process; later calls return the same
    dict. An unreadable, unparseable or wrongly shaped file is logged and
    replaced by an empty cache. The callers in this module hold ``_LOCK``
    while calling.

    Returns:
        A dict that always has an ``"entries"`` dict, and a ``"logins"`` dict
        whenever that key is present.
    """
    global _DB
    if _DB is not None:
        return _DB
    db: dict = {"version": 1, "entries": {}}
    if os.path.exists(CACHE_PATH):
        try:
            with open(CACHE_PATH, encoding="utf-8") as f:
                loaded = json.load(f)
            # Validate before adopting the value: a non-object file or a
            # non-dict section must not become the live cache, because every
            # caller indexes db["entries"] directly.
            if not isinstance(loaded, dict):
                raise ValueError("top-level JSON value is not an object")
            if not isinstance(loaded.setdefault("entries", {}), dict):
                raise ValueError('"entries" is not an object')
            if not isinstance(loaded.get("logins", {}), dict):
                raise ValueError('"logins" is not an object')
            db = loaded
        except Exception as e:
            # Best effort: the cache is disposable, so fall back to empty.
            logger.error(f"session_cache: load failed ({e}); starting fresh")
    _DB = db
    return _DB
70
+
71
+
72
def _save(db: dict) -> None:
    """Write ``db`` to CACHE_PATH atomically (temp file, then ``os.replace``).

    The temp name includes the pid and thread id so concurrent writers never
    share a file. If serialising or replacing fails, the temp file is removed
    and the error is re-raised; CACHE_PATH is left as it was.

    Raises:
        OSError: The state directory or file could not be written.
        TypeError: ``db`` holds a value JSON cannot serialise.
    """
    os.makedirs(_STATE_DIR, exist_ok=True)
    tmp = f"{CACHE_PATH}.tmp.{os.getpid()}.{threading.get_ident()}"
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(db, f, indent=2, sort_keys=True)
        os.replace(tmp, CACHE_PATH)
    except BaseException:
        # Don't leave a half-written temp file behind in the state directory.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
78
+
79
+
80
def _key(url: str) -> str:
    """Cache key for a URL: its host and the current egress scope, tab-separated."""
    host = host_of(url)
    scope = egress.scope_label()
    return f"{host}\t{scope}"
82
+
83
+
84
def _cache_cookies(url: str) -> dict:
    """Return a copy of the unexpired cf cookies cached for this host+scope.

    Returns {} when the cache is disabled, nothing is stored, or the entry is
    older than _TTL_S. An expired entry is removed and the removal is written
    to disk.
    """
    if not _enabled():
        return {}
    with _LOCK:
        db = _load()
        key = _key(url)
        entries = db["entries"]
        entry = entries.get(key)
        if not entry:
            return {}
        age = time.time() - entry.get("ts", 0)
        if age > _TTL_S:
            # Lazy eviction: drop the stale entry and persist the removal.
            entries.pop(key, None)
            _save(db)
            return {}
        return dict(entry.get("cookies", {}))
100
+
101
+
102
def remember(url: str, cookies: dict, ua: str = "") -> None:
    """Store the cf cookies from a successful solve under this host+scope.

    Only names accepted by ``_is_cf_cookie`` are kept; nothing is written when
    the cache is disabled or no such cookie is present. The entry records the
    solving UA and the current time, and is written through to disk.
    """
    if not _enabled():
        return
    cf_only = {}
    for name, value in (cookies or {}).items():
        if _is_cf_cookie(name):
            cf_only[name] = value
    if cf_only:
        with _LOCK:
            db = _load()
            db["entries"][_key(url)] = {"cookies": cf_only, "ua": ua, "ts": time.time()}
            _save(db)
113
+
114
+
115
def forget(url: str) -> None:
    """Remove the cached entry for this host+scope, if there is one.

    Called when a wall is detected again, which means the cookie is stale or
    tied to a different IP. The file is rewritten only when an entry was
    actually removed; a disabled cache makes this a no-op.
    """
    if not _enabled():
        return
    with _LOCK:
        db = _load()
        removed = db["entries"].pop(_key(url), None)
        if removed is not None:
            _save(db)
124
+
125
+
126
+ # ── auth cookie import (SCRAPER_COOKIES_FILE) ────────────────────────────────
127
+
128
# Serialises the one-time load performed by _auth_jar().
_AUTH_LOCK = threading.Lock()
# The imported cookie jar; stays None when no file is configured or the load failed.
_AUTH_JAR: MozillaCookieJar | None = None
# Set once _auth_jar() has made its single load attempt, whether or not it succeeded.
_AUTH_LOADED = False
131
+
132
+
133
def _auth_jar() -> MozillaCookieJar | None:
    """Return the jar imported from SCRAPER_COOKIES_FILE, loading it on first call.

    The load is attempted once per process. The result is None when the env
    var is unset, the file does not exist, or parsing failed (failure is
    logged). Discard-flagged and expired cookies are loaded too.
    """
    global _AUTH_JAR, _AUTH_LOADED
    if not _AUTH_LOADED:
        with _AUTH_LOCK:
            # Re-check under the lock: another thread may have finished the
            # load while this one was waiting.
            if not _AUTH_LOADED:
                cookie_file = os.getenv("SCRAPER_COOKIES_FILE")
                if cookie_file and os.path.exists(cookie_file):
                    candidate = MozillaCookieJar()
                    try:
                        candidate.load(cookie_file, ignore_discard=True, ignore_expires=True)
                        _AUTH_JAR = candidate
                        logger.info(f"session_cache: loaded auth cookies from {cookie_file}")
                    except Exception as e:
                        logger.error(f"session_cache: cookie import failed ({e})")
                _AUTH_LOADED = True
    return _AUTH_JAR
151
+
152
+
153
+ def _host_matches(cookie_domain: str, host: str) -> bool:
154
+ d = cookie_domain.lstrip(".")
155
+ return host == d or host.endswith("." + d)
156
+
157
+
158
def _auth_cookies(url: str) -> dict:
    """Imported cookies whose domain matches this URL's host, as {name: value}.

    Returns {} when no jar is loaded (or the jar holds no cookies).
    """
    jar = _auth_jar()
    if not jar:
        return {}
    host = host_of(url)
    matched = {}
    for cookie in jar:
        if _host_matches(cookie.domain, host):
            matched[cookie.name] = cookie.value
    return matched
164
+
165
+
166
+ # ── logged-in session refresh (SCRAPER_LOGIN_HOOK) ───────────────────────────
167
+ #
168
+ # A cookies.txt export goes stale. Configure SCRAPER_LOGIN_HOOK="pkg.module:func"
169
+ # — a callable func(host) -> {cookie_name: value}. When an authed host trips a
170
+ # login / bot wall, the engine calls the hook once, persists the returned cookies
171
+ # per host (in the session cache under "logins"), and overlays them on the
172
+ # cookies.txt jar for every later request and run. The hook owns the site-specific
173
+ # mechanics (drive a browser, hit an auth API, read a secret); the engine stays
174
+ # generic, which is what lets it cover many/varied logged-in sites.
175
+
176
# Serialises the one-time resolution performed by _login_hook().
_LOGIN_LOCK = threading.Lock()
# The resolved hook callable; stays None when none is configured or loading failed.
_LOGIN_HOOK = None
# Set once _login_hook() has made its single resolution attempt.
_LOGIN_HOOK_LOADED = False
179
+
180
+
181
def _login_hook():
    """Resolve SCRAPER_LOGIN_HOOK ("pkg.module:func") to a callable, once.

    Resolution is attempted a single time per process; later calls return the
    stored result.

    Returns:
        The hook callable, or None when the variable is unset, is not in
        ``module:func`` form, the import fails, or the named attribute is not
        callable. Every failure is logged.
    """
    global _LOGIN_HOOK, _LOGIN_HOOK_LOADED
    if _LOGIN_HOOK_LOADED:
        return _LOGIN_HOOK
    with _LOGIN_LOCK:
        # Re-check under the lock: another thread may have resolved the hook
        # while this one was waiting.
        if not _LOGIN_HOOK_LOADED:
            spec = os.getenv("SCRAPER_LOGIN_HOOK", "")
            if spec:
                mod, sep, fn = spec.partition(":")
                if not (sep and mod and fn):
                    # A misconfigured hook used to be dropped without a trace.
                    logger.error(
                        f"session_cache: login hook {spec!r} ignored: "
                        "expected 'pkg.module:func'")
                else:
                    try:
                        hook = getattr(importlib.import_module(mod), fn)
                        if not callable(hook):
                            raise TypeError(f"{fn!r} is not callable")
                        _LOGIN_HOOK = hook
                        logger.info(f"session_cache: login hook loaded ({spec})")
                    except Exception as e:
                        logger.error(f"session_cache: login hook {spec!r} load failed: {e}")
            _LOGIN_HOOK_LOADED = True
    return _LOGIN_HOOK
197
+
198
+
199
def has_login_hook() -> bool:
    """True when a login hook was configured and resolved successfully."""
    hook = _login_hook()
    return hook is not None
201
+
202
+
203
def _login_cookies(url: str) -> dict:
    """Copy of the refreshed login cookies stored for this URL's host.

    Entries live under the cache's "logins" section keyed by host alone (no
    egress scope). Returns {} when the host has none.
    """
    with _LOCK:
        logins = _load().get("logins", {})
        entry = logins.get(host_of(url))
        if not entry:
            return {}
        return dict(entry.get("cookies", {}))
209
+
210
+
211
def is_authed_host(url: str) -> bool:
    """True when a logged-in credential is held for this host.

    That is either a matching cookie from the imported cookies.txt or
    previously refreshed login cookies. It lets the policy tell a dead session
    (worth logging in again for) from a plain bot wall.
    """
    if _auth_cookies(url):
        return True
    return bool(_login_cookies(url))
216
+
217
+
218
def refresh_login(url: str) -> bool:
    """Run the login hook for this URL's host and persist the cookies it returns.

    Returns:
        True when the hook produced cookies, which are then stored under the
        cache's "logins" section with a timestamp. False when no hook is
        configured, the hook raised (logged), or it returned nothing (logged).
    """
    hook = _login_hook()
    if not hook:
        return False
    host = host_of(url)
    try:
        cookies = hook(host) or {}
    except Exception as e:
        logger.error(f"session_cache: login hook failed for {host}: {e}")
        return False
    if cookies:
        with _LOCK:
            db = _load()
            db.setdefault("logins", {})[host] = {"cookies": dict(cookies), "ts": time.time()}
            _save(db)
        logger.info(f"session_cache: refreshed login for {host} ({len(cookies)} cookies)")
        return True
    logger.warning(f"session_cache: login hook returned no cookies for {host}")
    return False
239
+
240
+
241
+ # ── public API used by the tiers ─────────────────────────────────────────────
242
+
243
def cookies_for(url: str, *, include_cache: bool) -> dict:
    """Merge every cookie source for this URL into one {name: value} dict.

    Layers are applied in order, so a later one wins on a name clash:
    imported auth file, then refreshed login cookies, then (only when
    ``include_cache`` is true) the cached cf cookies.
    """
    merged = {**_auth_cookies(url), **_login_cookies(url)}
    if include_cache:
        merged.update(_cache_cookies(url))
    return merged
252
+
253
+
254
def cookie_header(url: str, *, include_cache: bool) -> str | None:
    """Render ``cookies_for`` as a ``Cookie:`` header value.

    Intended for curl_cffi, which has no cookies= kwarg. Returns None when
    there are no cookies to send.
    """
    jar = cookies_for(url, include_cache=include_cache)
    if not jar:
        return None
    pairs = [f"{name}={value}" for name, value in jar.items()]
    return "; ".join(pairs)
259
+
260
+
261
def browser_cookies(url: str | None = None) -> list[dict]:
    """Build Playwright ``add_cookies`` records from auth and login cookies.

    Every cookie in the imported jar is included with its own domain and path
    ("/" when the path is empty). When ``url`` is given, the refreshed login
    cookies for its host are appended with that host as domain and "/" as
    path. cf_clearance is not replayed into browsers — they solve natively.
    """
    records = []
    jar = _auth_jar()
    if jar:
        for cookie in jar:
            records.append({"name": cookie.name, "value": cookie.value,
                            "domain": cookie.domain, "path": cookie.path or "/"})
    if url:
        host = host_of(url)
        for name, value in _login_cookies(url).items():
            records.append({"name": name, "value": value, "domain": host, "path": "/"})
    return records