switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,343 @@
1
+ """Cascade runner: route → run tiers in cost order → stop at first success.
2
+
3
+ One trace per URL, one span per tier attempt. Botwall governs skip-listing and
4
+ winning-tier routing; every outcome is recorded so the policy self-heals.
5
+
6
+ Failures are first-class: every attempt is classified (see gates.classify_error)
7
+ so a hard 403/429 escalates egress, the per-tier reasons are returned to callers
8
+ via `run_detailed()`/`ScrapeOutcome`, and one aggregate event is logged + traced
9
+ per URL. `run()` stays successes-only for backward compatibility.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import os
15
+ import random
16
+ import time
17
+ from dataclasses import dataclass, field
18
+
19
+ from . import content_cache, egress, session_cache
20
+ from .policy import botwall
21
+ from .policy.gates import BotWall, RateLimited, ShortContent, classify_error, host_of
22
+ from .tiers import TIERS, INDEX
23
+ from .tracing import Attr, flush, span
24
+
25
logger = logging.getLogger(__name__)

# Wall-clock budget for a single URL, in seconds. It is checked between tiers,
# so one URL cannot walk the whole cascade of timeouts. Overridable through the
# SCRAPER_DEADLINE_S environment variable. The 45s default trades latency for
# coverage: it roughly fits a Camoufox solve (~40s) that starts after the
# cheaper tiers have failed fast, while still bounding the worst case.
_DEADLINE_S = float(os.environ.get("SCRAPER_DEADLINE_S", "45"))

# Exponential backoff applied between tiers after a *transient* failure
# (rate_limited / timeout), so a rate limiter or a slow origin gets a moment
# before the next tier hits it. A base of 0 (the default) disables it, keeping
# behaviour unchanged until opted in.
#   delay = min(MAX, BASE * 2^(n-1)), scaled by 50-100% jitter,
# and the sleep never runs past the per-request deadline.
_BACKOFF_BASE_MS = float(os.environ.get("SCRAPER_BACKOFF_BASE_MS", "0"))
_BACKOFF_MAX_MS = float(os.environ.get("SCRAPER_BACKOFF_MAX_MS", "8000"))
_TRANSIENT = ("rate_limited", "timeout")
41
+
42
+
43
def _maybe_backoff(transient_n: int, deadline: float, *,
                   base_ms: float | None = None,
                   max_ms: float | None = None) -> None:
    """Sleep for a jittered exponential backoff after a transient failure.

    The delay is ``min(max_ms, base_ms * 2**(transient_n - 1))`` milliseconds,
    scaled by a random factor in [0.5, 1.0). Nothing happens when backoff is
    disabled (non-positive base or cap), when ``transient_n`` is not positive,
    or when the sleep would reach or pass ``deadline``.

    Args:
        transient_n: Number of transient failures seen so far (1 for the first).
        deadline: ``time.monotonic()`` value the sleep must finish before.
        base_ms: Backoff base in milliseconds; defaults to ``_BACKOFF_BASE_MS``.
        max_ms: Backoff cap in milliseconds; defaults to ``_BACKOFF_MAX_MS``.
    """
    base = _BACKOFF_BASE_MS if base_ms is None else base_ms
    cap = _BACKOFF_MAX_MS if max_ms is None else max_ms
    # Treat any non-positive setting as "disabled". A merely-falsy check would
    # let a negative value through and time.sleep() raises ValueError on a
    # negative duration, which would abort the whole cascade.
    if base <= 0 or cap <= 0 or transient_n <= 0:
        return
    # Clamp the exponent: 2.0 ** 1024 overflows a float. Past the clamp the
    # product is far beyond any sane cap (or inf), so min() yields the cap.
    growth = 2.0 ** min(transient_n - 1, 1023)
    delay_ms = min(cap, base * growth)
    delay_s = delay_ms * (0.5 + random.random() * 0.5) / 1000.0  # jitter, then ms -> s
    if time.monotonic() + delay_s >= deadline:  # don't burn the whole budget sleeping
        return
    time.sleep(delay_s)
51
+
52
# Attempt outcomes that are not real failures; they never carry a failure reason.
_NON_FAILURE = ("ok", "not_applicable", "disabled")

# Rank of each failure class by how well it explains why a URL failed. A real
# wall (403 / bot-wall) outranks a trailing config error (e.g. Firecrawl with no
# API key -> "error"), so the verdict names the actual blocker instead of
# whatever happened to throw last.
_FAILURE_PRIORITY = {
    "botwall": 5,
    "http_block": 5,
    "rate_limited": 4,
    "short_content": 4,
    "timeout": 3,
    "connection": 3,
    "http_error": 2,
    "error": 1,
}
66
+
67
+
68
@dataclass
class ScrapeResult:
    """A scraped page: its URL, the markdown, and the tier that produced it."""

    url: str
    markdown: str
    # NAME of the tier that won.
    source_method: str
73
+
74
+
75
@dataclass
class TierAttempt:
    """A single tier's attempt on one URL: which tier ran and how it ended."""

    tier: str
    # ok | botwall | short_content | http_block | ...
    outcome: str
    error: str = ""
    status_code: int | None = None
    latency_ms: int | None = None
83
+
84
+
85
@dataclass
class ScrapeOutcome:
    """Complete per-URL result (success or failure) plus the cascade it took."""

    url: str
    ok: bool
    markdown: str = ""
    # Winning tier (on success).
    source_method: str = ""
    # ok | all_failed | deadline_exceeded | *_skipped
    final_outcome: str = ""
    # Dominant failure class (on failure).
    error_class: str = ""
    status_code: int | None = None
    latency_ms: int | None = None
    # "egress" if routed via SCRAPER_EGRESS_PROXY, else "direct".
    egress: str = "direct"
    # Bytes transferred over the network (cost basis for proxy GB).
    wire_bytes: int = 0
    attempts: list[TierAttempt] = field(default_factory=list)
99
+
100
+
101
def _dominant_failure(attempts: list[TierAttempt]) -> tuple[str, int | None]:
    """Pick the failure that best explains why the URL failed.

    Attempts whose outcome is in ``_NON_FAILURE`` are ignored. Among the rest,
    the one with the highest ``_FAILURE_PRIORITY`` wins (unknown outcomes rank
    as 1); on a tie the later attempt wins.

    Returns:
        ``(outcome, status_code)`` of the chosen attempt, or ``("", None)``
        when no attempt actually failed.
    """
    failures = [att for att in attempts if att.outcome not in _NON_FAILURE]
    if not failures:
        return ("", None)
    # max() returns the first maximal item it meets, so scanning newest-first
    # makes equal priorities resolve to the later attempt.
    winner = max(reversed(failures),
                 key=lambda att: _FAILURE_PRIORITY.get(att.outcome, 1))
    return (winner.outcome, winner.status_code)
112
+
113
+
114
def _start_index(url: str, db: dict) -> int:
    """Return the index in TIERS at which the cascade for `url` should start.

    Normally that is the host's recorded winning tier (0 when there is none or
    it is not in INDEX). For a host flagged ``needs_egress`` the egress is
    escalated, cheapest option first:
      1. With a residential proxy wired (SCRAPER_EGRESS_PROXY), start again
         from the top so the HTTP tiers go through it (~0.2MB) rather than
         jumping to a multi-MB remote browser render.
      2. Otherwise, if the residential CDP browser tier exists and is not
         disabled, start at that tier.
      3. Otherwise use the normal routing, so the host is not stranded past
         every usable tier.
    """
    host = host_of(url)
    if botwall.needs_egress(host, db):
        if egress.has_egress_proxy():
            return 0
        residential = INDEX.get("tier_residential")
        if residential is not None:
            is_disabled = getattr(TIERS[residential], "disabled", None)
            # Usable when the tier has no `disabled` hook or the hook says no.
            if not is_disabled or not is_disabled():
                return residential
    known_good = botwall.winning_tier(host, db)
    if not known_good:
        return 0
    return INDEX.get(known_good, 0)
136
+
137
+
138
def _record_failure(sp, attempts, db, url, tier_name, outcome, exc, status, dt,
                    challenge=None):
    """Record one failed tier attempt in every place that tracks it.

    Annotates the tier span `sp`, persists the attempt through
    ``botwall.record``, appends a ``TierAttempt`` to `attempts`, and logs it.
    Every except branch goes through here so classification, tracing and the
    event log stay in step. `challenge` names the bot-wall vendor when one was
    served, so the policy can learn it per host.
    """
    detail = f"{type(exc).__name__}: {exc}"
    span_attrs = (
        (Attr.OUTCOME, outcome),
        (Attr.ERROR, detail),
        (Attr.ERROR_CLASS, outcome),
        (Attr.CHALLENGE, challenge),
        (Attr.STATUS_CODE, status),
        (Attr.LATENCY_MS, dt),
    )
    for key, value in span_attrs:
        sp.set(key, value)
    botwall.record(db, url, tier_name, outcome, error=detail, latency_ms=dt,
                   status_code=status, challenge=challenge)
    # A wall on a host we had a cached cf_clearance for means the cookie is
    # stale or IP-mismatched: drop it so the next attempt re-solves instead of
    # replaying it.
    if outcome in ("botwall", "http_block"):
        session_cache.forget(url)
    attempts.append(TierAttempt(tier_name, outcome, detail, status, dt))
    # botwall and short_content are logged at info level; the rest are warnings.
    if outcome in ("botwall", "short_content"):
        emit = logger.info
    else:
        emit = logger.warning
    status_part = f" [{status}]" if status else ""
    emit(f"{tier_name} {outcome} {url}{status_part}: {exc}")
161
+
162
+
163
def _skipped(url, root, outcome, reason) -> ScrapeOutcome:
    """Finish a URL that is short-circuited by a domain or URL skip.

    Logs the skip, stamps the root span with `outcome`, writes the aggregate
    event through ``botwall.log_final``, and returns a failed ScrapeOutcome
    whose ``final_outcome`` and ``error_class`` are both `outcome`.
    """
    logger.info(f"{outcome}: {url} [{reason}]")
    root.set(Attr.OUTCOME, outcome)
    botwall.log_final(url, outcome, error=reason)
    return ScrapeOutcome(
        url=url,
        ok=False,
        final_outcome=outcome,
        error_class=outcome,
    )
169
+
170
+
171
def _run_one(url: str, db: dict) -> ScrapeOutcome:
    """Scrape a single URL under one root "scrape" span and return its outcome.

    Order of checks: host skip-list, URL skip-list, content cache, then the
    tier cascade. If the cascade ends walled (botwall / http_block) on an
    authed host with a login hook, the login is refreshed once and the cascade
    is re-run on a fresh deadline.
    """
    host = host_of(url)
    started = time.monotonic()
    egress.take_wire_bytes()  # zero the per-thread wire-byte tally for this URL
    with span("scrape", **{Attr.HOST: host, Attr.DEADLINE_S: _DEADLINE_S}) as root:
        if botwall.is_skipped(host, db):
            skip_reason = db["hosts"][host].get("reason", "")
            return _skipped(url, root, "domain_skipped", skip_reason)
        if botwall.is_url_skipped(url, db):
            skip_reason = db.get("urls", {}).get(url, {}).get("reason", "")
            return _skipped(url, root, "url_excluded", skip_reason)

        cached = content_cache.get(url)
        if cached:
            markdown, method = cached
            root.set(Attr.OUTCOME, "cache_hit")
            root.set(Attr.SOURCE, method)
            root.set(Attr.MD_LEN, len(markdown))
            logger.info(f"cache_hit {url} (was {method})")
            return ScrapeOutcome(url, True, markdown=markdown,
                                 source_method=method, final_outcome="ok")

        # A needs_egress host runs its whole cascade inside the egress scope,
        # so the tiers route through SCRAPER_EGRESS_PROXY (when set); easy
        # hosts stay direct and never spend residential bandwidth.
        with egress.egress_scope(botwall.needs_egress(host, db)):
            outcome = _run_cascade(url, host, db, root, started,
                                   started + _DEADLINE_S)
            # Dead logged-in session? For an authed host with a login hook
            # wired, refresh the cookies once and re-run on a fresh budget; the
            # refreshed cookies overlay every tier (and persist for later runs).
            walled = (not outcome.ok
                      and outcome.error_class in ("botwall", "http_block"))
            if (walled
                    and session_cache.has_login_hook()
                    and session_cache.is_authed_host(url)
                    and session_cache.refresh_login(url)):
                logger.info(f"re-running after login refresh: {url}")
                restarted = time.monotonic()
                outcome = _run_cascade(url, host, db, root, restarted,
                                       restarted + _DEADLINE_S)
            # Record which egress the cascade ran on while still in scope;
            # "egress" means its bytes were metered residential-proxy bandwidth.
            outcome.egress = egress.scope_label()
            outcome.wire_bytes = egress.take_wire_bytes()
        return outcome
212
+
213
+
214
def _elapsed_ms(since: float) -> int:
    """Whole milliseconds elapsed on the monotonic clock since `since`."""
    return int((time.monotonic() - since) * 1000)


def _mark_failed(root, attempts, t0, final_outcome):
    """Stamp the root span for a failed URL.

    Returns ``(total_ms, error_class, status_code)`` where the last two come
    from the dominant failing attempt (``""`` / ``None`` when there is none).
    """
    total = _elapsed_ms(t0)
    ec, sc = _dominant_failure(attempts)
    root.set(Attr.OUTCOME, final_outcome)
    root.set(Attr.ERROR_CLASS, ec or final_outcome)
    root.set(Attr.STATUS_CODE, sc)
    root.set(Attr.LATENCY_MS, total)
    return total, ec, sc


def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
    """Run the tiers in order from the host's start index until one succeeds.

    Args:
        url: URL to fetch.
        host: Host of `url`, used as a span attribute.
        db: Botwall policy database; every attempt is recorded into it.
        root: Root span of this URL; receives the final outcome attributes.
        t0: ``time.monotonic()`` start used for the total latency.
        deadline: ``time.monotonic()`` value after which no new tier starts.

    Returns:
        A ScrapeOutcome with ``final_outcome`` of ``"ok"``,
        ``"deadline_exceeded"`` or ``"all_failed"`` and the list of attempts.
    """
    attempts: list[TierAttempt] = []
    transient = 0  # count of rate_limited/timeout misses so far (drives backoff)
    # Backoff is deferred until another tier is really about to run, so a
    # transient failure on the last usable tier does not sleep for nothing
    # before all_failed is reported.
    backoff_due = False
    start = _start_index(url, db)
    for i, tier in enumerate(TIERS):
        if i < start:
            continue

        # Env/feature gate (e.g. paid Firecrawl off, residential not wired).
        disabled_fn = getattr(tier, "disabled", None)
        if disabled_fn and disabled_fn():
            logger.info(f"{tier.NAME} disabled; skipping {url}")
            attempts.append(TierAttempt(tier.NAME, "disabled"))
            continue

        # Short-circuit: cloudscraper solves Cloudflare challenges, not
        # IP-reputation blocks. If a cheaper HTTP tier already hit a hard
        # http_block (bare 403/401), the same datacenter IP will block
        # cloudscraper too, so skip its (up to ~25s) solve attempt and go
        # straight to the browser/egress tiers. A real CF challenge surfaces as
        # `botwall`, not `http_block`, so this never skips a host cloudscraper
        # could actually clear.
        if tier.NAME == "tier2_cloudscraper" and any(
                a.outcome == "http_block" for a in attempts):
            logger.info(f"{tier.NAME} skipped (prior hard IP block): {url}")
            attempts.append(TierAttempt(tier.NAME, "not_applicable"))
            continue

        # Pay the pending backoff now that a tier is actually about to run.
        # It happens before the deadline check so an overshooting sleep is
        # still caught by that check.
        if backoff_due:
            backoff_due = False
            _maybe_backoff(transient, deadline)

        # Limit: stop before starting another tier if we're out of budget.
        if time.monotonic() >= deadline:
            total, ec, sc = _mark_failed(root, attempts, t0, "deadline_exceeded")
            logger.warning(
                f"deadline {_DEADLINE_S}s exceeded before {tier.NAME} "
                f"({total}ms): {url}")
            botwall.log_final(url, "deadline_exceeded", latency_ms=total,
                              error=ec, status_code=sc)
            return ScrapeOutcome(url, False, final_outcome="deadline_exceeded",
                                 error_class=ec or "deadline_exceeded",
                                 status_code=sc, latency_ms=total,
                                 attempts=attempts)

        paid = getattr(tier, "PAID", False)
        with span(tier.NAME, **{Attr.HOST: host, Attr.TIER: tier.NAME}) as sp:
            if paid:
                # Count every invocation so the host can be promoted to skip.
                botwall.record(db, url, tier.NAME, "firecrawl_used")
            ts = time.monotonic()
            try:
                md = tier.fetch(url)
            except BotWall as e:
                _record_failure(sp, attempts, db, url, tier.NAME, "botwall", e,
                                None, _elapsed_ms(ts),
                                challenge=getattr(e, "vendor", None))
                continue
            except ShortContent as e:
                _record_failure(sp, attempts, db, url, tier.NAME,
                                "short_content", e, None, _elapsed_ms(ts))
                continue
            except RateLimited as e:
                _record_failure(sp, attempts, db, url, tier.NAME,
                                "rate_limited", e, 429, _elapsed_ms(ts))
                transient += 1
                backoff_due = True
                continue
            except Exception as e:
                dt = _elapsed_ms(ts)
                error_class, status = classify_error(e)
                _record_failure(sp, attempts, db, url, tier.NAME, error_class,
                                e, status, dt)
                if error_class in _TRANSIENT:
                    transient += 1
                    backoff_due = True
                continue

            dt = _elapsed_ms(ts)
            if md is None:  # tier not applicable (e.g. no API mirror)
                sp.set(Attr.OUTCOME, "not_applicable")
                sp.set(Attr.LATENCY_MS, dt)
                attempts.append(
                    TierAttempt(tier.NAME, "not_applicable", latency_ms=dt))
                continue

            total = _elapsed_ms(t0)
            sp.set(Attr.OUTCOME, "ok")
            sp.set(Attr.MD_LEN, len(md))
            sp.set(Attr.SOURCE, tier.NAME)
            sp.set(Attr.LATENCY_MS, dt)
            botwall.record(db, url, tier.NAME, "ok", md_len=len(md),
                           latency_ms=dt)
            content_cache.put(url, md, tier.NAME)
            root.set(Attr.OUTCOME, "ok")
            root.set(Attr.SOURCE, tier.NAME)
            root.set(Attr.LATENCY_MS, total)
            attempts.append(TierAttempt(tier.NAME, "ok", latency_ms=dt))
            logger.info(
                f"{tier.NAME} OK {url} md_len={len(md)} {dt}ms (total {total}ms)")
            return ScrapeOutcome(url, True, markdown=md,
                                 source_method=tier.NAME, final_outcome="ok",
                                 latency_ms=total, attempts=attempts)

    # Every tier from the start index was skipped, not applicable, or failed.
    total, ec, sc = _mark_failed(root, attempts, t0, "all_failed")
    botwall.log_final(url, "all_failed", latency_ms=total, error=ec,
                      status_code=sc)
    logger.warning(f"all tiers failed ({total}ms, {ec or 'no-attempt'}): {url}")
    return ScrapeOutcome(url, False, final_outcome="all_failed",
                         error_class=ec or "all_failed", status_code=sc,
                         latency_ms=total, attempts=attempts)
324
+
325
+
326
def run_detailed(urls: list[str]) -> list[ScrapeOutcome]:
    """Scrape each URL in order and return one ScrapeOutcome per URL.

    Each outcome is a success or a failure carrying the per-tier cascade and a
    classified reason. The botwall database is loaded once, shared by all URLs,
    and saved on the way out even if a URL raises; tracing is flushed
    afterwards.
    """
    db = botwall.load_db()
    try:
        out = [_run_one(url, db) for url in urls]
    finally:
        # Nested so that a failing save_db cannot skip the trace flush.
        try:
            botwall.save_db(db)
        finally:
            flush()
    return out
338
+
339
+
340
def run(urls: list[str]) -> list[ScrapeResult]:
    """Scrape `urls` and return only the successes (backward-compatible API).

    Failed URLs are dropped; call run_detailed() to see them.
    """
    successes: list[ScrapeResult] = []
    for outcome in run_detailed(urls):
        if outcome.ok:
            successes.append(
                ScrapeResult(outcome.url, outcome.markdown, outcome.source_method))
    return successes
File without changes