PyPI - switchback - Versions diffs - 0.1.0__py3-none-any.whl - Mend

switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

switchback/__init__.py +12 -0
switchback/__main__.py +4 -0
switchback/api.py +81 -0
switchback/concurrency.py +37 -0
switchback/content_cache.py +94 -0
switchback/egress.py +108 -0
switchback/extract.py +56 -0
switchback/flags.py +96 -0
switchback/normalize.py +81 -0
switchback/orchestrator.py +343 -0
switchback/policy/__init__.py +0 -0
switchback/policy/botwall.py +393 -0
switchback/policy/gates.py +173 -0
switchback/py.typed +0 -0
switchback/reporting.py +236 -0
switchback/search.py +39 -0
switchback/server.py +114 -0
switchback/session_cache.py +274 -0
switchback/session_trace.py +96 -0
switchback/tiers/__init__.py +24 -0
switchback/tiers/_browser.py +50 -0
switchback/tiers/tier0_apis.py +77 -0
switchback/tiers/tier1_http.py +65 -0
switchback/tiers/tier2_cloudscraper.py +135 -0
switchback/tiers/tier3_browser.py +59 -0
switchback/tiers/tier3b_camoufox.py +89 -0
switchback/tiers/tier4_firecrawl.py +48 -0
switchback/tiers/tier_residential.py +57 -0
switchback/tracing.py +152 -0
switchback-0.1.0.dist-info/METADATA +325 -0
switchback-0.1.0.dist-info/RECORD +36 -0
switchback-0.1.0.dist-info/WHEEL +5 -0
switchback-0.1.0.dist-info/entry_points.txt +3 -0
switchback-0.1.0.dist-info/licenses/LICENSE +21 -0
switchback-0.1.0.dist-info/licenses/NOTICE +34 -0
switchback-0.1.0.dist-info/top_level.txt +1 -0

switchback/orchestrator.py ADDED Viewed

@@ -0,0 +1,343 @@
+"""Cascade runner: route → run tiers in cost order → stop at first success.
+One trace per URL, one span per tier attempt. Botwall governs skip-listing and
+winning-tier routing; every outcome is recorded so the policy self-heals.
+Failures are first-class: every attempt is classified (see gates.classify_error)
+so a hard 403/429 escalates egress, the per-tier reasons are returned to callers
+via `run_detailed()`/`ScrapeOutcome`, and one aggregate event is logged + traced
+per URL. `run()` stays successes-only for backward compatibility.
+"""
+from __future__ import annotations
+import logging
+import os
+import random
+import time
+from dataclasses import dataclass, field
+from . import content_cache, egress, session_cache
+from .policy import botwall
+from .policy.gates import BotWall, RateLimited, ShortContent, classify_error, host_of
+from .tiers import TIERS, INDEX
+from .tracing import Attr, flush, span
+logger = logging.getLogger(__name__)
+# Per-request wall-clock budget. Checked between tiers so a single URL can't run
+# the whole cascade of timeouts; overridable via env. 45s balances latency vs
+# coverage: roughly fits a Camoufox solve (~40s) that starts after the cheaper
+# tiers fail fast, while still bounding the worst case.
+# Value is in seconds and is read once, when this module is imported.
+_DEADLINE_S = float(os.getenv("SCRAPER_DEADLINE_S", "45"))
+# Exponential backoff between tiers after a *transient* failure (rate_limited /
+# timeout) — gives a rate limiter or a slow origin a moment before the next tier
+# hammers it. Disabled by default (base 0) so behaviour is unchanged until opted
+# in. delay = min(MAX, BASE·2^(n−1)) with 50–100% jitter; never sleeps past the
+# per-request deadline.
+# Both values are in milliseconds and are read once, at import time.
+_BACKOFF_BASE_MS = float(os.getenv("SCRAPER_BACKOFF_BASE_MS", "0"))
+_BACKOFF_MAX_MS = float(os.getenv("SCRAPER_BACKOFF_MAX_MS", "8000"))
+# Failure classes counted as transient: each one bumps the cascade's backoff
+# counter and may trigger a sleep before the next tier.
+_TRANSIENT = ("rate_limited", "timeout")
+def _maybe_backoff(transient_n: int, deadline: float) -> None:
+    """Sleep briefly after the `transient_n`-th transient failure, if enabled.
+    The delay is min(_BACKOFF_MAX_MS, _BACKOFF_BASE_MS·2^(n−1)) scaled by a random
+    50–100% jitter factor. Nothing happens when backoff is disabled (base 0),
+    when `transient_n` is not positive, when the configured values produce a
+    non-positive/NaN delay, or when sleeping would reach `deadline`.
+    Args:
+        transient_n: number of transient failures seen so far for this URL.
+        deadline: `time.monotonic()` value the sleep must finish before.
+    """
+    # `not x > 0` (rather than `x <= 0`) also rejects NaN from a bad env value.
+    if transient_n <= 0 or not _BACKOFF_BASE_MS > 0:
+        return
+    capped_ms = min(_BACKOFF_MAX_MS, _BACKOFF_BASE_MS * (2 ** (transient_n - 1)))
+    delay = capped_ms * (0.5 + random.random() * 0.5) / 1000.0  # jitter → seconds
+    # A negative or NaN cap would otherwise reach time.sleep(), which raises
+    # ValueError and would abort the whole cascade over a config typo.
+    if not delay > 0:
+        return
+    if time.monotonic() + delay >= deadline:  # don't burn the whole budget sleeping
+        return
+    time.sleep(delay)
+# Per-attempt outcomes that aren't real failures (don't carry a failure reason).
+# Attempts with these outcomes are ignored when picking the dominant failure.
+_NON_FAILURE = ("ok", "not_applicable", "disabled")
+# How explanatory each failure class is, for picking the reason that best
+# describes why a URL failed. A real wall (403 / bot-wall) outranks a trailing
+# config error (e.g. Firecrawl with no API key → "error"), so the verdict points
+# at the actual blocker rather than the last thing that happened to throw.
+# Higher number = more explanatory. Outcomes missing from this table are
+# ranked 1 (same as "error") by _dominant_failure.
+_FAILURE_PRIORITY = {
+    "botwall": 5, "http_block": 5,
+    "rate_limited": 4, "short_content": 4,
+    "timeout": 3, "connection": 3,
+    "http_error": 2,
+    "error": 1,
+}
+@dataclass
+class ScrapeResult:
+    """A successful scrape, as returned by `run()` (failures are omitted there)."""
+    url: str            # the URL that was scraped
+    markdown: str       # extracted content
+    source_method: str  # tier NAME that won
+@dataclass
+class TierAttempt:
+    """One tier's attempt on a URL — what it was and why it ended."""
+    tier: str                     # tier NAME
+    outcome: str                  # ok | botwall | short_content | http_block | …
+    error: str = ""               # "<ExceptionType>: <message>" for a failed attempt, else ""
+    status_code: int | None = None  # HTTP status tied to the failure, when one is known
+    latency_ms: int | None = None   # duration of this tier's fetch, in milliseconds
+@dataclass
+class ScrapeOutcome:
+    """Full per-URL result, success or failure, with the cascade it took."""
+    url: str
+    ok: bool
+    markdown: str = ""
+    source_method: str = ""        # winning tier (on success)
+    final_outcome: str = ""        # ok | all_failed | deadline_exceeded | domain_skipped | url_excluded
+    error_class: str = ""          # dominant failure class (on failure)
+    status_code: int | None = None  # status code of the dominant failure, if any
+    latency_ms: int | None = None   # total cascade time in milliseconds (unset on skips and cache hits)
+    egress: str = "direct"         # "egress" if routed via SCRAPER_EGRESS_PROXY, else "direct"
+    wire_bytes: int = 0            # bytes transferred over the network (cost basis for proxy GB)
+    attempts: list[TierAttempt] = field(default_factory=list)  # per-tier attempts, in the order they ran
+def _dominant_failure(attempts: list[TierAttempt]) -> tuple[str, int | None]:
+    """Pick the attempt that best explains why the URL failed.
+    Non-failure outcomes are ignored. Among the rest, the highest
+    `_FAILURE_PRIORITY` wins (unlisted classes rank 1), and equal ranks go to
+    the attempt that appears later in the list (the later, more capable tier).
+    Returns `(outcome, status_code)`, or `("", None)` when nothing failed."""
+    failures = [a for a in attempts if a.outcome not in _NON_FAILURE]
+    if not failures:
+        return ("", None)
+    # Rank by (priority, position) so a tie resolves to the later attempt.
+    _, winner = max(
+        enumerate(failures),
+        key=lambda pair: (_FAILURE_PRIORITY.get(pair[1].outcome, 1), pair[0]),
+    )
+    return (winner.outcome, winner.status_code)
+def _start_index(url: str, db: dict) -> int:
+    """Return the index in `TIERS` at which the cascade for `url` should begin.
+    Normally that is the host's recorded winning tier (0 when none is known or
+    the name is not in `INDEX`). A host flagged needs_egress is escalated,
+    cheapest option first:
+      1. With a residential proxy wired (SCRAPER_EGRESS_PROXY), start from the
+         top so the HTTP tiers go through it (~0.2MB) rather than jumping to a
+         multi-MB remote browser render.
+      2. Otherwise, if the residential CDP browser tier exists and is not
+         disabled, start there.
+      3. Otherwise use normal routing, so the host is not stranded past every
+         usable tier."""
+    host = host_of(url)
+    if botwall.needs_egress(host, db):
+        if egress.has_egress_proxy():
+            return 0
+        residential_idx = INDEX.get("tier_residential")
+        if residential_idx is not None:
+            is_disabled = getattr(TIERS[residential_idx], "disabled", None)
+            # A tier without a `disabled` hook counts as enabled.
+            if not is_disabled or not is_disabled():
+                return residential_idx
+    winner = botwall.winning_tier(host, db)
+    if not winner:
+        return 0
+    return INDEX.get(winner, 0)
+def _record_failure(sp, attempts, db, url, tier_name, outcome, exc, status, dt,
+                    challenge=None):
+    """Book-keep one failed tier attempt in a single place.
+    Writes the failure onto the tier span `sp`, persists it through
+    `botwall.record`, appends a `TierAttempt` to `attempts`, and logs it. Every
+    except branch of the cascade goes through here so classification, tracing
+    and the event log stay in step. `challenge` is the bot-wall vendor name when
+    one was served, letting the policy learn it per host."""
+    detail = f"{type(exc).__name__}: {exc}"
+    span_attrs = (
+        (Attr.OUTCOME, outcome),
+        (Attr.ERROR, detail),
+        (Attr.ERROR_CLASS, outcome),
+        (Attr.CHALLENGE, challenge),
+        (Attr.STATUS_CODE, status),
+        (Attr.LATENCY_MS, dt),
+    )
+    for key, value in span_attrs:
+        sp.set(key, value)
+    botwall.record(db, url, tier_name, outcome, error=detail, latency_ms=dt,
+                   status_code=status, challenge=challenge)
+    # Being walled on a host with a cached cf_clearance means that cookie is
+    # stale or IP-mismatched; forget it so the next attempt re-solves.
+    if outcome == "botwall" or outcome == "http_block":
+        session_cache.forget(url)
+    attempts.append(TierAttempt(tier_name, outcome, detail, status, dt))
+    status_part = f" [{status}]" if status else ""
+    # Walls and short content are routine, so they log at info; the rest warn.
+    if outcome in ("botwall", "short_content"):
+        emit = logger.info
+    else:
+        emit = logger.warning
+    emit(f"{tier_name} {outcome} {url}" + status_part + f": {exc}")
+def _skipped(url, root, outcome, reason) -> ScrapeOutcome:
+    """Finish a URL that is skip-listed (by domain or by URL) without fetching.
+    Logs the skip, marks the root span, writes the aggregate event, and returns
+    a failed ScrapeOutcome whose final_outcome and error_class are both `outcome`."""
+    logger.info(f"{outcome}: {url} [{reason}]")
+    root.set(Attr.OUTCOME, outcome)
+    botwall.log_final(url, outcome, error=reason)
+    return ScrapeOutcome(
+        url=url,
+        ok=False,
+        final_outcome=outcome,
+        error_class=outcome,
+    )
+def _run_one(url: str, db: dict) -> ScrapeOutcome:
+    """Scrape a single URL under one root "scrape" span and return its outcome.
+    Checks run in this order: host skip-list → URL skip-list → content cache →
+    tier cascade. A cascade that ends in botwall/http_block on an authed host
+    with a login hook is re-run once, on a fresh deadline, after a successful
+    login refresh.
+    Args:
+        url: the URL to scrape.
+        db: botwall policy DB (as loaded by `botwall.load_db()`).
+    Returns:
+        The ScrapeOutcome for `url`; for cascade results `egress` and
+        `wire_bytes` are filled in before returning.
+    """
+    host = host_of(url)
+    t0 = time.monotonic()
+    deadline = t0 + _DEADLINE_S
+    egress.take_wire_bytes()  # zero the per-thread wire-byte tally for this URL
+    with span("scrape", **{Attr.HOST: host, Attr.DEADLINE_S: _DEADLINE_S}) as root:
+        if botwall.is_skipped(host, db):
+            # Look the reason up defensively (same pattern as the URL branch
+            # below): a missing "hosts" table or host entry must not turn a
+            # skip into a KeyError.
+            reason = db.get("hosts", {}).get(host, {}).get("reason", "")
+            return _skipped(url, root, "domain_skipped", reason)
+        if botwall.is_url_skipped(url, db):
+            return _skipped(url, root, "url_excluded",
+                            db.get("urls", {}).get(url, {}).get("reason", ""))
+        hit = content_cache.get(url)
+        if hit:
+            md, method = hit
+            root.set(Attr.OUTCOME, "cache_hit")
+            root.set(Attr.SOURCE, method)
+            root.set(Attr.MD_LEN, len(md))
+            logger.info(f"cache_hit {url} (was {method})")
+            return ScrapeOutcome(url, True, markdown=md, source_method=method,
+                                 final_outcome="ok")
+        # A needs_egress host runs the whole cascade in the egress scope, so the
+        # tiers route through SCRAPER_EGRESS_PROXY (when set); easy hosts stay
+        # direct and never spend residential bandwidth.
+        with egress.egress_scope(botwall.needs_egress(host, db)):
+            res = _run_cascade(url, host, db, root, t0, deadline)
+            # Dead logged-in session? For an authed host with a login hook wired,
+            # refresh the cookies once and re-run on a fresh budget — the
+            # refreshed cookies overlay every tier (and persist for later runs).
+            if (not res.ok and res.error_class in ("botwall", "http_block")
+                    and session_cache.has_login_hook()
+                    and session_cache.is_authed_host(url)
+                    and session_cache.refresh_login(url)):
+                logger.info(f"re-running after login refresh: {url}")
+                rt = time.monotonic()
+                res = _run_cascade(url, host, db, root, rt, rt + _DEADLINE_S)
+            # Record which egress this URL's cascade ran on, while still in scope —
+            # "egress" means its bytes were metered residential-proxy bandwidth.
+            res.egress = egress.scope_label()
+            res.wire_bytes = egress.take_wire_bytes()
+            return res
+def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
+    """Run the tier cascade for one URL and return its ScrapeOutcome.
+    Walks `TIERS` in order from `_start_index(url, db)` and returns on the first
+    tier whose `fetch()` yields a non-None result. Each failed attempt is
+    recorded through `_record_failure`; rate_limited/timeout failures also bump
+    the backoff counter and may sleep before the next tier.
+    Args:
+        url: the URL to scrape.
+        host: host of `url`, used as a span attribute.
+        db: botwall policy DB, passed through to `botwall.record` and `_start_index`.
+        root: the enclosing per-URL span; receives the final outcome attributes.
+        t0: `time.monotonic()` start used to compute the total latency.
+        deadline: `time.monotonic()` value after which no further tier is started.
+    Returns:
+        A ScrapeOutcome whose final_outcome is "ok", "deadline_exceeded" or
+        "all_failed", carrying the per-tier attempts made.
+    """
+    attempts: list[TierAttempt] = []
+    transient = 0  # count of rate_limited/timeout misses so far (drives backoff)
+    start = _start_index(url, db)
+    for i, tier in enumerate(TIERS):
+        # Tiers before the chosen starting rung are not tried at all.
+        if i < start:
+            continue
+        # Env/feature gate (e.g. paid Firecrawl off, residential not wired).
+        disabled_fn = getattr(tier, "disabled", None)
+        if disabled_fn and disabled_fn():
+            logger.info(f"{tier.NAME} disabled; skipping {url}")
+            attempts.append(TierAttempt(tier.NAME, "disabled"))
+            continue
+        # Short-circuit: cloudscraper solves Cloudflare challenges, not IP-reputation
+        # blocks. If a cheaper HTTP tier already hit a hard http_block (bare 403/401),
+        # the same datacenter IP will block cloudscraper too — skip its (up to ~25s)
+        # solve attempt and go straight to the browser/egress tiers. A real CF
+        # challenge surfaces as `botwall`, not `http_block`, so this never skips a
+        # host cloudscraper could actually clear.
+        if tier.NAME == "tier2_cloudscraper" and any(
+                a.outcome == "http_block" for a in attempts):
+            logger.info(f"{tier.NAME} skipped (prior hard IP block): {url}")
+            attempts.append(TierAttempt(tier.NAME, "not_applicable"))
+            continue
+        # Limit: stop before starting another tier if we're out of budget.
+        if time.monotonic() >= deadline:
+            total = int((time.monotonic() - t0) * 1000)
+            ec, sc = _dominant_failure(attempts)
+            root.set(Attr.OUTCOME, "deadline_exceeded")
+            root.set(Attr.ERROR_CLASS, ec or "deadline_exceeded")
+            root.set(Attr.STATUS_CODE, sc)
+            root.set(Attr.LATENCY_MS, total)
+            logger.warning(
+                f"deadline {_DEADLINE_S}s exceeded before {tier.NAME} "
+                f"({total}ms): {url}")
+            botwall.log_final(url, "deadline_exceeded", latency_ms=total,
+                              error=ec, status_code=sc)
+            return ScrapeOutcome(url, False, final_outcome="deadline_exceeded",
+                                 error_class=ec or "deadline_exceeded",
+                                 status_code=sc, latency_ms=total, attempts=attempts)
+        # A tier marks itself as paid with a PAID attribute; absent means free.
+        paid = getattr(tier, "PAID", False)
+        with span(tier.NAME, **{Attr.HOST: host, Attr.TIER: tier.NAME}) as sp:
+            if paid:
+                # Count every invocation so the host can be promoted to skip.
+                botwall.record(db, url, tier.NAME, "firecrawl_used")
+            ts = time.monotonic()
+            try:
+                md = tier.fetch(url)
+            except BotWall as e:
+                dt = int((time.monotonic() - ts) * 1000)
+                _record_failure(sp, attempts, db, url, tier.NAME, "botwall", e, None, dt,
+                                challenge=getattr(e, "vendor", None))
+                continue
+            except ShortContent as e:
+                dt = int((time.monotonic() - ts) * 1000)
+                _record_failure(sp, attempts, db, url, tier.NAME, "short_content", e, None, dt)
+                continue
+            except RateLimited as e:
+                dt = int((time.monotonic() - ts) * 1000)
+                _record_failure(sp, attempts, db, url, tier.NAME, "rate_limited", e, 429, dt)
+                transient += 1
+                _maybe_backoff(transient, deadline)
+                continue
+            except Exception as e:
+                # Anything else is classified into an error class and optional status.
+                dt = int((time.monotonic() - ts) * 1000)
+                error_class, status = classify_error(e)
+                _record_failure(sp, attempts, db, url, tier.NAME, error_class, e, status, dt)
+                if error_class in _TRANSIENT:
+                    transient += 1
+                    _maybe_backoff(transient, deadline)
+                continue
+            dt = int((time.monotonic() - ts) * 1000)
+            if md is None:  # tier not applicable (e.g. no API mirror)
+                sp.set(Attr.OUTCOME, "not_applicable")
+                sp.set(Attr.LATENCY_MS, dt)
+                attempts.append(TierAttempt(tier.NAME, "not_applicable", latency_ms=dt))
+                continue
+            # Success: record it on the tier span, the policy DB, the content
+            # cache and the root span, then stop the cascade.
+            total = int((time.monotonic() - t0) * 1000)
+            sp.set(Attr.OUTCOME, "ok")
+            sp.set(Attr.MD_LEN, len(md))
+            sp.set(Attr.SOURCE, tier.NAME)
+            sp.set(Attr.LATENCY_MS, dt)
+            botwall.record(db, url, tier.NAME, "ok", md_len=len(md), latency_ms=dt)
+            content_cache.put(url, md, tier.NAME)
+            root.set(Attr.OUTCOME, "ok")
+            root.set(Attr.SOURCE, tier.NAME)
+            root.set(Attr.LATENCY_MS, total)
+            attempts.append(TierAttempt(tier.NAME, "ok", latency_ms=dt))
+            logger.info(
+                f"{tier.NAME} OK {url} md_len={len(md)} {dt}ms (total {total}ms)")
+            return ScrapeOutcome(url, True, markdown=md, source_method=tier.NAME,
+                                 final_outcome="ok", latency_ms=total, attempts=attempts)
+    # Loop ran out without a success: every tier from `start` on failed, was
+    # disabled, or was not applicable.
+    total = int((time.monotonic() - t0) * 1000)
+    ec, sc = _dominant_failure(attempts)
+    root.set(Attr.OUTCOME, "all_failed")
+    root.set(Attr.ERROR_CLASS, ec or "all_failed")
+    root.set(Attr.STATUS_CODE, sc)
+    root.set(Attr.LATENCY_MS, total)
+    botwall.log_final(url, "all_failed", latency_ms=total, error=ec, status_code=sc)
+    logger.warning(f"all tiers failed ({total}ms, {ec or 'no-attempt'}): {url}")
+    return ScrapeOutcome(url, False, final_outcome="all_failed",
+                         error_class=ec or "all_failed", status_code=sc,
+                         latency_ms=total, attempts=attempts)
+def run_detailed(urls: list[str]) -> list[ScrapeOutcome]:
+    """Scrape every URL in order and return one ScrapeOutcome per URL.
+    Each outcome is a success or a failure carrying the per-tier cascade and a
+    classified reason. The botwall DB is loaded once up front; it is saved and
+    the tracer flushed afterwards even if a URL raises."""
+    db = botwall.load_db()
+    try:
+        outcomes = [_run_one(target, db) for target in urls]
+    finally:
+        botwall.save_db(db)
+        flush()
+    return outcomes
+def run(urls: list[str]) -> list[ScrapeResult]:
+    """Scrape `urls` and return only the successes (backward-compatible API).
+    Failed URLs are dropped silently; call run_detailed() to see them."""
+    successes: list[ScrapeResult] = []
+    for outcome in run_detailed(urls):
+        if not outcome.ok:
+            continue
+        successes.append(
+            ScrapeResult(outcome.url, outcome.markdown, outcome.source_method))
+    return successes

switchback/policy/__init__.py ADDED Viewed

File without changes