PyPI - switchback - Versions diffs - 0.1.0__py3-none-any.whl - Mend

switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

switchback/__init__.py +12 -0
switchback/__main__.py +4 -0
switchback/api.py +81 -0
switchback/concurrency.py +37 -0
switchback/content_cache.py +94 -0
switchback/egress.py +108 -0
switchback/extract.py +56 -0
switchback/flags.py +96 -0
switchback/normalize.py +81 -0
switchback/orchestrator.py +343 -0
switchback/policy/__init__.py +0 -0
switchback/policy/botwall.py +393 -0
switchback/policy/gates.py +173 -0
switchback/py.typed +0 -0
switchback/reporting.py +236 -0
switchback/search.py +39 -0
switchback/server.py +114 -0
switchback/session_cache.py +274 -0
switchback/session_trace.py +96 -0
switchback/tiers/__init__.py +24 -0
switchback/tiers/_browser.py +50 -0
switchback/tiers/tier0_apis.py +77 -0
switchback/tiers/tier1_http.py +65 -0
switchback/tiers/tier2_cloudscraper.py +135 -0
switchback/tiers/tier3_browser.py +59 -0
switchback/tiers/tier3b_camoufox.py +89 -0
switchback/tiers/tier4_firecrawl.py +48 -0
switchback/tiers/tier_residential.py +57 -0
switchback/tracing.py +152 -0
switchback-0.1.0.dist-info/METADATA +325 -0
switchback-0.1.0.dist-info/RECORD +36 -0
switchback-0.1.0.dist-info/WHEEL +5 -0
switchback-0.1.0.dist-info/entry_points.txt +3 -0
switchback-0.1.0.dist-info/licenses/LICENSE +21 -0
switchback-0.1.0.dist-info/licenses/NOTICE +34 -0
switchback-0.1.0.dist-info/top_level.txt +1 -0

switchback/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""switchback — one cost-ordered scrape cascade, used by every tool.
+Public API:
+    from switchback import scrape, search
+    results = scrape(["https://example.com/article"])
+    hits = search("web scraping")            # query → URLs (SearXNG)
+"""
+from .api import scrape, scrape_detailed, ScrapeOutcome, ScrapeResult, TierAttempt
+from .search import search, SearchResult
+# Names exported by `from switchback import *`; the same names imported above
+# from .api and .search.
+__all__ = ["scrape", "scrape_detailed", "ScrapeOutcome", "ScrapeResult",
+           "TierAttempt", "search", "SearchResult"]

switchback/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+"""`python -m switchback <url> [<url> ...]`"""
+from .api import _main
+# Run the CLI; the int returned by _main() becomes the process exit status.
+raise SystemExit(_main())

switchback/api.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""Public entry point + CLI.
+    from switchback import scrape
+    results = scrape(["https://example.com/article"])
+    # or:  python -m switchback.api <url> [<url> ...]
+    #      python -m switchback.api --search <query ...>
+"""
+from __future__ import annotations
+import sys
+from .orchestrator import ScrapeOutcome, ScrapeResult, TierAttempt, run, run_detailed
+from .search import search  # re-export: query → URLs (SearXNG)
+def scrape(urls: str | list[str]) -> list[ScrapeResult]:
+    """Run the scrape cascade over one URL or a list of URLs.
+    A bare string is treated as a one-URL batch. Only successes are returned;
+    call scrape_detailed() to also get failures with their classified reasons
+    and the per-tier cascade."""
+    batch = [urls] if isinstance(urls, str) else urls
+    return run(batch)
+def scrape_detailed(urls: str | list[str]) -> list[ScrapeOutcome]:
+    """Scrape one URL or a list of URLs and report on every one of them.
+    Unlike scrape(), the result holds a ScrapeOutcome per URL — successes *and*
+    failures — each carrying final_outcome, error_class, status_code and the
+    per-tier attempts that were made. A bare string is a one-URL batch."""
+    batch = [urls] if isinstance(urls, str) else urls
+    return run_detailed(batch)
+def _main() -> int:
+    """CLI entry point; returns the process exit status.
+    ``<url> [<url> ...]`` scrapes the URLs and prints the successes as JSON on
+    stdout; ``--search <query ...>`` prints the search hits as JSON instead.
+    Exit status: 0 on success or --help, 1 when nothing was scraped/found,
+    2 when the arguments (or the search query) are missing — usage then goes
+    to stderr."""
+    import json
+    import logging
+    import pathlib
+    usage = ("usage: switchback <url> [<url> ...]\n"
+             "       switchback --search <query ...>\n"
+             "       (or: python -m switchback <url> ...)")
+    argv = sys.argv[1:]
+    # --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
+    # as a URL to scrape). Handled first — before the .env load, the tracing
+    # import and the logging setup — so it really is fast and side-effect-free.
+    if any(a in ("--help", "-h") for a in argv):
+        print(usage)
+        return 0
+    from .tracing import setup_logs
+    # Auto-load .env from the directory above the package (the repo root in a
+    # checkout) so OTEL/SCRAPER vars are set even when invoked as a subprocess
+    # (parent process needn't export them explicitly). Variables already present
+    # in the environment win over the file.
+    _env = pathlib.Path(__file__).parent.parent / ".env"
+    # is_file(): a directory that happens to be named .env must not crash the CLI.
+    if _env.is_file():
+        import os as _os
+        # Explicit UTF-8 so parsing doesn't depend on the locale encoding.
+        for _line in _env.read_text(encoding="utf-8").splitlines():
+            _line = _line.strip()
+            if _line and not _line.startswith("#") and "=" in _line:
+                _k, _, _v = _line.partition("=")
+                _k = _k.strip()
+                if _k and _k not in _os.environ:
+                    _os.environ[_k] = _v.strip()
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    setup_logs()  # also ship logs to the OTLP backend when configured
+    if not argv:
+        print(usage, file=sys.stderr)
+        return 2
+    if argv[0] == "--search":
+        query = " ".join(argv[1:])
+        # A bare `--search` has nothing to look up: report it as a usage error
+        # instead of sending an empty query to the search backend.
+        if not query.strip():
+            print(usage, file=sys.stderr)
+            return 2
+        hits = search(query)
+        print(json.dumps(
+            [{"title": h.title, "url": h.url, "snippet": h.snippet} for h in hits],
+            indent=2))
+        return 0 if hits else 1
+    results = scrape(argv)
+    print(json.dumps(
+        [{"url": r.url, "source_method": r.source_method, "markdown": r.markdown}
+         for r in results],
+        indent=2))
+    return 0 if results else 1
+# Lets the module run as a script (`python -m switchback.api ...`, as shown in
+# the module docstring); the CLI's return value is the exit status.
+if __name__ == "__main__":
+    raise SystemExit(_main())

switchback/concurrency.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Process-wide cap on simultaneous headless browsers.
+The browser tiers (patchright ~150MB, Camoufox ~600MB) are the memory-heavy
+rungs. The engine scrapes sequentially today, but when callers run scrapes in
+parallel this semaphore bounds how many browsers spin up at once. Default 1
+matches the sequential design (one browser, one footprint); raise it with
+SCRAPER_BROWSER_CONCURRENCY once you know the box has headroom.
+"""
+from __future__ import annotations
+import logging
+import os
+import threading
+import time
+from contextlib import contextmanager
+logger = logging.getLogger(__name__)
+# Slot count, read once at import from SCRAPER_BROWSER_CONCURRENCY (default "1")
+# and clamped to at least 1. A non-integer value raises ValueError at import.
+MAX_BROWSER_CONCURRENCY = max(1, int(os.getenv("SCRAPER_BROWSER_CONCURRENCY", "1")))
+# The process-wide pool of browser slots handed out by browser_slot().
+_sem = threading.BoundedSemaphore(MAX_BROWSER_CONCURRENCY)
+@contextmanager
+def browser_slot(label: str = "browser"):
+    """Hold one browser slot for the duration of the ``with`` body.
+    Blocks until one of the MAX_BROWSER_CONCURRENCY slots is free and gives it
+    back on exit, even when the body raises. A wait longer than 0.1s is logged
+    at INFO under ``label`` — a sign that the cap is saturated."""
+    started = time.monotonic()
+    _sem.acquire()
+    queued_s = time.monotonic() - started
+    had_to_queue = queued_s > 0.1
+    if had_to_queue:
+        logger.info(f"{label}: waited {queued_s * 1000:.0f}ms for a browser slot "
+                    f"(SCRAPER_BROWSER_CONCURRENCY={MAX_BROWSER_CONCURRENCY})")
+    try:
+        yield
+    finally:
+        _sem.release()

switchback/content_cache.py ADDED Viewed

@@ -0,0 +1,94 @@
+"""URL → result cache, so an already-scraped page isn't scraped again.
+Off by default (article content goes stale): set ``SCRAPER_CONTENT_TTL_S`` to a
+positive number of seconds to enable it, e.g. 86400 to dedupe re-scrapes within a
+day. A hit short-circuits the whole cascade before any tier (or proxy byte) runs.
+Backed by stdlib sqlite (``state/content_cache.db``), not a JSON blob: at
+curiouscats' ~300k URLs/month a single JSON file would be reloaded and rewritten
+in full on every access. sqlite keys by URL and stays O(1). The cache is keyed by
+normalised URL (fragment dropped); the egress scope is irrelevant to the *content*
+so it isn't part of the key.
+"""
+from __future__ import annotations
+import logging
+import os
+import sqlite3
+import threading
+import time
+from urllib.parse import urlsplit, urlunsplit
+logger = logging.getLogger(__name__)
+# Default state directory: "state" inside the directory that contains the
+# package directory (two dirname() steps up from this file).
+_DEFAULT_STATE_DIR = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "state")
+# SCRAPER_STATE_DIR overrides where the sqlite file lives.
+_STATE_DIR = os.getenv("SCRAPER_STATE_DIR", _DEFAULT_STATE_DIR)
+DB_PATH = os.path.join(_STATE_DIR, "content_cache.db")
+# Entry lifetime in seconds, read once at import; a non-numeric value raises
+# ValueError here.
+_TTL_S = float(os.getenv("SCRAPER_CONTENT_TTL_S", "0"))  # 0 = disabled
+# Guards opening the shared connection and every statement get()/put() run on it.
+_LOCK = threading.Lock()
+# Opened on first use by _conn(); None until then.
+_CONN: sqlite3.Connection | None = None
+def enabled() -> bool:
+    """True when SCRAPER_CONTENT_TTL_S was set to a positive number of seconds."""
+    return 0 < _TTL_S
+def _norm(url: str) -> str:
+    """Cache key for ``url``: the same URL with its fragment removed.
+    Scheme, host, path and query are kept as they are — query strings select
+    content, so they stay significant."""
+    scheme, netloc, path, query, _fragment = urlsplit(url)
+    return urlunsplit((scheme, netloc, path, query, ""))
+def _conn() -> sqlite3.Connection:
+    """Return the shared sqlite connection, opening it on first use.
+    The first call creates the state directory and the ``cache`` table when
+    they are missing. Errors from os.makedirs / sqlite3 are not caught here."""
+    global _CONN
+    # Fast path: already open, no lock needed.
+    if _CONN is not None:
+        return _CONN
+    with _LOCK:
+        # Re-check under the lock: another thread may have opened the
+        # connection while this one was waiting.
+        if _CONN is None:
+            os.makedirs(_STATE_DIR, exist_ok=True)
+            # check_same_thread=False: the one connection is shared by every
+            # thread; get()/put() serialise their statements with _LOCK.
+            c = sqlite3.connect(DB_PATH, check_same_thread=False)
+            c.execute("CREATE TABLE IF NOT EXISTS cache ("
+                      "url TEXT PRIMARY KEY, markdown TEXT, source_method TEXT, ts REAL)")
+            c.commit()
+            # Publish only after the schema exists.
+            _CONN = c
+    return _CONN
+def get(url: str) -> tuple[str, str] | None:
+    """Return ``(markdown, source_method)`` for a fresh cache hit, else None.
+    None covers every miss: cache disabled, URL never stored, entry older than
+    the TTL, and any failure to open or read the database. The cache is
+    best-effort, so such failures are logged as warnings and never raised into
+    the scrape."""
+    if not enabled():
+        return None
+    try:
+        # NB: _conn() acquires _LOCK itself and _LOCK is not re-entrant, so it
+        # must be called before — not inside — the locked section. It lives
+        # inside the try so an unopenable database is a miss, not a crash.
+        conn = _conn()
+        with _LOCK:
+            row = conn.execute(
+                "SELECT markdown, source_method, ts FROM cache WHERE url=?",
+                (_norm(url),)).fetchone()
+    except Exception as e:
+        logger.warning(f"content_cache: read failed: {e}")
+        return None
+    if not row:
+        return None
+    markdown, source_method, ts = row
+    # Stale rows are reported as a miss; they are not deleted here.
+    if time.time() - ts > _TTL_S:
+        return None
+    return markdown, source_method
+def put(url: str, markdown: str, source_method: str) -> None:
+    """Store a successful scrape under the normalised URL. No-op when disabled.
+    An existing row for the same URL is replaced and its timestamp reset to
+    now. Best-effort: a failure to open or write the database is logged as a
+    warning and never raised into the scrape."""
+    if not enabled():
+        return
+    try:
+        # NB: _conn() acquires _LOCK itself and _LOCK is not re-entrant, so it
+        # must be called before — not inside — the locked section. It lives
+        # inside the try so an unopenable database cannot fail a scrape that
+        # has already succeeded.
+        conn = _conn()
+        with _LOCK:
+            conn.execute("INSERT OR REPLACE INTO cache (url, markdown, source_method, ts) "
+                         "VALUES (?, ?, ?, ?)",
+                         (_norm(url), markdown, source_method, time.time()))
+            conn.commit()
+    except Exception as e:
+        logger.warning(f"content_cache: write failed: {e}")

switchback/egress.py ADDED Viewed

@@ -0,0 +1,108 @@
+"""Egress configuration — proxy wiring shared across tiers.
+Two optional env vars:
+  SCRAPER_PROXY         applied to *every* request (all tiers, all URLs).
+  SCRAPER_EGRESS_PROXY  applied only while a request is in the "egress scope" —
+                        i.e. for a host the policy flagged needs_egress. This is
+                        the cost-scoped lever: the easy majority that already
+                        succeeds free at the datacenter IP stays direct, and only
+                        the hard, walled hosts spend the (often metered)
+                        residential proxy bandwidth.
+The orchestrator opens an egress scope around the cascade for needs_egress hosts
+(see ``egress_scope``); the per-tier helpers resolve the right proxy for the
+current scope. Both shapes are returned — the requests/curl_cffi ``proxies`` dict
+and the Playwright/Camoufox ``proxy`` dict. Nothing set / not in scope → None and
+the tier runs on the direct connection.
+"""
+from __future__ import annotations
+import os
+import threading
+from contextlib import contextmanager
+from urllib.parse import urlsplit
+# Per-thread scope flag. Threading-safe by construction: each worker thread in a
+# ThreadPoolExecutor gets its own instance (default unset → False), and the
+# orchestrator sets/reads it within the same thread that runs the tier fetch.
+# Attributes stored on it by this module: ``egress`` (bool, set by
+# egress_scope) and ``wire_bytes`` (int tally, see add_wire_bytes).
+_scope = threading.local()
+@contextmanager
+def egress_scope(enabled: bool):
+    """Run the ``with`` body with this thread's egress flag set to ``enabled``.
+    While the flag is on and SCRAPER_EGRESS_PROXY is set, the proxy helpers
+    hand out that proxy. The flag's previous value is put back on exit, whether
+    the body finishes, returns early or raises."""
+    saved = getattr(_scope, "egress", False)
+    _scope.egress = bool(enabled)
+    try:
+        yield
+    finally:
+        _scope.egress = saved
+def add_wire_bytes(n: int) -> None:
+    """Add ``n`` to this thread's count of bytes moved over the network.
+    HTTP tiers add the response size and browser tiers add each loaded
+    resource, so the orchestrator can cost residential bandwidth on real wire
+    bytes for the current URL's cascade rather than on the cleaned markdown."""
+    so_far = getattr(_scope, "wire_bytes", 0)
+    _scope.wire_bytes = so_far + int(n)
+def take_wire_bytes() -> int:
+    """Read this thread's wire-byte tally and reset it to zero.
+    Meant to be called once per URL; returns 0 when nothing was tallied."""
+    total = getattr(_scope, "wire_bytes", 0)
+    _scope.wire_bytes = 0
+    return total
+def has_egress_proxy() -> bool:
+    """Whether SCRAPER_EGRESS_PROXY (the residential/escalation proxy) is set
+    to a non-empty value."""
+    configured = os.getenv("SCRAPER_EGRESS_PROXY")
+    return True if configured else False
+def in_egress_scope() -> bool:
+    """This thread's raw egress flag (False when it was never set).
+    Thread-locals are not inherited, so a tier that moves its blocking call to
+    a worker thread reads the flag here and re-applies the scope there."""
+    try:
+        return _scope.egress
+    except AttributeError:
+        return False
+def scope_label() -> str:
+    """Name the route the current request takes: 'egress' or 'direct'.
+    'egress' needs both the per-thread egress flag and a configured
+    SCRAPER_EGRESS_PROXY. cf_clearance is IP-bound, so the session cache keys
+    on this label to never replay a cookie across the direct/proxy boundary."""
+    via_egress = getattr(_scope, "egress", False) and os.getenv("SCRAPER_EGRESS_PROXY")
+    return "egress" if via_egress else "direct"
+def _active_proxy_url() -> str | None:
+    """Pick the proxy URL that applies right now.
+    Inside an egress scope with SCRAPER_EGRESS_PROXY set, that proxy wins;
+    otherwise SCRAPER_PROXY is used, or None when it is unset or empty."""
+    if getattr(_scope, "egress", False):
+        egress_url = os.getenv("SCRAPER_EGRESS_PROXY")
+        if egress_url:
+            return egress_url
+    global_url = os.getenv("SCRAPER_PROXY")
+    return global_url if global_url else None
+def requests_proxies() -> dict | None:
+    """Proxy mapping for curl_cffi / requests / cloudscraper.
+    Returns {"http": url, "https": url} for the active proxy, or None when no
+    proxy applies and the tier should connect directly."""
+    url = _active_proxy_url()
+    if not url:
+        return None
+    return dict.fromkeys(("http", "https"), url)
+def playwright_proxy() -> dict | None:
+    """Proxy settings for patchright / camoufox, or None when no proxy applies.
+    The active proxy URL is split into {"server", "username"?, "password"?}:
+    ``server`` is scheme://host[:port] without credentials, and the credential
+    keys are present only when the URL carries them."""
+    from urllib.parse import unquote
+    url = _active_proxy_url()
+    if not url:
+        return None
+    parts = urlsplit(url)
+    host = parts.hostname
+    # urlsplit().hostname drops the brackets of an IPv6 literal; they have to
+    # go back or the port separator becomes ambiguous (http://::1:8080).
+    if host and ":" in host:
+        host = f"[{host}]"
+    server = f"{parts.scheme}://{host}"
+    if parts.port:
+        server += f":{parts.port}"
+    cfg: dict[str, str] = {"server": server}
+    # Credentials sit percent-encoded in the URL (e.g. "p%40ss" for "p@ss").
+    # This dict carries them as separate fields rather than inside a URL, so
+    # they are decoded here.
+    if parts.username:
+        cfg["username"] = unquote(parts.username)
+    if parts.password:
+        cfg["password"] = unquote(parts.password)
+    return cfg

switchback/extract.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Per-domain extraction preferences — remember how to carve each site.
+Markdown of the whole page is the default. Some sites need scoping (drop the
+mega-nav, keep the article) or specific elements pulled out. Rather than hardcode
+that per tier, declare it once per host in ``config/extraction.json`` and every
+tier's normalize step picks it up:
+    {
+      "www.example.com": {"selector": "article.main", "drop": [".ad", ".related"]},
+      "blog.example.org": {"selector": "main .post-body"}
+    }
+  selector  CSS selector to scope to (first match wins); page minus the rest.
+  drop      extra CSS selectors to remove before converting (ads, share bars).
+Matching is by exact host (FQDN), consistent with the botwall policy. Absent /
+unparseable config → no prefs, default whole-page markdown. Override the path
+with SCRAPER_EXTRACTION_FILE.
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+from .policy.gates import host_of
+logger = logging.getLogger(__name__)
+# Directory that contains the package directory (two dirname() steps up).
+_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+_DEFAULT_FILE = os.path.join(_PROJECT_ROOT, "config", "extraction.json")
+# SCRAPER_EXTRACTION_FILE overrides the path; read once at import.
+PREFS_FILE = os.getenv("SCRAPER_EXTRACTION_FILE", _DEFAULT_FILE)
+# Parsed prefs, filled by the first _load() call; None means "not loaded yet".
+_PREFS: dict | None = None
+def _load() -> dict:
+    """Return the host → prefs mapping, reading PREFS_FILE on first use.
+    The result is cached for the life of the process, including the empty
+    result, so the file is read at most once. A missing or unreadable file,
+    invalid JSON, or a top level that is not a JSON object all yield {} (the
+    latter two with a warning), i.e. default whole-page markdown."""
+    global _PREFS
+    if _PREFS is not None:
+        return _PREFS
+    prefs: dict = {}
+    if os.path.exists(PREFS_FILE):
+        try:
+            # JSON is UTF-8 by spec; don't depend on the locale encoding.
+            with open(PREFS_FILE, encoding="utf-8") as f:
+                loaded = json.load(f)
+        except Exception as e:
+            logger.warning(f"extract: could not read {PREFS_FILE}: {e}")
+        else:
+            if isinstance(loaded, dict):
+                prefs = loaded
+            elif loaded:
+                # A list/string/number here would break every later .get()
+                # lookup by host, so it is rejected up front.
+                logger.warning(f"extract: {PREFS_FILE} must hold a JSON object "
+                               f"keyed by host, got {type(loaded).__name__}; ignoring it")
+    _PREFS = prefs
+    return _PREFS
+def prefs_for(url: str | None) -> dict:
+    """Look up the extraction prefs configured for the host of ``url``.
+    Returns {} when ``url`` is empty/None or the host has no entry."""
+    if url:
+        return _load().get(host_of(url), {})
+    return {}

switchback/flags.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""Periodic flagging — surface the things worth a human glance, on a schedule.
+Requirement 13: "the system should flag some of these details from time to
+time." This isn't a daemon — it's a single pass over the metrics rollup that
+emits a digest to the logger (which ships to the OTLP backend when setup_logs() is on).
+Run it from cron / the /loop skill / any scheduler:
+    python -m switchback.flags                 # text digest to stdout + logs
+    python -m switchback.flags --minutes 60    # only the last hour
+    python -m switchback.flags --json          # machine-readable digest
+What it flags:
+  • domains still landing on paid Firecrawl (winning_tier == tier4_firecrawl)
+  • domains escalated to residential egress (needs_egress)
+  • domains throwing the most bot-wall challenges (by vendor)
+  • low coverage / negative cost savings in the window
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+from datetime import datetime, timedelta, timezone
+from .reporting import build_report
+from .tracing import setup_logs
+logger = logging.getLogger(__name__)
+# A domain is "stuck" if its winning tier is the paid one — these are the hosts
+# that still cost money and are the prime targets for a new tier / cookie / rule.
+# build_digest() compares this against each domain's "winning_tier" field.
+_PAID_TIER = "tier4_firecrawl"
+def build_digest(minutes: int | None = None) -> dict:
+    """Condense the metrics report into the items worth flagging.
+    ``minutes`` limits the report to that many minutes before now (UTC); None
+    or 0 passes no lower bound to build_report. The result holds the report's
+    coverage and cost sections, the hosts whose winning tier is the paid one
+    (most attempts first), the hosts marked needs_egress, and the ten hosts
+    with the most challenges in total."""
+    since = None
+    if minutes:
+        since = datetime.now(timezone.utc) - timedelta(minutes=minutes)
+    report = build_report(since=since)
+    by_host = report["domains"]
+    paid_hosts = (host for host, stats in by_host.items()
+                  if stats.get("winning_tier") == _PAID_TIER)
+    # reverse=True keeps ties in their original order, like a negated key does.
+    stuck = sorted(paid_hosts, key=lambda host: by_host[host]["attempts"], reverse=True)
+    egress = [host for host, stats in by_host.items() if stats.get("needs_egress")]
+    challenged = ((host, stats["challenges"]) for host, stats in by_host.items()
+                  if stats.get("challenges"))
+    challengers = sorted(challenged, key=lambda pair: sum(pair[1].values()), reverse=True)
+    top_challenged = [{"host": host, "challenges": counts}
+                      for host, counts in challengers[:10]]
+    return {
+        "window_minutes": minutes,
+        "coverage": report["coverage"],
+        "cost": report["cost"],
+        "stuck_on_firecrawl": stuck,
+        "needs_egress": egress,
+        "top_challenged": top_challenged,
+    }
+def emit(digest: dict) -> None:
+    """Write the digest to the module logger.
+    Coverage/savings and the per-host challenge lines go out at INFO; hosts
+    still on paid Firecrawl, hosts escalated to residential egress (both capped
+    at 20 names) and negative savings go out at WARNING so they stand out in
+    the OTLP backend."""
+    coverage = digest["coverage"]
+    cost = digest["cost"]
+    logger.info("flags: coverage %.1f%% (%d/%d urls), savings $%.4f (%.1f%%)",
+                coverage["success_pct"], coverage["succeeded"], coverage["unique_urls"],
+                cost["savings_usd"], cost["savings_pct"])
+    stuck = digest["stuck_on_firecrawl"]
+    if stuck:
+        logger.warning("flags: %d domain(s) still on paid Firecrawl: %s",
+                       len(stuck), ", ".join(stuck[:20]))
+    escalated = digest["needs_egress"]
+    if escalated:
+        logger.warning("flags: %d domain(s) escalated to residential egress: %s",
+                       len(escalated), ", ".join(escalated[:20]))
+    for entry in digest["top_challenged"]:
+        logger.info("flags: %s challenges %s", entry["host"], entry["challenges"])
+    savings = cost["savings_usd"]
+    if savings < 0:
+        logger.warning("flags: NEGATIVE savings ($%.4f) — engine cost exceeds the "
+                       "Firecrawl baseline in this window", savings)
+def main() -> None:
+    """Command-line entry: build one digest and output it.
+    ``--minutes N`` limits the window; ``--json`` prints the digest as JSON on
+    stdout, otherwise it is written through the logger via emit()."""
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--minutes", type=int, help="Only consider the last N minutes")
+    parser.add_argument("--json", action="store_true", help="Emit the digest as JSON")
+    opts = parser.parse_args()
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    setup_logs()  # ship the digest to the OTLP backend too when configured
+    digest = build_digest(opts.minutes)
+    if not opts.json:
+        emit(digest)
+        return
+    print(json.dumps(digest, indent=2))
+# Script entry for `python -m switchback.flags` (see the module docstring).
+if __name__ == "__main__":
+    main()

switchback/normalize.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""Shared content normalization — HTML→Markdown and PDF→text.
+Ported from musings-by-hermes/scripts/muse_helpers.py (the most mature version):
+strips boilerplate, promotes lazy-loaded images, resolves relative URLs.
+"""
+from __future__ import annotations
+import io
+import logging
+logger = logging.getLogger(__name__)
+# Desktop-Chrome-style User-Agent string. Not referenced by the functions in
+# this module — presumably imported by the fetch tiers; verify against callers.
+UA = ("Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 "
+      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+def html_to_markdown(html: str, base_url: str | None = None) -> str:
+    """HTML → Markdown, preserving images/blockquotes/code.
+    - Strips script/style/noscript/nav/header/footer/aside/form/iframe boilerplate.
+    - Applies any per-domain extraction prefs (scope selector / extra drops),
+      see switchback.extract. A selector that is not valid CSS is logged and
+      skipped on its own; it does not cancel the rest of the clean-up.
+    - Promotes lazy-load attrs (data-src, data-original, data-lazy-src, then
+      the first srcset candidate) to src when the tag has no src of its own.
+    - Resolves relative image/link URLs against base_url.
+    Best-effort throughout: if the BeautifulSoup pre-clean fails, the HTML is
+    converted as it was received; if markdownify fails, the HTML (pre-cleaned
+    when that step succeeded) is returned with surrounding whitespace removed.
+    """
+    try:
+        from markdownify import markdownify
+        try:
+            from bs4 import BeautifulSoup
+            from urllib.parse import urljoin
+            from .extract import prefs_for
+            prefs = prefs_for(base_url)
+            soup = BeautifulSoup(html or "", "html.parser")
+            for tag in soup(["script", "style", "noscript", "nav", "header",
+                             "footer", "aside", "form", "iframe"]):
+                tag.decompose()
+            # Per-domain: remove configured noise, then scope to the content node.
+            # Selectors come from a hand-edited config file, so each one is
+            # guarded: a typo must not disable the whole pre-clean for the host.
+            for sel in prefs.get("drop", []):
+                try:
+                    matches = soup.select(sel)
+                except Exception as e:
+                    logger.warning(f"extract: bad drop selector {sel!r} for {base_url}: {e}")
+                    continue
+                for tag in matches:
+                    tag.decompose()
+            selector = prefs.get("selector")
+            if selector:
+                node = None
+                try:
+                    node = soup.select_one(selector)
+                    if node is None:
+                        logger.debug(f"extract: selector {selector!r} matched nothing for {base_url}")
+                except Exception as e:
+                    logger.warning(f"extract: bad selector {selector!r} for {base_url}: {e}")
+                if node is not None:
+                    # Re-parse just the matched node so everything outside it is gone.
+                    soup = BeautifulSoup(str(node), "html.parser")
+            for img in soup.find_all("img"):
+                src = (img.get("src") or img.get("data-src")
+                       or img.get("data-original") or img.get("data-lazy-src"))
+                if not src and img.get("srcset"):
+                    # First srcset candidate, without its width/density descriptor.
+                    src = img["srcset"].split(",")[0].strip().split(" ")[0]
+                if src:
+                    if base_url:
+                        src = urljoin(base_url, src)
+                    img["src"] = src
+            if base_url:
+                for a in soup.find_all("a", href=True):
+                    a["href"] = urljoin(base_url, a["href"])
+            html = str(soup)
+        except Exception as e:
+            logger.debug(f"soup pre-clean skipped: {e}")
+        md = markdownify(html, heading_style="ATX", code_language="",
+                         bullets="-", strip=["script", "style"])
+        return (md or "").strip()
+    except Exception as e:
+        logger.warning(f"markdownify failed: {e}")
+        return (html or "").strip()
+def pdf_bytes_to_text(data: bytes) -> str:
+    """Return the text of a PDF given as bytes.
+    Pages are joined with a blank line and the result is stripped; a page with
+    no extractable text contributes an empty string. Everything happens in
+    memory — nothing is written to disk."""
+    from pypdf import PdfReader
+    with io.BytesIO(data) as stream:
+        pages = PdfReader(stream).pages
+        page_texts = [(page.extract_text() or "") for page in pages]
+        return "\n\n".join(page_texts).strip()