switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ """Tier (residential egress) — drive a remote browser over CDP.
2
+
3
+ The strongest egress lever we own: connect to a browser running on a *residential*
4
+ IP with a real, full Chrome fingerprint, instead of this box's datacenter IP. That
5
+ is the one thing no local fingerprint trick can fake, and it's the actual reason
6
+ hard Cloudflare / DataDome hosts (and datacenter-blocklisted sites) wall us.
7
+
8
+ Operator-provided seam: export BU_CDP_URL to a CDP endpoint and this tier connects
9
+ to it. Start one with browser-harness:
10
+
11
+ start_remote_daemon("scrape", proxyCountryCode="us") # prints a cdpUrl
12
+
13
+ then export that URL as BU_CDP_URL. Unset → tier is disabled and the cascade
14
+ skips it. It sits after the local browser tiers (only walls that beat them pay
15
+ this cost) and before paid Firecrawl.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import os
20
+
21
+ from ..concurrency import browser_slot
22
+ from ..normalize import html_to_markdown
23
+ from ..policy.gates import check
24
+
25
# Identifier for this tier; fetch() also passes it to browser_slot().
NAME = "tier_residential"
# Not a paid tier. NOTE(review): presumably read by the cascade runner to
# separate free tiers from billed ones — confirm against the orchestrator.
PAID = False

# Page-navigation timeout in milliseconds, overridable through the
# SCRAPER_RESIDENTIAL_TIMEOUT_MS environment variable (read once at import).
_TIMEOUT_MS = int(os.environ.get("SCRAPER_RESIDENTIAL_TIMEOUT_MS", "30000"))
29
+
30
+
31
def disabled() -> bool:
    """Report whether this tier is switched off.

    The tier is enabled only when the ``BU_CDP_URL`` environment variable is
    set to a non-empty value; an unset or empty variable means disabled.
    """
    endpoint = os.environ.get("BU_CDP_URL")
    if endpoint:
        return False
    return True
34
+
35
+
36
def fetch(url: str) -> str:
    """Load ``url`` in the remote browser behind ``BU_CDP_URL`` and return Markdown.

    Connects to the CDP endpoint, opens a new page, navigates to ``url``,
    converts the rendered HTML with ``html_to_markdown`` and returns whatever
    ``check(url, md)`` returns.

    Raises:
        KeyError: if ``BU_CDP_URL`` is not set (presumably callers consult
            ``disabled()`` first — confirm against the cascade runner).
        Exception: connection/navigation errors from the browser driver and
            anything raised by ``check`` propagate unchanged.
    """
    # Lazy import: the module stays importable when patchright is not installed.
    from patchright.sync_api import sync_playwright

    cdp = os.environ["BU_CDP_URL"]
    with browser_slot(NAME), sync_playwright() as p:
        browser = p.chromium.connect_over_cdp(cdp)
        try:
            # Reuse the remote browser's first existing context when it has one.
            ctx = browser.contexts[0] if browser.contexts else browser.new_context()
            page = ctx.new_page()
            try:
                page.goto(url, timeout=_TIMEOUT_MS, wait_until="domcontentloaded")
                if len(page.content()) < 5000:
                    # Small document: wait briefly for the network to go idle.
                    # Best-effort only — a timeout here must not fail the fetch.
                    try:
                        page.wait_for_load_state("networkidle", timeout=8000)
                    except Exception:
                        pass
                html = page.content()
                # Record the post-redirect URL while the page is still open
                # instead of reading it from an already-closed page.
                final_url = page.url or url
            finally:
                page.close()
            md = html_to_markdown(html, base_url=final_url)
        finally:
            browser.close()
    return check(url, md)
switchback/tracing.py ADDED
@@ -0,0 +1,152 @@
1
+ """OpenTelemetry wiring → any OTLP backend: traces and logs.
2
+
3
+ Degrades gracefully: if the OTel packages aren't installed the `span()` helper
4
+ becomes a no-op context manager and `setup_logs()` is a no-op, so the engine
5
+ still runs without a backend. Point it at an OTLP backend (Jaeger, Tempo, SigNoz) with:
6
+
7
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
8
+ export OTEL_SERVICE_NAME=switchback
9
+
10
+ Traces: one trace per scraped URL, one span per tier attempt. Span attributes
11
+ use the keys in `Attr` so backend dashboards stay consistent.
12
+
13
+ Logs: `setup_logs()` routes the stdlib `logging` records to an OTLP backend
14
+ (opt-in — call it from the CLI / app entry point, not from library code, so we
15
+ never hijack a host app's logging). Records emitted inside a span are
16
+ auto-correlated with that trace.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import logging
21
+ import os
22
+ import threading
23
+ from contextlib import contextmanager
24
+
25
+
26
class Attr:
    """Span-attribute key names, kept in one place so dashboards stay consistent."""

    HOST = "scrape.host"
    TIER = "scrape.tier"
    # One of: ok, short_content, botwall, http_block, rate_limited, timeout,
    # connection, http_error, error, not_applicable, deadline_exceeded,
    # all_failed, or a *_skipped value.
    OUTCOME = "scrape.outcome"
    # Normalized failure class (see classify_error).
    ERROR_CLASS = "scrape.error_class"
    # Bot-wall vendor when one was served (cloudflare / datadome / akamai / …).
    CHALLENGE = "scrape.challenge"
    # Upstream HTTP status when known (403/429/…).
    STATUS_CODE = "scrape.status_code"
    MD_LEN = "scrape.md_len"
    SOURCE = "scrape.source_method"
    ERROR = "scrape.error"
    COST_USD = "scrape.cost_usd"
    # Latency of a single tier attempt, and the total on the root span.
    LATENCY_MS = "scrape.latency_ms"
    # The per-request budget that was in force.
    DEADLINE_S = "scrape.deadline_s"
44
+
45
+
46
# Tri-state cache: None = setup not attempted yet, False = attempted and
# unavailable, anything else = the tracer object.
_tracer = None
_init_lock = threading.Lock()


def _init():
    """Create the tracer on first use and return the cached result afterwards.

    Returns the tracer, or ``False`` when any step of the setup raised (for
    example because the OTel packages cannot be imported).
    """
    global _tracer
    if _tracer is None:
        with _init_lock:
            # Look again while holding the lock: another thread may have
            # completed the setup between the first check and the acquire.
            if _tracer is None:
                try:
                    from opentelemetry import trace
                    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
                    from opentelemetry.sdk.resources import Resource
                    from opentelemetry.sdk.trace import TracerProvider
                    from opentelemetry.sdk.trace.export import BatchSpanProcessor

                    service_name = os.getenv("OTEL_SERVICE_NAME", "switchback")
                    resource = Resource.create({"service.name": service_name})
                    provider = TracerProvider(resource=resource)
                    # The exporter is built without arguments, so its endpoint
                    # comes from OTEL_EXPORTER_OTLP_ENDPOINT (or the exporter's
                    # own default, localhost:4317).
                    exporter = OTLPSpanExporter()
                    provider.add_span_processor(BatchSpanProcessor(exporter))
                    trace.set_tracer_provider(provider)
                    _tracer = trace.get_tracer("switchback")
                except Exception:
                    # Tried and unavailable → remember that and stay in no-op mode.
                    _tracer = False
    return _tracer
73
+
74
+
75
# None = setup_logs() not attempted yet, False = attempted and unavailable,
# anything else = the LoggerProvider that was installed.
_log_provider = None


def setup_logs(level: int = logging.INFO) -> bool:
    """Attach an OTLP-exporting handler to the root stdlib logger.

    Safe to call repeatedly: the first call decides the outcome and later
    calls only report it. Opt-in — intended to be called from an application
    entry point, not from library code.

    Args:
        level: Level given to the OTel logging handler. Only the handler is
            configured; the root logger's own level is left untouched, so
            records the root logger filters out never reach the handler.

    Returns:
        True when the handler was installed, False when any setup step raised
        (for example because the OTel packages are not installed).
    """
    global _log_provider
    if _log_provider is None:
        try:
            from opentelemetry._logs import set_logger_provider
            from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
            from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
            from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
            from opentelemetry.sdk.resources import Resource

            service_name = os.getenv("OTEL_SERVICE_NAME", "switchback")
            resource = Resource.create({"service.name": service_name})
            provider = LoggerProvider(resource=resource)
            exporter = OTLPLogExporter()
            provider.add_log_record_processor(BatchLogRecordProcessor(exporter))
            set_logger_provider(provider)
            root_logger = logging.getLogger()
            root_logger.addHandler(LoggingHandler(level=level, logger_provider=provider))
            _log_provider = provider
        except Exception:
            # Tried and unavailable → remember that and stay a no-op.
            _log_provider = False
    return bool(_log_provider)
102
+
103
+
104
def flush(timeout_ms: int = 5000) -> None:
    """Push any buffered spans and log records out to the exporter now.

    Meant for the end of a batch/CLI run, so telemetry is exported before the
    process exits rather than whenever the batch processors next flush on
    their own. Every failure is swallowed: flushing is best-effort and must
    never break the caller.

    Args:
        timeout_ms: Passed through to each provider's ``force_flush``.
    """
    if _tracer:
        try:
            from opentelemetry import trace

            # Not every tracer provider offers force_flush; skip those that don't.
            flush_spans = getattr(trace.get_tracer_provider(), "force_flush", None)
            if flush_spans is not None:
                flush_spans(timeout_ms)
        except Exception:
            pass
    if _log_provider:
        try:
            _log_provider.force_flush(timeout_ms)
        except Exception:
            pass
121
+
122
+
123
@contextmanager
def span(name: str, **attrs):
    """Open a span called ``name`` for the duration of the ``with`` block.

    Keyword arguments become span attributes; those whose value is ``None``
    are skipped. The yielded object exposes ``.set(key, value)`` so callers
    can attach the outcome/length/error once they know it. When ``_init()``
    yields no tracer, a do-nothing stand-in is yielded instead.
    """
    tracer = _init()
    if tracer:
        with tracer.start_as_current_span(name) as otel_span:
            for key, value in attrs.items():
                if value is None:
                    continue
                otel_span.set_attribute(key, value)
            yield _RealSpan(otel_span)
    else:
        yield _NoopSpan()
139
+
140
+
141
class _NoopSpan:
    """Stand-in span used when tracing is unavailable; ignores everything."""

    def set(self, key, value):
        """Accept and discard an attribute."""
        return None
144
+
145
+
146
class _RealSpan:
    """Thin wrapper exposing ``.set(key, value)`` on top of an underlying span."""

    def __init__(self, sp):
        # The wrapped span; only its set_attribute() method is used here.
        self._sp = sp

    def set(self, key, value):
        """Record ``key=value`` on the wrapped span, skipping ``None`` values."""
        if value is None:
            return
        self._sp.set_attribute(key, value)
@@ -0,0 +1,325 @@
1
+ Metadata-Version: 2.4
2
+ Name: switchback
3
+ Version: 0.1.0
4
+ Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
+ Author-email: Akash Kodavuru <akash@theaklabs.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/akash-kr/switchback
8
+ Project-URL: Repository, https://github.com/akash-kr/switchback
9
+ Project-URL: Issues, https://github.com/akash-kr/switchback/issues
10
+ Project-URL: Changelog, https://github.com/akash-kr/switchback/blob/main/CHANGELOG.md
11
+ Keywords: scraping,crawler,cloudflare,markdown,cascade,anti-bot,stealth
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Internet :: WWW/HTTP
22
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ License-File: NOTICE
28
+ Requires-Dist: markdownify
29
+ Requires-Dist: beautifulsoup4
30
+ Requires-Dist: pypdf
31
+ Requires-Dist: requests
32
+ Requires-Dist: curl_cffi
33
+ Provides-Extra: cloudflare
34
+ Requires-Dist: cloudscraper; extra == "cloudflare"
35
+ Provides-Extra: browser
36
+ Requires-Dist: patchright; extra == "browser"
37
+ Provides-Extra: camoufox
38
+ Requires-Dist: camoufox[geoip]; extra == "camoufox"
39
+ Provides-Extra: firecrawl
40
+ Requires-Dist: firecrawl-py; extra == "firecrawl"
41
+ Provides-Extra: server
42
+ Requires-Dist: fastapi; extra == "server"
43
+ Requires-Dist: uvicorn; extra == "server"
44
+ Provides-Extra: tracing
45
+ Requires-Dist: opentelemetry-sdk; extra == "tracing"
46
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
47
+ Provides-Extra: all
48
+ Requires-Dist: cloudscraper; extra == "all"
49
+ Requires-Dist: patchright; extra == "all"
50
+ Requires-Dist: camoufox[geoip]; extra == "all"
51
+ Requires-Dist: firecrawl-py; extra == "all"
52
+ Requires-Dist: fastapi; extra == "all"
53
+ Requires-Dist: uvicorn; extra == "all"
54
+ Requires-Dist: opentelemetry-sdk; extra == "all"
55
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "all"
56
+ Provides-Extra: dev
57
+ Requires-Dist: pytest; extra == "dev"
58
+ Dynamic: license-file
59
+
60
+ <!-- switchback -->
61
+
62
+ ```
63
+ ███████╗██╗ ██╗██╗████████╗ ██████╗██╗ ██╗██████╗ █████╗ ██████╗██╗ ██╗
64
+ ██╔════╝██║ ██║██║╚══██╔══╝██╔════╝██║ ██║██╔══██╗██╔══██╗██╔════╝██║ ██╔╝
65
+ ███████╗██║ █╗ ██║██║ ██║ ██║ ███████║██████╔╝███████║██║ █████╔╝
66
+ ╚════██║██║███╗██║██║ ██║ ██║ ██╔══██║██╔══██╗██╔══██║██║ ██╔═██╗
67
+ ███████║╚███╔███╔╝██║ ██║ ╚██████╗██║ ██║██████╔╝██║ ██║╚██████╗██║ ██╗
68
+ ╚══════╝ ╚══╝╚══╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝
69
+ ```
70
+
71
+ <div align="center">
72
+
73
+ **One cost-ordered scrape cascade — HTTP → stealth browser → paid — shared by every tool.**
74
+
75
+ Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
76
+ to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
77
+
78
+ [![PyPI](https://img.shields.io/pypi/v/switchback.svg)](https://pypi.org/project/switchback/)
79
+ [![Python](https://img.shields.io/pypi/pyversions/switchback.svg)](https://pypi.org/project/switchback/)
80
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
81
+ [![CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml/badge.svg)](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
82
+
83
+ </div>
84
+
85
+ ---
86
+
87
+ ## Why
88
+
89
+ Most scrapers either give up on hard pages or send *everything* through an expensive
90
+ headless browser / paid API. **switchback** orders the methods by cost and walks them
91
+ cheapest-first, per host, learning which tier wins where so the next run starts there.
92
+ The easy majority stays free; only genuinely-walled hosts pay for the heavy tiers.
93
+
94
+ - **Cost-ordered cascade** — free APIs → cheap HTTP → anti-bot solver → stealth browser → paid API.
95
+ - **Per-host memory (botwall)** — remembers the winning tier per host, skip-lists hard blockers, auto-skips hosts stuck on the paid tier.
96
+ - **Cost-scoped residential egress** — routes *only* walled hosts through a residential proxy, never the easy majority.
97
+ - **One shape, three entry points** — Python library, CLI (JSON on stdout), or an HTTP service.
98
+ - **Observable** — every attempt is an OpenTelemetry span; logs ship trace-correlated to any OTLP backend (Jaeger, Tempo, SigNoz).
99
+ - **Runs with any subset installed** — each tier imports its deps lazily; a missing one is just a tier miss.
100
+
101
+ ## Quickstart
102
+
103
+ ```bash
104
+ pip install switchback # core: cheap tiers (0/1) + search
105
+ ```
106
+
107
+ ```python
108
+ from switchback import scrape
109
+
110
+ for r in scrape(["https://arxiv.org/abs/1706.03762"]):
111
+ print(r.source_method, len(r.markdown))
112
+ ```
113
+
114
+ ```bash
115
+ python -m switchback https://example.com/article # JSON on stdout — bridge for any language
116
+ ```
117
+
118
+ That's the whole loop. Add tiers as you need them (see [Install](#install)).
119
+
120
+ ## The cascade (stop at first success)
121
+
122
+ | Tier | Strategy | Cost |
123
+ |---|---|---|
124
+ | 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
125
+ | 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
+ | 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
+ | 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
+ | 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
+ | 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
+ | 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
131
+
132
+ Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
133
+ tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
134
+ latency + outcome (`ok` / `short_content` / `rate_limited` / `miss` / `not_applicable`)
135
+ to its span and the botwall event log; the root span carries total latency and the final
136
+ outcome (incl. `deadline_exceeded`).
137
+
138
+ Search (query → URLs) is separate from the scrape cascade: `switchback.search()` /
139
+ `python -m switchback.api --search <query>`, backed by a local SearXNG.
140
+
141
+
142
+ ## Install
143
+
144
+ ```bash
145
+ pip install switchback # core: normalization + cheap tiers (0/1) + search
146
+ pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
147
+ pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
148
+ pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
149
+ pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
150
+ pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
151
+ pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
152
+ pip install "switchback[all]" # everything
153
+ ```
154
+
155
+ For Tier 2's **full** v3 JS-VM + Turnstile + stealth support, install the Enhanced
156
+ Edition 3.x fork alongside switchback. PyPI's `cloudscraper` is the older v1/v2, and
157
+ PyPI forbids a published package from pinning a git-URL dep, so the extra cannot pull it in:
158
+
159
+ ```bash
160
+ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
161
+ ```
162
+
163
+ Or run the whole thing as a container:
164
+ `docker build -t switchback . && docker run -p 8799:8799 switchback`.
165
+
166
+ ## Use it from your app
167
+
168
+ Three interchangeable entry points — all return the same shape
169
+ (`[{url, source_method, markdown}]`, successes only):
170
+
171
+ **Python library**
172
+ ```python
173
+ from switchback import scrape
174
+ for r in scrape(["https://arxiv.org/abs/1706.03762"]):
175
+ print(r.source_method, len(r.markdown))
176
+
177
+ # Need failures + reasons too? scrape_detailed returns a ScrapeOutcome per URL
178
+ # (ok, final_outcome, error_class, status_code, and the per-tier attempts):
179
+ from switchback import scrape_detailed
180
+ for o in scrape_detailed(["https://www.pcmag.com/news"]):
181
+ if not o.ok:
182
+ print(o.url, o.final_outcome, o.error_class, o.status_code)
183
+ ```
184
+
185
+ **CLI** (JSON on stdout — bridge for any language)
186
+ ```bash
187
+ python -m switchback https://example.com/article # or: switchback <url>
188
+ ```
189
+
190
+ **HTTP service** (language-agnostic; one warm process keeps the browser pool hot)
191
+ ```bash
192
+ switchback-server # listens on :8799
193
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"]}'
194
+ curl 'localhost:8799/search?q=web+scraping'
195
+ ```
196
+
197
+ Non-Python callers: see [clients/node_bridge.md](clients/node_bridge.md). Python
198
+ callers that want HTTP-with-CLI-fallback can drop in
199
+ [clients/python_client.py](clients/python_client.py).
200
+
201
+ ## Cost-scoped residential egress
202
+
203
+ The dominant reason hard hosts wall you is the **datacenter IP**, not the
204
+ fingerprint. When a host repeatedly walls the local tiers (a 403/429 or a
205
+ bot-wall page, `SCRAPER_BOTWALL_EGRESS_AFTER` times) it's flagged `needs_egress`
206
+ and the cascade reruns through a **residential proxy** — but only for that host:
207
+
208
+ ```bash
209
+ export SCRAPER_EGRESS_PROXY="http://user:pass@p.webshare.io:80"
210
+ ```
211
+
212
+ The easy majority that already succeeds free at the datacenter IP stays direct,
213
+ so you never spend (often metered) residential bandwidth on it. Escalation tries
214
+ the cheap HTTP tiers through the proxy first (~0.2MB/page) before the heavier
215
+ browser tiers. [Webshare](https://www.webshare.io/)'s free plan includes ~1GB/mo
216
+ of residential bandwidth — enough for low-volume hard-host recovery at $0. Use
217
+ `SCRAPER_PROXY` instead to force *every* request through a proxy.
218
+
219
+ ## Metrics & reporting
220
+
221
+ The engine derives all metrics from its own state files (no external store): the
222
+ botwall event log (one row per tier attempt, incl. the detected challenge vendor)
223
+ and the per-host DB (winning tier, per-vendor `challenge_counts`).
224
+
225
+ ```bash
226
+ curl localhost:8799/metrics # cost savings vs Firecrawl, coverage,
227
+ # overall + per-tier latency, outcomes
228
+ curl localhost:8799/metrics/domains # per-domain: error codes, challenges, latency
229
+ python -m switchback.flags # periodic digest: domains stuck on Firecrawl,
230
+ # escalated to egress, top challenged (cron-friendly)
231
+ ```
232
+
233
+ Both endpoints accept `?minutes=N` to window the event-derived sections. The
234
+ **savings** figure compares engine spend (Firecrawl invocations only) against a
235
+ Firecrawl-everything baseline, charging the hard-page credit multiplier
236
+ (`BENCH_FIRECRAWL_HARD_MULT`) for URLs that needed a browser/residential tier or
237
+ hit a challenge — i.e. exactly the ones Firecrawl bills more for.
238
+
239
+ ## Configuration
240
+
241
+ All configuration is via environment variables. The engine runs with missing
242
+ pieces: each tier imports its deps lazily and a missing one just counts as a tier
243
+ miss. Tracing no-ops if OTel isn't installed/configured.
244
+
245
+ <details>
246
+ <summary><b>Tracing (optional)</b></summary>
247
+
248
+ ```bash
249
+ export OTEL_SERVICE_NAME=switchback
250
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
251
+ ```
252
+ </details>
253
+
254
+ <details>
255
+ <summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
256
+
257
+ - `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
258
+ - `FIRECRAWL_API_KEY` — enable Tier 4
259
+ - `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install "switchback[camoufox]"` + `camoufox fetch`)
260
+ - `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
261
+ - `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
262
+ - `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
263
+ - `SEARXNG_URL` — defaults to `http://localhost:8888`
264
+ - `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
265
+ - `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
266
+ - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
267
+ </details>
268
+
269
+ <details>
270
+ <summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
271
+
272
+ - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
273
+ - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
274
+ - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
275
+ - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
276
+ - `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
277
+ - `SCRAPER_SESSION_TTL_S` — cf_clearance reuse window (1800s)
278
+ - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
279
+ - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
280
+ - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
281
+ - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
282
+ - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
283
+ - `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
284
+ - `BENCH_FIRECRAWL_USD` / `BENCH_FIRECRAWL_HARD_MULT` — cost model for the savings report
285
+ </details>
286
+
287
+ ### Logged-in sessions
288
+ Beyond a static `SCRAPER_COOKIES_FILE`, wire `SCRAPER_LOGIN_HOOK` to a callable
289
+ `func(host) -> {cookie: value}`. When an authenticated host trips a login/bot
290
+ wall, the engine calls the hook once, persists the returned cookies per host, and
291
+ overlays them on every tier (and future runs), then re-runs that URL on a fresh
292
+ budget. The hook owns the site-specific login mechanics; the engine stays generic.
293
+
294
+ ### Session traces
295
+ With `SCRAPER_TRACE_SESSION=1`, each browser-tier attempt writes a Playwright
296
+ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
297
+ `GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
298
+ `playwright show-trace <zip>`. Off by default (traces are MBs each).
299
+
300
+ ### Per-domain extraction
301
+ Markdown of the whole page is the default. To scope a site to its content node or
302
+ strip site-specific noise, declare prefs per host in `config/extraction.json`
303
+ (see [config/extraction.example.json](config/extraction.example.json)); every
304
+ tier's normalize step picks them up automatically.
305
+
306
+ ## Contributing
307
+
308
+ Issues and PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md). Start with the
309
+ cascade runner in `switchback/orchestrator.py`.
310
+
311
+ ## Responsible use
312
+
313
+ This engine is for lawful data collection. You are responsible for respecting
314
+ each target site's Terms of Service, `robots.txt`, and rate limits, and for
315
+ having the right to access the content you fetch. The stealth / anti-bot tiers
316
+ (`cloudscraper`, `patchright`, `camoufox`) exist to handle legitimate access
317
+ friction (e.g. generic bot interstitials on public pages) — not to evade access
318
+ controls, paywalls, or authentication you aren't authorized to bypass. The
319
+ software is provided "as is", without warranty (see [LICENSE](LICENSE)).
320
+
321
+ ## License
322
+
323
+ MIT — see [LICENSE](LICENSE). Third-party dependencies and their licenses are
324
+ listed in [NOTICE](NOTICE); all are permissive (MIT / BSD-3-Clause / Apache-2.0)
325
+ and compatible with this project's MIT license.
@@ -0,0 +1,36 @@
1
+ switchback/__init__.py,sha256=a3WwmcxW3YLBDVMPn5xho8sh-VBIUJC34r6zREpMeKk,506
2
+ switchback/__main__.py,sha256=FNhL9UYwA2BpXukEaEzFSbc5H-FHKDb390n6yUYM6mU,97
3
+ switchback/api.py,sha256=h9kx1o3q1vvxTpr5C-0b3lhORCdk3M_84NreyL3be2A,2973
4
+ switchback/concurrency.py,sha256=FYOjHVa9gMpydYK72Nm5t27lmNBRg_Eh507blHKjLzk,1348
5
+ switchback/content_cache.py,sha256=1Svp4PNFGXnywtrJzT4ptKorB9JlpDw9onmXtWPfKpc,3264
6
+ switchback/egress.py,sha256=_3fqqc8TSw9kuoH8o1YzDST9Tv-tbdEGjHGxTPVFWy0,4211
7
+ switchback/extract.py,sha256=dKWj4vHGewHb9PqBhjw-NDOo-aweLItGC1NWltD4-dY,1825
8
+ switchback/flags.py,sha256=R08P2RzRfijdkhvAN-JxqdsxZZZW9STqSLMRfIbLPZI,4012
9
+ switchback/normalize.py,sha256=hbpTmsVgCGiuJtSoV-SpFcZYiH4ndnF_wb_hwzBTKtM,3266
10
+ switchback/orchestrator.py,sha256=DAzBZBoB7CkSrklUWmzowVWaFxZQ-dot3uJN57cpnPw,15915
11
+ switchback/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ switchback/reporting.py,sha256=9do6pqreH7bHpGyQKKu9TqLrxPxR4bbrMlMb2GYB2V4,9218
13
+ switchback/search.py,sha256=Fg2bz2tPvPM_YOCCQE-u-WScvCyNtFdXsmAbvqvraoE,1352
14
+ switchback/server.py,sha256=8pdFg4pWBFQoExMUzEZUcKl5bOTqvEUwMzIt5-r0QgY,3766
15
+ switchback/session_cache.py,sha256=wBvMMLOUw8hriu4N7VbvXo10k6-cdKJ8CKDuMVoCKjQ,9986
16
+ switchback/session_trace.py,sha256=jJTmEyd7iZVAaDgIFyk4vu2mPAI8iWSH_pLjRUOfxg4,3332
17
+ switchback/tracing.py,sha256=5uTpUfYl8a2gIiaNg94Pk8COK143kBxeA1zKJoei73U,5772
18
+ switchback/policy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ switchback/policy/botwall.py,sha256=uZsoerKM0YsC_y_KtyckIufYeqIZhW2c2TMKhLgWc4g,16187
20
+ switchback/policy/gates.py,sha256=z7eeSWdb1Bqdjcz1qHrUtZ4mIgX8-Hms2Mvk9AEP30I,7103
21
+ switchback/tiers/__init__.py,sha256=seqE3NDnc71XVnf3DuklRx1KfqQDVzudPwxofCflHkY,851
22
+ switchback/tiers/_browser.py,sha256=4kaWxh8wkegyKsnE2pwe9LLv7xJG-eSxq5Gjwz53DFI,2195
23
+ switchback/tiers/tier0_apis.py,sha256=dAwTf_qkdYj1rCNVdH5Qn93JwAoGU_VhMT9YryYFTc8,3086
24
+ switchback/tiers/tier1_http.py,sha256=bpDrOY0lUen2sS9sgF3l_oObkcogasKpuYk-OJSNxto,2902
25
+ switchback/tiers/tier2_cloudscraper.py,sha256=weNXrZj61d5iATf7JVfjL5qG9Q-lsup8SSV9fsQoGe8,5522
26
+ switchback/tiers/tier3_browser.py,sha256=oYG_pDvvG79AwDdyhn1a1EV5NnDZfMp1hYgFO6eDZAY,2531
27
+ switchback/tiers/tier3b_camoufox.py,sha256=UFTIeUQ36r7ixQSjqtswbf9ddMfM9xv2fmKEB2tKs4o,3515
28
+ switchback/tiers/tier4_firecrawl.py,sha256=YZ02zx4Adv8xXALJw2C2XgzZAs4r6wJWMDmWeOlngPQ,1556
29
+ switchback/tiers/tier_residential.py,sha256=IlR0d-OYKYWqMOMp0ZLrliIVRsPXn2wCvWgSls-DIZ4,2097
30
+ switchback-0.1.0.dist-info/licenses/LICENSE,sha256=b8V82Q_eJ8JgOap-Zg7JexHZKRF-g1k6ZPfGn2FJXf8,1071
31
+ switchback-0.1.0.dist-info/licenses/NOTICE,sha256=wloHWGC3lw_iV6GM2AfaIOhBQmoTGWF1j3Gin8YNKrM,1461
32
+ switchback-0.1.0.dist-info/METADATA,sha256=JpEJURvlQYbmbrYFYWIUDaJFXN1ZlQP0JjJD-LfSQbM,16400
33
+ switchback-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
34
+ switchback-0.1.0.dist-info/entry_points.txt,sha256=Y51hpCqJxN5MaIhTkkBxE6LBs8N6CoIEF3bX58wqbh0,95
35
+ switchback-0.1.0.dist-info/top_level.txt,sha256=ttbyWWHmZeKuLw0aWB5AGSm4DBRjjJXNruNtJzeuSUk,11
36
+ switchback-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ switchback = switchback.api:_main
3
+ switchback-server = switchback.server:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Akash Kodavuru
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,34 @@
1
+ switchback
2
+ Copyright (c) 2026 Akash Kodavuru
3
+ Licensed under the MIT License (see LICENSE).
4
+
5
+ This product depends on third-party components, each under its own license.
6
+ They are installed via pip and are NOT bundled or redistributed in this
7
+ repository — no upstream source or full license texts are vendored here. All
8
+ dependencies use permissive licenses (MIT / BSD-3-Clause / Apache-2.0), which
9
+ are compatible with this project's MIT license.
10
+
11
+ Third-party components
12
+ ----------------------
13
+
14
+ MIT
15
+ markdownify HTML -> Markdown normalization
16
+ beautifulsoup4 HTML parsing / normalization
17
+ curl_cffi tier1: plain HTTP with TLS impersonation
18
+ cloudscraper tier2: Cloudflare / anti-bot challenge solver
19
+ camoufox tier3b: Firefox stealth (optional)
20
+ firecrawl-py tier4: paid last-resort scrape API (optional)
21
+ fastapi HTTP service (optional)
22
+
23
+ BSD-3-Clause
24
+ pypdf PDF -> text extraction (tier1 PDFs)
25
+ uvicorn ASGI server for the HTTP service (optional)
26
+
27
+ Apache-2.0
28
+ requests HTTP client
29
+ patchright tier3: stealth headless Chromium (optional)
30
+ opentelemetry-sdk tracing -> any OTLP backend (optional)
31
+ opentelemetry-exporter-otlp-proto-grpc OTLP trace/log export (optional)
32
+
33
+ Full license texts are available with each package distribution (e.g. in its
34
+ `*.dist-info` directory after install) and from each project's repository.
@@ -0,0 +1 @@
1
+ switchback