switchback 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. switchback-0.1.0/.env.example +70 -0
  2. switchback-0.1.0/CHANGELOG.md +34 -0
  3. switchback-0.1.0/CONTRIBUTING.md +35 -0
  4. switchback-0.1.0/LICENSE +21 -0
  5. switchback-0.1.0/MANIFEST.in +12 -0
  6. switchback-0.1.0/NOTICE +34 -0
  7. switchback-0.1.0/PKG-INFO +325 -0
  8. switchback-0.1.0/README.md +266 -0
  9. switchback-0.1.0/SECURITY.md +27 -0
  10. switchback-0.1.0/clients/node_bridge.md +44 -0
  11. switchback-0.1.0/clients/python_client.py +87 -0
  12. switchback-0.1.0/config/botwall_skip_urls.txt +11 -0
  13. switchback-0.1.0/config/extraction.example.json +10 -0
  14. switchback-0.1.0/pyproject.toml +77 -0
  15. switchback-0.1.0/setup.cfg +4 -0
  16. switchback-0.1.0/switchback/__init__.py +12 -0
  17. switchback-0.1.0/switchback/__main__.py +4 -0
  18. switchback-0.1.0/switchback/api.py +81 -0
  19. switchback-0.1.0/switchback/concurrency.py +37 -0
  20. switchback-0.1.0/switchback/content_cache.py +94 -0
  21. switchback-0.1.0/switchback/egress.py +108 -0
  22. switchback-0.1.0/switchback/extract.py +56 -0
  23. switchback-0.1.0/switchback/flags.py +96 -0
  24. switchback-0.1.0/switchback/normalize.py +81 -0
  25. switchback-0.1.0/switchback/orchestrator.py +343 -0
  26. switchback-0.1.0/switchback/policy/__init__.py +0 -0
  27. switchback-0.1.0/switchback/policy/botwall.py +393 -0
  28. switchback-0.1.0/switchback/policy/gates.py +173 -0
  29. switchback-0.1.0/switchback/py.typed +0 -0
  30. switchback-0.1.0/switchback/reporting.py +236 -0
  31. switchback-0.1.0/switchback/search.py +39 -0
  32. switchback-0.1.0/switchback/server.py +114 -0
  33. switchback-0.1.0/switchback/session_cache.py +274 -0
  34. switchback-0.1.0/switchback/session_trace.py +96 -0
  35. switchback-0.1.0/switchback/tiers/__init__.py +24 -0
  36. switchback-0.1.0/switchback/tiers/_browser.py +50 -0
  37. switchback-0.1.0/switchback/tiers/tier0_apis.py +77 -0
  38. switchback-0.1.0/switchback/tiers/tier1_http.py +65 -0
  39. switchback-0.1.0/switchback/tiers/tier2_cloudscraper.py +135 -0
  40. switchback-0.1.0/switchback/tiers/tier3_browser.py +59 -0
  41. switchback-0.1.0/switchback/tiers/tier3b_camoufox.py +89 -0
  42. switchback-0.1.0/switchback/tiers/tier4_firecrawl.py +48 -0
  43. switchback-0.1.0/switchback/tiers/tier_residential.py +57 -0
  44. switchback-0.1.0/switchback/tracing.py +152 -0
  45. switchback-0.1.0/switchback.egg-info/PKG-INFO +325 -0
  46. switchback-0.1.0/switchback.egg-info/SOURCES.txt +48 -0
  47. switchback-0.1.0/switchback.egg-info/dependency_links.txt +1 -0
  48. switchback-0.1.0/switchback.egg-info/entry_points.txt +3 -0
  49. switchback-0.1.0/switchback.egg-info/requires.txt +38 -0
  50. switchback-0.1.0/switchback.egg-info/top_level.txt +1 -0
@@ -0,0 +1,70 @@
1
+ # switchback — environment configuration
2
+ #
3
+ # The engine reads these via os.getenv; it does NOT auto-load this file.
4
+ # To use it: cp .env.example .env, then load it into your shell before running:
5
+ # set -a; source .env; set +a
6
+ # python -m switchback.api <url>
7
+ # Everything here has a sane default or is optional — an empty .env still runs.
8
+
9
+ # ── Tracing → OTLP backend (OTLP/gRPC) ────────────────────────────────────────────
10
+ # Spans/logs export here; if unreachable the engine degrades gracefully.
11
+ OTEL_SERVICE_NAME=switchback
12
+ OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
13
+
14
+ # ── Search (Tier-0 SearXNG, query → URLs) ───────────────────────────────────
15
+ SEARXNG_URL=http://localhost:8888
16
+
17
+ # ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
18
+ # Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
19
+ JINA_API_KEY=
20
+ SCRAPER_JINA_TIMEOUT_S=20
21
+
22
+ # ── Tier 3b · Camoufox (Firefox stealth) ────────────────────────────────────
23
+ # ON by default. Needs: pip install camoufox && camoufox fetch
24
+ # Set SCRAPER_DISABLE_CAMOUFOX=1 to turn the tier off entirely.
25
+ SCRAPER_DISABLE_CAMOUFOX=
26
+ SCRAPER_CAMOUFOX_TIMEOUT_MS=45000
27
+
28
+ # ── Tier 4 · Firecrawl (paid, last resort) ──────────────────────────────────
29
+ # Required only if this tier runs. Set SCRAPER_DISABLE_FIRECRAWL=1 to skip it.
30
+ FIRECRAWL_API_KEY=
31
+ SCRAPER_DISABLE_FIRECRAWL=
32
+
33
+ # ── Orchestrator ────────────────────────────────────────────────────────────
34
+ # Per-URL wall-clock budget (s), checked between tiers. 45s balances latency vs
35
+ # coverage — roughly fits a Camoufox solve (~40s) that starts after the cheaper
36
+ # tiers fail fast. Lower toward 30s for tighter latency, raise toward 60s for
37
+ # deeper hard-host coverage.
38
+ SCRAPER_DEADLINE_S=45
39
+
40
+ # Max headless browsers (patchright ~150MB, Camoufox ~600MB) running at once.
41
+ # 1 matches the sequential design; raise only if the box has RAM headroom and
42
+ # you scrape in parallel.
43
+ SCRAPER_BROWSER_CONCURRENCY=1
44
+
45
+ # ── State ───────────────────────────────────────────────────────────────────
46
+ # Where the botwall DB + JSONL event log live (default: ./state).
47
+ SCRAPER_STATE_DIR=
48
+
49
+ # ── Botwall policy ──────────────────────────────────────────────────────────
50
+ # SCRAPER_BOTWALL_URL_SKIP_AFTER
51
+ # Hard failures (botwall hit or short content) on the *same URL* before that
52
+ # URL is excluded. Set to 0 to disable URL-level auto-exclusion.
53
+ SCRAPER_BOTWALL_URL_SKIP_AFTER=2
54
+
55
+ # SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER
56
+ # Hard failures across any URLs on a domain before the whole domain is
57
+ # skip-listed. 0 (default) = domains are never auto-skipped; only seeded
58
+ # hard-block domains and manual overrides in the DB are domain-level skips.
59
+ # Set to a positive number (e.g. 10) to re-enable domain-level auto-skip.
60
+ SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER=0
61
+
62
+ # SCRAPER_BOTWALL_COUNT_FIRECRAWL
63
+ # When true, each Firecrawl invocation counts toward the domain failure total
64
+ # (original v1 behaviour). No effect when SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER is 0.
65
+ SCRAPER_BOTWALL_COUNT_FIRECRAWL=false
66
+
67
+ # SCRAPER_BOTWALL_SKIP_URLS_FILE
68
+ # Path to the manual URL skip list (default: config/botwall_skip_urls.txt).
69
+ # Format: one URL per line, optional " # reason" suffix.
70
+ SCRAPER_BOTWALL_SKIP_URLS_FILE=
@@ -0,0 +1,34 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. Format loosely follows
4
+ [Keep a Changelog](https://keepachangelog.com/); this project uses semantic-ish
5
+ versioning while pre-1.0.
6
+
7
+ ## [Unreleased]
8
+
9
+ ### Added
10
+ - **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
11
+ DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
12
+ botwall DB; the vendor is attached to each event and OTel span (`scrape.challenge`).
13
+ - **Metrics & reporting** — `switchback.reporting` rolls the event log + botwall DB
14
+ into cost-savings-vs-Firecrawl, coverage, overall/per-tier/per-domain latency
15
+ (mean/median/min/max/p50/p95), outcomes, error codes by domain, and challenges
16
+ by domain. Exposed via `GET /metrics` and `GET /metrics/domains` (both accept
17
+ `?minutes=N`).
18
+ - **Periodic flagging** — `python -m switchback.flags` emits a cron-friendly digest
19
+ (domains stuck on Firecrawl, escalated to egress, most-challenged) to logs/OTel.
20
+ - **Content cache** — optional URL→result cache (`SCRAPER_CONTENT_TTL_S`, sqlite,
21
+ off by default) short-circuits re-scrapes before any tier runs.
22
+ - **Login-session refresh** — `SCRAPER_LOGIN_HOOK` (`pkg.module:func`) refreshes a
23
+ dead logged-in session on demand; cookies overlay every tier and persist.
24
+ - **Exponential backoff** — between-tier backoff with jitter after rate-limit /
25
+ timeout (`SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS`, off by default).
26
+ - **Per-domain extraction prefs** — `config/extraction.json` (CSS scope selector +
27
+ extra drops) applied automatically in the normalize step for every tier.
28
+ - **Session traces** — opt-in Playwright trace capture (`SCRAPER_TRACE_SESSION=1`)
29
+ for browser tiers, with `GET/DELETE /traces` management endpoints.
30
+
31
+ ### Changed
32
+ - Tier 2's `cloudscraper` moved from a core dependency (which pinned a git-URL
33
+ fork PyPI can't publish) to the `cloudflare` extra; see the README for installing
34
+ the 3.x Enhanced Edition fork for full stealth.
@@ -0,0 +1,35 @@
1
+ # Contributing
2
+
3
+ Thanks for your interest in improving switchback.
4
+
5
+ ## Development setup
6
+ ```bash
7
+ python -m venv .venv && . .venv/bin/activate
8
+ pip install -e ".[all]"
9
+ patchright install chromium && camoufox fetch # for the browser tiers
10
+ ```
11
+
12
+ ## Architecture
13
+ The engine is a cost-ordered cascade (`switchback/tiers/`) governed by a per-host
14
+ policy (`switchback/policy/`). Start with the cascade runner in
15
+ `switchback/orchestrator.py`.
16
+
17
+ ## Guidelines
18
+ - **Keep the core small.** Each tier imports its deps lazily and a missing dep is
19
+ just a tier miss — keep heavy/paid/optional pieces behind extras in
20
+ `pyproject.toml`.
21
+ - **Make new behavior configurable and off-safe.** New features should be gated by
22
+ an env var that defaults to current behavior (see the existing `SCRAPER_*` vars).
23
+ - **Match the surrounding style** — terse, comment-the-why, no speculative
24
+ abstractions.
25
+ - **Don't commit secrets or run artifacts.** `.env`, `state/`, and `*.csv` are
26
+ gitignored; keep it that way.
27
+
28
+ ## Tests
29
+ `tests/test_suite.py` exercises the cascade across the anti-bot difficulty
30
+ spectrum (needs network + browser tiers). Run `python tests/test_suite.py --quick`
31
+ for a fast tier-0/1 pass.
32
+
33
+ ## Pull requests
34
+ Keep PRs focused; describe what changed and why, and note any new env var or
35
+ endpoint in the README and CHANGELOG.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Akash Kodavuru
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,12 @@
1
+ include LICENSE
2
+ include NOTICE
3
+ include README.md
4
+ include CHANGELOG.md
5
+ include CONTRIBUTING.md
6
+ include SECURITY.md
7
+ include .env.example
8
+ include switchback/py.typed
9
+ recursive-include config *.txt *.json
10
+ graft clients
11
+ graft deploy
12
+ prune tests
@@ -0,0 +1,34 @@
1
+ switchback
2
+ Copyright (c) 2026 Akash Kodavuru
3
+ Licensed under the MIT License (see LICENSE).
4
+
5
+ This product depends on third-party components, each under its own license.
6
+ They are installed via pip and are NOT bundled or redistributed in this
7
+ repository — no upstream source or full license texts are vendored here. All
8
+ dependencies use permissive licenses (MIT / BSD-3-Clause / Apache-2.0), which
9
+ are compatible with this project's MIT license.
10
+
11
+ Third-party components
12
+ ----------------------
13
+
14
+ MIT
15
+ markdownify HTML -> Markdown normalization
16
+ beautifulsoup4 HTML parsing / normalization
17
+ curl_cffi tier1: plain HTTP with TLS impersonation
18
+ cloudscraper tier2: Cloudflare / anti-bot challenge solver
19
+ camoufox tier3b: Firefox stealth (optional)
20
+ firecrawl-py tier4: paid last-resort scrape API (optional)
21
+ fastapi HTTP service (optional)
22
+
23
+ BSD-3-Clause
24
+ pypdf PDF -> text extraction (tier1 PDFs)
25
+ uvicorn ASGI server for the HTTP service (optional)
26
+
27
+ Apache-2.0
28
+ requests HTTP client
29
+ patchright tier3: stealth headless Chromium (optional)
30
+ opentelemetry-sdk tracing -> any OTLP backend (optional)
31
+ opentelemetry-exporter-otlp-proto-grpc OTLP trace/log export (optional)
32
+
33
+ Full license texts are available with each package distribution (e.g. in its
34
+ `*.dist-info` directory after install) and from each project's repository.
@@ -0,0 +1,325 @@
1
+ Metadata-Version: 2.4
2
+ Name: switchback
3
+ Version: 0.1.0
4
+ Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
+ Author-email: Akash Kodavuru <akash@theaklabs.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/akash-kr/switchback
8
+ Project-URL: Repository, https://github.com/akash-kr/switchback
9
+ Project-URL: Issues, https://github.com/akash-kr/switchback/issues
10
+ Project-URL: Changelog, https://github.com/akash-kr/switchback/blob/main/CHANGELOG.md
11
+ Keywords: scraping,crawler,cloudflare,markdown,cascade,anti-bot,stealth
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Internet :: WWW/HTTP
22
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ License-File: NOTICE
28
+ Requires-Dist: markdownify
29
+ Requires-Dist: beautifulsoup4
30
+ Requires-Dist: pypdf
31
+ Requires-Dist: requests
32
+ Requires-Dist: curl_cffi
33
+ Provides-Extra: cloudflare
34
+ Requires-Dist: cloudscraper; extra == "cloudflare"
35
+ Provides-Extra: browser
36
+ Requires-Dist: patchright; extra == "browser"
37
+ Provides-Extra: camoufox
38
+ Requires-Dist: camoufox[geoip]; extra == "camoufox"
39
+ Provides-Extra: firecrawl
40
+ Requires-Dist: firecrawl-py; extra == "firecrawl"
41
+ Provides-Extra: server
42
+ Requires-Dist: fastapi; extra == "server"
43
+ Requires-Dist: uvicorn; extra == "server"
44
+ Provides-Extra: tracing
45
+ Requires-Dist: opentelemetry-sdk; extra == "tracing"
46
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
47
+ Provides-Extra: all
48
+ Requires-Dist: cloudscraper; extra == "all"
49
+ Requires-Dist: patchright; extra == "all"
50
+ Requires-Dist: camoufox[geoip]; extra == "all"
51
+ Requires-Dist: firecrawl-py; extra == "all"
52
+ Requires-Dist: fastapi; extra == "all"
53
+ Requires-Dist: uvicorn; extra == "all"
54
+ Requires-Dist: opentelemetry-sdk; extra == "all"
55
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "all"
56
+ Provides-Extra: dev
57
+ Requires-Dist: pytest; extra == "dev"
58
+ Dynamic: license-file
59
+
60
+ <!-- switchback -->
61
+
62
+ ```
63
+ ███████╗██╗ ██╗██╗████████╗ ██████╗██╗ ██╗██████╗ █████╗ ██████╗██╗ ██╗
64
+ ██╔════╝██║ ██║██║╚══██╔══╝██╔════╝██║ ██║██╔══██╗██╔══██╗██╔════╝██║ ██╔╝
65
+ ███████╗██║ █╗ ██║██║ ██║ ██║ ███████║██████╔╝███████║██║ █████╔╝
66
+ ╚════██║██║███╗██║██║ ██║ ██║ ██╔══██║██╔══██╗██╔══██║██║ ██╔═██╗
67
+ ███████║╚███╔███╔╝██║ ██║ ╚██████╗██║ ██║██████╔╝██║ ██║╚██████╗██║ ██╗
68
+ ╚══════╝ ╚══╝╚══╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝
69
+ ```
70
+
71
+ <div align="center">
72
+
73
+ **One cost-ordered scrape cascade — HTTP → stealth browser → paid — shared by every tool.**
74
+
75
+ Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
76
+ to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
77
+
78
+ [![PyPI](https://img.shields.io/pypi/v/switchback.svg)](https://pypi.org/project/switchback/)
79
+ [![Python](https://img.shields.io/pypi/pyversions/switchback.svg)](https://pypi.org/project/switchback/)
80
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
81
+ [![CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml/badge.svg)](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
82
+
83
+ </div>
84
+
85
+ ---
86
+
87
+ ## Why
88
+
89
+ Most scrapers either give up on hard pages or send *everything* through an expensive
90
+ headless browser / paid API. **switchback** orders the methods by cost and walks them
91
+ cheapest-first, per host, learning which tier wins where so the next run starts there.
92
+ The easy majority stays free; only genuinely-walled hosts pay for the heavy tiers.
93
+
94
+ - **Cost-ordered cascade** — free APIs → cheap HTTP → anti-bot solver → stealth browser → paid API.
95
+ - **Per-host memory (botwall)** — remembers the winning tier per host, skip-lists hard blockers, auto-skips hosts stuck on the paid tier.
96
+ - **Cost-scoped residential egress** — routes *only* walled hosts through a residential proxy, never the easy majority.
97
+ - **One shape, three entry points** — Python library, CLI (JSON on stdout), or an HTTP service.
98
+ - **Observable** — every attempt is an OpenTelemetry span; logs ship trace-correlated to any OTLP backend (Jaeger, Tempo, SigNoz).
99
+ - **Runs with any subset installed** — each tier imports its deps lazily; a missing one is just a tier miss.
100
+
101
+ ## Quickstart
102
+
103
+ ```bash
104
+ pip install switchback # core: cheap tiers (0/1) + search
105
+ ```
106
+
107
+ ```python
108
+ from switchback import scrape
109
+
110
+ for r in scrape(["https://arxiv.org/abs/1706.03762"]):
111
+ print(r.source_method, len(r.markdown))
112
+ ```
113
+
114
+ ```bash
115
+ python -m switchback https://example.com/article # JSON on stdout — bridge for any language
116
+ ```
117
+
118
+ That's the whole loop. Add tiers as you need them (see [Install](#install)).
119
+
120
+ ## The cascade (stop at first success)
121
+
122
+ | Tier | Strategy | Cost |
123
+ |---|---|---|
124
+ | 0 | Direct APIs / mirrors (arXiv, Wikipedia, Europe PMC; extensible, e.g. to job boards) | free, cleanest |
125
+ | 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
+ | 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
+ | 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
+ | 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
+ | 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
+ | 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
131
+
132
+ Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
133
+ tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
134
+ latency + outcome (`ok` / `short_content` / `rate_limited` / `miss` / `not_applicable`)
135
+ to its span and the botwall event log; the root span carries total latency and the final
136
+ outcome (incl. `deadline_exceeded`).
137
+
138
+ Search (query → URLs) is separate from the scrape cascade: `switchback.search()` /
139
+ `python -m switchback.api --search <query>`, backed by a local SearXNG.
140
+
141
+
142
+ ## Install
143
+
144
+ ```bash
145
+ pip install switchback # core: normalization + cheap tiers (0/1) + search
146
+ pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
147
+ pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
148
+ pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
149
+ pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
150
+ pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
151
+ pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
152
+ pip install "switchback[all]" # everything
153
+ ```
154
+
155
+ PyPI's `cloudscraper` is the older v1/v2 line. For Tier 2's **full** v3 feature set
156
+ (JS-VM, Turnstile, stealth), install the Enhanced Edition 3.x fork alongside — PyPI
157
+ forbids git-URL dependencies in a published package, so it can't be pinned here:
158
+
159
+ ```bash
160
+ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
161
+ ```
162
+
163
+ Or run the whole thing as a container:
164
+ `docker build -t switchback . && docker run -p 8799:8799 switchback`.
165
+
166
+ ## Use it from your app
167
+
168
+ Three interchangeable entry points — all return the same shape
169
+ (`[{url, source_method, markdown}]`, successes only):
170
+
171
+ **Python library**
172
+ ```python
173
+ from switchback import scrape
174
+ for r in scrape(["https://arxiv.org/abs/1706.03762"]):
175
+ print(r.source_method, len(r.markdown))
176
+
177
+ # Need failures + reasons too? scrape_detailed returns a ScrapeOutcome per URL
178
+ # (ok, final_outcome, error_class, status_code, and the per-tier attempts):
179
+ from switchback import scrape_detailed
180
+ for o in scrape_detailed(["https://www.pcmag.com/news"]):
181
+ if not o.ok:
182
+ print(o.url, o.final_outcome, o.error_class, o.status_code)
183
+ ```
184
+
185
+ **CLI** (JSON on stdout — bridge for any language)
186
+ ```bash
187
+ python -m switchback https://example.com/article # or: switchback <url>
188
+ ```
189
+
190
+ **HTTP service** (language-agnostic; one warm process keeps the browser pool hot)
191
+ ```bash
192
+ switchback-server # listens on :8799
193
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"]}'
194
+ curl 'localhost:8799/search?q=web+scraping'
195
+ ```
196
+
197
+ Non-Python callers: see [clients/node_bridge.md](clients/node_bridge.md). Python
198
+ callers that want HTTP-with-CLI-fallback can drop in
199
+ [clients/python_client.py](clients/python_client.py).
200
+
201
+ ## Cost-scoped residential egress
202
+
203
+ The dominant reason hard hosts wall you is the **datacenter IP**, not the
204
+ fingerprint. When a host repeatedly walls the local tiers (a 403/429 or a
205
+ bot-wall page, `SCRAPER_BOTWALL_EGRESS_AFTER` times) it's flagged `needs_egress`
206
+ and the cascade reruns through a **residential proxy** — but only for that host:
207
+
208
+ ```bash
209
+ export SCRAPER_EGRESS_PROXY="http://user:pass@p.webshare.io:80"
210
+ ```
211
+
212
+ The easy majority that already succeeds free at the datacenter IP stays direct,
213
+ so you never spend (often metered) residential bandwidth on it. Escalation tries
214
+ the cheap HTTP tiers through the proxy first (~0.2MB/page) before the heavier
215
+ browser tiers. [Webshare](https://www.webshare.io/)'s free plan includes ~1GB/mo
216
+ of residential bandwidth — enough for low-volume hard-host recovery at $0. Use
217
+ `SCRAPER_PROXY` instead to force *every* request through a proxy.
218
+
219
+ ## Metrics & reporting
220
+
221
+ The engine derives all metrics from its own state files (no external store): the
222
+ botwall event log (one row per tier attempt, incl. the detected challenge vendor)
223
+ and the per-host DB (winning tier, per-vendor `challenge_counts`).
224
+
225
+ ```bash
226
+ curl localhost:8799/metrics # cost savings vs Firecrawl, coverage,
227
+ # overall + per-tier latency, outcomes
228
+ curl localhost:8799/metrics/domains # per-domain: error codes, challenges, latency
229
+ python -m switchback.flags # periodic digest: domains stuck on Firecrawl,
230
+ # escalated to egress, top challenged (cron-friendly)
231
+ ```
232
+
233
+ Both endpoints accept `?minutes=N` to window the event-derived sections. The
234
+ **savings** figure compares engine spend (Firecrawl invocations only) against a
235
+ Firecrawl-everything baseline, charging the hard-page credit multiplier
236
+ (`BENCH_FIRECRAWL_HARD_MULT`) for URLs that needed a browser/residential tier or
237
+ hit a challenge — i.e. exactly the ones Firecrawl bills more for.
238
+
239
+ ## Configuration
240
+
241
+ All configuration is via environment variables. The engine runs with missing
242
+ pieces: each tier imports its deps lazily and a missing one just counts as a tier
243
+ miss. Tracing no-ops if OTel isn't installed/configured.
244
+
245
+ <details>
246
+ <summary><b>Tracing (optional)</b></summary>
247
+
248
+ ```bash
249
+ export OTEL_SERVICE_NAME=switchback
250
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
251
+ ```
252
+ </details>
253
+
254
+ <details>
255
+ <summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
256
+
257
+ - `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
258
+ - `FIRECRAWL_API_KEY` — enable Tier 4
259
+ - `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
260
+ - `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
261
+ - `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
262
+ - `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
263
+ - `SEARXNG_URL` — defaults to `http://localhost:8888`
264
+ - `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
265
+ - `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
266
+ - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
267
+ </details>
268
+
269
+ <details>
270
+ <summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
271
+
272
+ - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
273
+ - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — Tier 3b (Camoufox) timeout in milliseconds (45000)
274
+ - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
275
+ - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
276
+ - `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
277
+ - `SCRAPER_SESSION_TTL_S` — cf_clearance reuse window (1800s)
278
+ - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
279
+ - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
280
+ - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
281
+ - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
282
+ - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
283
+ - `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
284
+ - `BENCH_FIRECRAWL_USD` / `BENCH_FIRECRAWL_HARD_MULT` — cost model for the savings report
285
+ </details>
286
+
287
+ ### Logged-in sessions
288
+ Beyond a static `SCRAPER_COOKIES_FILE`, wire `SCRAPER_LOGIN_HOOK` to a callable
289
+ `func(host) -> {cookie: value}`. When an authenticated host trips a login/bot
290
+ wall, the engine calls the hook once, persists the returned cookies per host, and
291
+ overlays them on every tier (and future runs), then re-runs that URL on a fresh
292
+ budget. The hook owns the site-specific login mechanics; the engine stays generic.
293
+
294
+ ### Session traces
295
+ With `SCRAPER_TRACE_SESSION=1`, each browser-tier attempt writes a Playwright
296
+ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
297
+ `GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
298
+ `playwright show-trace <zip>`. Off by default (traces are MBs each).
299
+
300
+ ### Per-domain extraction
301
+ Markdown of the whole page is the default. To scope a site to its content node or
302
+ strip site-specific noise, declare prefs per host in `config/extraction.json`
303
+ (see [config/extraction.example.json](config/extraction.example.json)); every
304
+ tier's normalize step picks them up automatically.
305
+
306
+ ## Contributing
307
+
308
+ Issues and PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md). Start with the
309
+ cascade runner in `switchback/orchestrator.py`.
310
+
311
+ ## Responsible use
312
+
313
+ This engine is for lawful data collection. You are responsible for respecting
314
+ each target site's Terms of Service, `robots.txt`, and rate limits, and for
315
+ having the right to access the content you fetch. The stealth / anti-bot tiers
316
+ (`cloudscraper`, `patchright`, `camoufox`) exist to handle legitimate access
317
+ friction (e.g. generic bot interstitials on public pages) — not to evade access
318
+ controls, paywalls, or authentication you aren't authorized to bypass. The
319
+ software is provided "as is", without warranty (see [LICENSE](LICENSE)).
320
+
321
+ ## License
322
+
323
+ MIT — see [LICENSE](LICENSE). Third-party dependencies and their licenses are
324
+ listed in [NOTICE](NOTICE); all are permissive (MIT / BSD-3-Clause / Apache-2.0)
325
+ and compatible with this project's MIT license.